├── .github
└── workflows
│ └── go.yml
├── BENCHMARKS.md
├── LICENSE.md
├── README.md
├── TODO.md
├── benchmark_test.go
├── builder.go
├── go.mod
├── go.sum
├── inflate.go
├── regex
├── README.md
├── backtrack.go
├── machine.go
├── onepass.go
└── regexp.go
├── restructure.go
├── restructure_test.go
├── samples
├── email-address
│ └── email-address.go
├── find-all-floats
│ └── find-all-floats.go
├── floating-point
│ └── floating-point.go
├── name-dot-name
│ └── name-dot-name.go
├── python-import
│ └── python-import.go
├── quaternion-in-json
│ └── quaternion-in-json.go
└── simple-email
│ └── simple-email.go
└── transform.go
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
1 | name: Go
2 |
3 | on: [push]
4 |
5 | jobs:
6 |
7 | test:
8 | name: Test
9 | runs-on: ubuntu-latest
10 |
11 | strategy:
12 | fail-fast: false
13 | matrix:
14 | go: ['1.22', '1.23', 'stable']
15 |
16 | steps:
17 | - name: Checkout
18 | uses: actions/checkout@v4
19 |
20 | - id: go
21 | name: Setup
22 | uses: actions/setup-go@v5
23 | with:
24 | go-version: ${{ matrix.go }}
25 |
26 | - name: Build
27 | run: go build -v .
28 |
29 | - name: Test
30 | run: go test -v -coverprofile=profile.cov .
31 |
32 | - name: Coverage
33 | run: bash <(curl -s https://codecov.io/bash) -f profile.cov
34 |
--------------------------------------------------------------------------------
/BENCHMARKS.md:
--------------------------------------------------------------------------------
1 | ## Performance Benchmarks
2 |
3 | There are three benchmarks in `benchmark_test.go` that compare the performance of `go-restructure` to that of the standard library `regexp` package. `go-restructure` uses a very slightly modified version of the `regexp` package so the performance of the core regular expression evaluator is very similar; most of the difference is therefore associated with the overhead of reflection.
4 |
5 | These benchmarks were computed using `go test -bench=.` on a 2.8 GHz Intel Core i7 processor running OSX 10.10.5.
6 |
7 | The first benchmark involves finding the first floating point number in a string of a few thousand characters. `go-restructure` takes around 8% longer than the standard library:
8 |
9 | ```
10 | go-restructure 32428 ns/op
11 | stdlib/regexp 30060 ns/op
12 | ```
13 |
14 | The second benchmark involves parsing a short email address. `go-restructure` takes around
15 | 40% longer than the standard library:
16 |
17 | ```
18 | go-restructure 1188 ns/op
19 | stdlib/regexp 844 ns/op
20 | ```
21 |
22 | The third benchmark involves finding all python import statements in a file of around one hundred lines of python source. `go-restructure` takes around 2x longer than the standard library:
23 |
24 | ```
25 | go-restructure 695 ns/op
26 | stdlib/regexp 337 ns/op
27 | ```
28 |
29 | The high overhead for `go-restructure` on the last benchmark is probably due to `go-restructure` allocating a struct to hold the results of each match found by `FindAll`. In most cases this performance overhead will be a small price to pay for composable, inspectable regular expressions, particularly when it amounts to the difference between one third of a microsecond and two thirds of a microsecond. However, applications that execute a very large number of regular expressions for which performance is critical may be well advised to use the standard library `regexp` package directly.
30 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Alex Flint
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
Struct-based regular expressions for Go
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | ## Match regular expressions into struct fields
11 |
12 | ```shell
13 | go get github.com/alexflint/go-restructure
14 | ```
15 |
16 | This package allows you to express regular expressions by defining a struct, and then capture matched sub-expressions into struct fields. Here is a very simple email address parser:
17 |
18 | ```go
19 | import "github.com/alexflint/go-restructure"
20 |
21 | type EmailAddress struct {
22 | _ struct{} `^`
23 | User string `\w+`
24 | _ struct{} `@`
25 | Host string `[^@]+`
26 | _ struct{} `$`
27 | }
28 |
29 | func main() {
30 | var addr EmailAddress
31 | restructure.Find(&addr, "joe@example.com")
32 | fmt.Println(addr.User) // prints "joe"
33 | fmt.Println(addr.Host) // prints "example.com"
34 | }
35 | ```
36 | (Note that the above is far too simplistic to be used as a serious email address validator.)
37 |
38 | The regular expression that was executed was the concatenation of the struct tags:
39 |
40 | ```
41 | ^(\w+)@([^@]+)$
42 | ```
43 |
44 | The first submatch was inserted into the `User` field and the second into the `Host` field.
45 |
46 | You may also use the `regexp:` tag key, but keep in mind that you must escape quotes and backslashes:
47 |
48 | ```go
49 | type EmailAddress struct {
50 | _ string `regexp:"^"`
51 | User string `regexp:"\\w+"`
52 | _ string `regexp:"@"`
53 | Host string `regexp:"[^@]+"`
54 | _ string `regexp:"$"`
55 | }
56 | ```
57 |
58 | ### Nested Structs
59 |
60 | Here is a slightly more sophisticated email address parser that uses nested structs:
61 |
62 | ```go
63 | type Hostname struct {
64 | Domain string `\w+`
65 | _ struct{} `\.`
66 | TLD string `\w+`
67 | }
68 |
69 | type EmailAddress struct {
70 | _ struct{} `^`
71 | User string `[a-zA-Z0-9._%+-]+`
72 | _ struct{} `@`
73 | Host *Hostname
74 | _ struct{} `$`
75 | }
76 |
77 | func main() {
78 | var addr EmailAddress
79 | success, _ := restructure.Find(&addr, "joe@example.com")
80 | if success {
81 | fmt.Println(addr.User) // prints "joe"
82 | fmt.Println(addr.Host.Domain) // prints "example"
83 | fmt.Println(addr.Host.TLD) // prints "com"
84 | }
85 | }
86 | ```
87 |
88 | Compare this to using the standard library `regexp.FindStringSubmatchIndex` directly:
89 |
90 | ```go
91 | func main() {
92 | content := "joe@example.com"
93 | expr := regexp.MustCompile(`^([a-zA-Z0-9._%+-]+)@((\w+)\.(\w+))$`)
94 | indices := expr.FindStringSubmatchIndex(content)
95 | if len(indices) > 0 {
96 | userBegin, userEnd := indices[2], indices[3]
97 | var user string
98 | if userBegin != -1 && userEnd != -1 {
99 | user = content[userBegin:userEnd]
100 | }
101 |
102 | domainBegin, domainEnd := indices[6], indices[7]
103 | var domain string
104 | if domainBegin != -1 && domainEnd != -1 {
105 | domain = content[domainBegin:domainEnd]
106 | }
107 |
108 | tldBegin, tldEnd := indices[8], indices[9]
109 | var tld string
110 | if tldBegin != -1 && tldEnd != -1 {
111 | tld = content[tldBegin:tldEnd]
112 | }
113 |
114 | fmt.Println(user) // prints "joe"
115 | fmt.Println(domain) // prints "example"
116 | fmt.Println(tld) // prints "com"
117 | }
118 | }
119 | ```
120 |
121 | ### Ints
122 |
123 | It is also possible to set struct fields as `int` to get the string automatically converted.
124 |
125 | ```go
126 | // Matches "12 wombats", "1 wombat" and store the number as int
127 | type Wisdom struct {
128 | Number int `^\d+`
129 | _ string `\s+`
130 | Animal string `\w+`
131 | }
132 | ```
133 |
134 | ### Optional fields
135 |
136 | When nesting one struct within another, you can make the nested struct optional by marking it with `?`. The following example parses floating point numbers with optional sign and exponent:
137 |
138 | ```go
139 | // Matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123"
140 | type Float struct {
141 | Sign *Sign `?` // sign is optional
142 | Whole string `[0-9]*`
143 | Period struct{} `\.?`
144 | Frac string `[0-9]+`
145 | Exponent *Exponent `?` // exponent is optional
146 | }
147 |
148 | // Matches "e+4", "E6", "e-03"
149 | type Exponent struct {
150 | _ struct{} `[eE]`
151 | Sign *Sign `?` // sign is optional
152 | Num string `[0-9]+`
153 | }
154 |
155 | // Matches "+" or "-"
156 | type Sign struct {
157 | Ch string `[+-]`
158 | }
159 | ```
160 |
161 | When an optional sub-struct is not matched, it will be set to nil:
162 |
163 | ```javascript
164 | "1.23" -> {
165 | "Sign": nil,
166 | "Whole": "1",
167 | "Frac": "23",
168 | "Exponent": nil
169 | }
170 |
171 | "1.23e+45" -> {
172 | "Sign": nil,
173 | "Whole": "1",
174 | "Frac": "23",
175 | "Exponent": {
176 | "Sign": {
177 | "Ch": "+"
178 | },
179 | "Num": "45"
180 | }
181 | }
182 | ```
183 |
184 | ### Finding multiple matches
185 |
186 | The following example uses `Regexp.FindAll` to extract all floating point numbers from
187 | a string, using the same `Float` struct as in the example above.
188 |
189 | ```go
190 | src := "There are 10.4 cats for every 100 dogs in the United States."
191 | floatRegexp := restructure.MustCompile(Float{}, restructure.Options{})
192 | var floats []Float
193 | floatRegexp.FindAll(&floats, src, -1)
194 | ```
195 |
196 | To limit the number of matches set the third parameter to a positive number.
197 |
198 | ### Getting begin and end positions for submatches
199 |
200 | To get the begin and end position of submatches, use the `restructure.Submatch` struct in place of `string`:
201 |
202 | Here is an example of matching python imports such as `import foo as bar`:
203 |
204 | ```go
205 | type Import struct {
206 | _ struct{} `^import\s+`
207 | Package restructure.Submatch `\w+`
208 | _ struct{} `\s+as\s+`
209 | Alias restructure.Submatch `\w+`
210 | }
211 |
212 | var importRegexp = restructure.MustCompile(Import{}, restructure.Options{})
213 |
214 | func main() {
215 | var imp Import
216 | importRegexp.Find(&imp, "import foo as bar")
217 | fmt.Printf("IMPORT %s (bytes %d...%d)\n", imp.Package.String(), imp.Package.Begin, imp.Package.End)
218 | fmt.Printf(" AS %s (bytes %d...%d)\n", imp.Alias.String(), imp.Alias.Begin, imp.Alias.End)
219 | }
220 | ```
221 | Output:
222 | ```
223 | IMPORT foo (bytes 7...10)
224 | AS bar (bytes 14...17)
225 | ```
226 |
227 | ### Regular expressions inside JSON
228 |
229 | To run a regular expression as part of a JSON unmarshal, just implement the `json.Unmarshaler` interface. Here is an example that parses the following JSON string containing a quaternion:
230 |
231 | ```javascript
232 | {
233 | "Name": "foo",
234 | "Value": "1+2i+3j+4k"
235 | }
236 | ```
237 |
238 | First we define the expressions for matching quaternions in the form `1+2i+3j+4k`:
239 |
240 | ```go
241 | // Matches "1", "-12", "+12"
242 | type RealPart struct {
243 | Sign string `regexp:"[+-]?"`
244 | Real string `regexp:"[0-9]+"`
245 | }
246 |
247 | // Matches "+123", "-1"
248 | type SignedInt struct {
249 | Sign string `regexp:"[+-]"`
250 | Real string `regexp:"[0-9]+"`
251 | }
252 |
253 | // Matches "+12i", "-123i"
254 | type IPart struct {
255 | Magnitude SignedInt
256 | _ struct{} `regexp:"i"`
257 | }
258 |
259 | // Matches "+12j", "-123j"
260 | type JPart struct {
261 | Magnitude SignedInt
262 | _ struct{} `regexp:"j"`
263 | }
264 |
265 | // Matches "+12k", "-123k"
266 | type KPart struct {
267 | Magnitude SignedInt
268 | _ struct{} `regexp:"k"`
269 | }
270 |
271 | // matches "1+2i+3j+4k", "-1+2k", "-1", etc
272 | type Quaternion struct {
273 | Real *RealPart
274 | I *IPart `regexp:"?"`
275 | J *JPart `regexp:"?"`
276 | K *KPart `regexp:"?"`
277 | }
278 |
279 | // matches the quoted strings `"-1+2i"`, `"3-4i"`, `"12+34i"`, etc
280 | type QuotedQuaternion struct {
281 | _ struct{} `regexp:"^"`
282 | _ struct{} `regexp:"\""`
283 | Quaternion *Quaternion
284 | _ struct{} `regexp:"\""`
285 | _ struct{} `regexp:"$"`
286 | }
287 | ```
288 |
289 | Next we implement `UnmarshalJSON` for the `QuotedQuaternion` type:
290 | ```go
291 | var quaternionRegexp = restructure.MustCompile(QuotedQuaternion{}, restructure.Options{})
292 |
293 | func (c *QuotedQuaternion) UnmarshalJSON(b []byte) error {
294 | if !quaternionRegexp.Find(c, string(b)) {
295 | return fmt.Errorf("%s is not a quaternion", string(b))
296 | }
297 | return nil
298 | }
299 |
300 | ```
301 |
302 | Now we can define a struct and unmarshal JSON into it:
303 | ```go
304 | type Var struct {
305 | Name string
306 | Value *QuotedQuaternion
307 | }
308 |
309 | func main() {
310 | src := `{"name": "foo", "value": "1+2i+3j+4k"}`
311 | var v Var
312 | json.Unmarshal([]byte(src), &v)
313 | }
314 | ```
315 | The result is:
316 | ```javascript
317 | {
318 | "Name": "foo",
319 | "Value": {
320 | "Quaternion": {
321 | "Real": {
322 | "Sign": "",
323 | "Real": "1"
324 | },
325 | "I": {
326 | "Magnitude": {
327 | "Sign": "+",
328 | "Real": "2"
329 | }
330 | },
331 | "J": {
332 | "Magnitude": {
333 | "Sign": "+",
334 | "Real": "3"
335 | }
336 | },
337 | "K": {
338 | "Magnitude": {
339 | "Sign": "+",
340 | "Real": "4"
341 | }
342 | }
343 | }
344 | }
345 | }
346 | ```
347 |
348 | ### Index of examples
349 |
350 | - [Parse an email address](samples/simple-email/simple-email.go)
351 | - [Parse an email address using nested structs](samples/email-address/email-address.go)
352 | - [Parse a floating point number](samples/floating-point/floating-point.go)
353 | - [Find all floats in a string](samples/find-all-floats/find-all-floats.go)
354 | - [Parse a dotted name](samples/name-dot-name/name-dot-name.go)
355 | - [Parse a python import statement](samples/python-import/python-import.go)
356 | - [Regular expression inside a JSON struct](samples/quaternion-in-json/quaternion-in-json.go)
357 |
358 | ### Benchmarks
359 |
360 | See [benchmarks document](BENCHMARKS.md)
361 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | ## TODO
2 | - repeated subexpressions
3 | - optional terminal matches (look at top node in AST)
4 | - remove OpCaptures from terminals
5 |
--------------------------------------------------------------------------------
/benchmark_test.go:
--------------------------------------------------------------------------------
1 | package restructure
2 |
3 | import (
4 | "io/ioutil"
5 | "os"
6 | "regexp"
7 | "testing"
8 | )
9 |
10 | var src = `
11 | The US economy went through an economic downturn following the financial
12 | crisis of 2007–08, with output as late as 2013 still below potential
13 | according to the Congressional Budget Office.[57] The economy, however,
14 | began to recover in the second half of 2009, and as of November 2015,
15 | unemployment had declined from a high of 10% to 5%; the government's
16 | broader U-6 unemployment rate, which includes the part-time underemployed,
17 | was 9.8% (it had reached 16% in 2009).[13] At 11.3%, the U.S. has one of
18 | the lowest labor union participation rates in the OECD.[58] Households
19 | living on less than $2 per day before government benefits, doubled from
20 | 1996 levels to 1.5 million households in 2011, including 2.8 million
21 | children.[59] The gap in income between rich and poor is greater in the
22 | United States than in any other developed country.[60] Total public and
23 | private debt was $50 trillion at the end of the first quarter of 2010,
24 | or 3.5 times GDP.[61] In December 2014, public debt was slightly more
25 | than 100% of GDP.[62] Domestic financial assets totaled $131 trillion
26 | and domestic financial liabilities totaled $106 trillion.[63]
27 | `
28 |
29 | // Matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123"
30 | type Float struct {
31 | Sign *Sign `?`
32 | Whole string `[0-9]*`
33 | Period struct{} `\.?`
34 | Frac string `[0-9]+`
35 | Exponent *Exponent `?`
36 | }
37 |
38 | // Matches "+" or "-"
39 | type Sign struct {
40 | Ch string `[+-]`
41 | }
42 |
43 | // Matches "e+4", "E6", "e-03"
44 | type Exponent struct {
45 | _ struct{} `[eE]`
46 | Sign *Sign `?`
47 | Num string `[0-9]+`
48 | }
49 |
50 | func BenchmarkFindFloat(b *testing.B) {
51 | pattern := MustCompile(Float{}, Options{})
52 | var f Float
53 | b.ResetTimer()
54 | for i := 0; i < b.N; i++ {
55 | pattern.Find(&f, src)
56 | }
57 | }
58 |
59 | func BenchmarkFindFloatStdlib(b *testing.B) {
60 | pattern := regexp.MustCompile(`((?P((?P[\+\-]))?)(?P[0-9]*)(?P\.?)(?P[0-9]+)(?P((?i:E)(?P((?P[\+\-]))?)(?P[0-9]+))?))`)
61 | b.ResetTimer()
62 | for i := 0; i < b.N; i++ {
63 | pattern.FindSubmatch([]byte(src))
64 | }
65 | }
66 |
67 | type EmailAddress struct {
68 | _ struct{} `^`
69 | User string `[a-zA-Z0-9._%+-]+`
70 | _ struct{} `@`
71 | Host string `.+`
72 | _ struct{} `$`
73 | }
74 |
75 | func BenchmarkParseEmail(b *testing.B) {
76 | var addr EmailAddress
77 | pattern := MustCompile(EmailAddress{}, Options{})
78 | b.ResetTimer()
79 | for i := 0; i < b.N; i++ {
80 | pattern.Find(&addr, "joe@example.com")
81 | }
82 | }
83 |
84 | func BenchmarkParseEmailStdlib(b *testing.B) {
85 | //pattern := regexp.MustCompile(`(\A(?P[%\+\--\.0-9A-Z_a-z]+)@(?P((?P[0-9A-Z_a-z]+)\.(?P[0-9A-Z_a-z]+)))(?-m:$))`)
86 | pattern := regexp.MustCompile(`(\A(?P[%\+\--\.0-9A-Z_a-z]+)@(?P.+)(?-m:$))`)
87 | b.ResetTimer()
88 | for i := 0; i < b.N; i++ {
89 | pattern.FindStringSubmatch("joe@example.com")
90 | }
91 | }
92 |
93 | // Import matches "import foo" and "import foo as bar"
94 | type Import struct {
95 | _ struct{} `^import\s+`
96 | Package Submatch `\w+`
97 | Alias *AsName `?`
98 | _ struct{} `$`
99 | }
100 |
101 | // AsName matches "as xyz"
102 | type AsName struct {
103 | _ struct{} `\s+as\s+`
104 | Name Submatch `\w+`
105 | }
106 |
107 | func BenchmarkFindAllImports(b *testing.B) {
108 | path := os.Getenv("TESTDATA")
109 | if path == "" {
110 | b.Skip("skipping because TESTDATA environment var was not set")
111 | }
112 | buf, err := ioutil.ReadFile(path)
113 | if err != nil {
114 | b.Error(err)
115 | }
116 | pattern := MustCompile(Import{}, Options{})
117 | var imports []Import
118 | b.ResetTimer()
119 | for i := 0; i < b.N; i++ {
120 | pattern.FindAll(&imports, string(buf), -1)
121 | }
122 | }
123 |
// BenchmarkFindAllImportsStdlib is the standard-library counterpart of
// BenchmarkFindAllImports: it times regexp.FindAllSubmatchIndex over the
// contents of the file named by the TESTDATA environment variable.
//
// The benchmark is skipped when TESTDATA is unset and aborted when the file
// cannot be read.
func BenchmarkFindAllImportsStdlib(b *testing.B) {
	path := os.Getenv("TESTDATA")
	if path == "" {
		b.Skip("skipping because TESTDATA environment var was not set")
	}
	buf, err := ioutil.ReadFile(path)
	if err != nil {
		// Stop here: continuing would time the search over an empty buffer
		// and report a meaningless number alongside the failure.
		b.Fatal(err)
	}
	// NOTE(review): the named groups below read "(?P[" with no "<name>" part,
	// which looks like it was lost when this copy was produced - confirm
	// against the repository before relying on this literal.
	pattern := regexp.MustCompile(`(\Aimport[\t-\n\f-\r ]+(?P[0-9A-Z_a-z]+)(?P([\t-\n\f-\r ]+as[\t-\n\f-\r ]+(?P[0-9A-Z_a-z]+))?)(?-m:$))`)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		pattern.FindAllSubmatchIndex(buf, -1)
	}
}
139 |
--------------------------------------------------------------------------------
/builder.go:
--------------------------------------------------------------------------------
1 | package restructure
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "reflect"
7 | "regexp/syntax"
8 | "strings"
9 | )
10 |
11 | // A Role determines how a struct field is inflated
12 | type Role int
13 |
14 | const (
15 | EmptyRole Role = iota
16 | PosRole
17 | SubstructRole
18 | StringScalarRole
19 | IntScalarRole
20 | ByteSliceScalarRole
21 | SubmatchScalarRole
22 | )
23 |
24 | // A Struct describes how to inflate a match into a struct
25 | type Struct struct {
26 | capture int
27 | fields []*Field
28 | }
29 |
30 | // A Field describes how to inflate a match into a field
31 | type Field struct {
32 | capture int // index of the capture for this field
33 | index []int // index of this field within its parent struct
34 | child *Struct // descendant struct; nil for terminals
35 | role Role
36 | }
37 |
// isExported reports whether the struct field f is exported, which reflect
// signals by leaving PkgPath empty.
func isExported(f reflect.StructField) bool {
	return len(f.PkgPath) == 0
}
41 |
42 | // A builder builds stencils from structs using reflection
43 | type builder struct {
44 | numCaptures int
45 | opts Options
46 | }
47 |
48 | func newBuilder(opts Options) *builder {
49 | return &builder{
50 | opts: opts,
51 | }
52 | }
53 |
54 | func (b *builder) nextCaptureIndex() int {
55 | k := b.numCaptures
56 | b.numCaptures++
57 | return k
58 | }
59 |
60 | func (b *builder) extractTag(tag reflect.StructTag) (string, error) {
61 | // Allow tags that look like either `regexp:"\\w+"` or just `\w+`
62 | if s := tag.Get("regexp"); s != "" {
63 | return s, nil
64 | } else if strings.Contains(string(tag), `regexp:"`) {
65 | return "", errors.New("incorrectly escaped struct tag")
66 | } else {
67 | return string(tag), nil
68 | }
69 | }
70 |
// removeCaptures replaces a capture node with its sub-expressions and returns
// any other node unchanged, wrapped in a one-element slice. The error result
// is always nil.
func removeCaptures(expr *syntax.Regexp) ([]*syntax.Regexp, error) {
	switch expr.Op {
	case syntax.OpCapture:
		return expr.Sub, nil
	default:
		return []*syntax.Regexp{expr}, nil
	}
}
77 |
78 | func (b *builder) terminal(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) {
79 | pattern, err := b.extractTag(f.Tag)
80 | if err != nil {
81 | return nil, nil, fmt.Errorf("%s: %v", fullName, err)
82 | }
83 | if pattern == "" {
84 | return nil, nil, nil
85 | }
86 |
87 | // Parse the pattern
88 | expr, err := syntax.Parse(pattern, b.opts.SyntaxFlags)
89 | if err != nil {
90 | return nil, nil, fmt.Errorf(`%s: %v (pattern was "%s")`, fullName, err, f.Tag)
91 | }
92 |
93 | // Remove capture nodes within the AST
94 | expr, err = transform(expr, removeCaptures)
95 | if err != nil {
96 | return nil, nil, fmt.Errorf(`failed to remove captures from "%s": %v`, pattern, err)
97 | }
98 |
99 | // Determine the kind
100 | t := f.Type
101 | if t.Kind() == reflect.Ptr {
102 | t = t.Elem()
103 | }
104 | var role Role
105 | switch t {
106 | case emptyType:
107 | role = EmptyRole
108 | case stringType:
109 | role = StringScalarRole
110 | case intType:
111 | role = IntScalarRole
112 | case byteSliceType:
113 | role = ByteSliceScalarRole
114 | case submatchType:
115 | role = SubmatchScalarRole
116 | }
117 |
118 | captureIndex := -1
119 | if isExported(f) {
120 | captureIndex = b.nextCaptureIndex()
121 | expr = &syntax.Regexp{
122 | Op: syntax.OpCapture,
123 | Sub: []*syntax.Regexp{expr},
124 | Name: f.Name,
125 | Cap: captureIndex,
126 | }
127 | }
128 | field := &Field{
129 | index: f.Index,
130 | capture: captureIndex,
131 | role: role,
132 | }
133 |
134 | return field, expr, nil
135 | }
136 |
137 | func (b *builder) pos(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) {
138 | if !isExported(f) {
139 | return nil, nil, nil
140 | }
141 | captureIndex := b.nextCaptureIndex()
142 | empty := &syntax.Regexp{
143 | Op: syntax.OpEmptyMatch,
144 | }
145 | expr := &syntax.Regexp{
146 | Op: syntax.OpCapture,
147 | Sub: []*syntax.Regexp{empty},
148 | Name: f.Name,
149 | Cap: captureIndex,
150 | }
151 | field := &Field{
152 | index: f.Index,
153 | capture: captureIndex,
154 | role: PosRole,
155 | }
156 |
157 | return field, expr, nil
158 | }
159 |
160 | func (b *builder) nonterminal(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) {
161 | opstr, err := b.extractTag(f.Tag)
162 | if err != nil {
163 | return nil, nil, err
164 | }
165 | child, expr, err := b.structure(f.Type)
166 | if err != nil {
167 | return nil, nil, err
168 | }
169 |
170 | switch opstr {
171 | case "?":
172 | if f.Type.Kind() != reflect.Ptr {
173 | return nil, nil, fmt.Errorf(`%s is marked with "?" but is not a pointer`, fullName)
174 | }
175 | expr = &syntax.Regexp{
176 | Sub: []*syntax.Regexp{expr},
177 | Op: syntax.OpQuest,
178 | }
179 | case "":
180 | // nothing to do
181 | default:
182 | return nil, nil, fmt.Errorf("invalid op \"%s\" for non-slice field on %s", opstr, fullName)
183 | }
184 |
185 | captureIndex := b.nextCaptureIndex()
186 | expr = &syntax.Regexp{
187 | Op: syntax.OpCapture,
188 | Sub: []*syntax.Regexp{expr},
189 | Name: f.Name,
190 | Cap: captureIndex,
191 | }
192 | field := &Field{
193 | index: f.Index,
194 | capture: captureIndex,
195 | child: child,
196 | role: SubstructRole,
197 | }
198 |
199 | return field, expr, nil
200 | }
201 |
202 | func (b *builder) field(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) {
203 | if isScalar(f.Type) {
204 | return b.terminal(f, fullName)
205 | } else if isStruct(f.Type) {
206 | return b.nonterminal(f, fullName)
207 | } else if f.Type == posType {
208 | return b.pos(f, fullName)
209 | }
210 | return nil, nil, nil
211 | }
212 |
213 | func (b *builder) structure(t reflect.Type) (*Struct, *syntax.Regexp, error) {
214 | if t.Kind() == reflect.Ptr {
215 | t = t.Elem()
216 | }
217 |
218 | // Select a capture index first so that the struct comes before its fields
219 | captureIndex := b.nextCaptureIndex()
220 |
221 | var exprs []*syntax.Regexp
222 | var fields []*Field
223 | for i := 0; i < t.NumField(); i++ {
224 | f := t.Field(i)
225 | field, expr, err := b.field(f, t.Name()+"."+f.Name)
226 | if err != nil {
227 | return nil, nil, err
228 | }
229 | if field != nil {
230 | exprs = append(exprs, expr)
231 | fields = append(fields, field)
232 | }
233 | }
234 |
235 | // Wrap in a concat
236 | expr := &syntax.Regexp{
237 | Sub: exprs,
238 | Op: syntax.OpConcat,
239 | }
240 |
241 | // Wrap in a capture
242 | expr = &syntax.Regexp{
243 | Sub: []*syntax.Regexp{expr},
244 | Op: syntax.OpCapture,
245 | Cap: captureIndex,
246 | }
247 |
248 | st := &Struct{
249 | fields: fields,
250 | capture: captureIndex,
251 | }
252 |
253 | return st, expr, nil
254 | }
255 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/alexflint/go-restructure
2 |
3 | go 1.15
4 |
5 | require (
6 | github.com/stretchr/testify v1.7.0
7 | )
8 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/alexflint/go-restructure v0.0.0-20160131054339-a509d071de28 h1:p32gFVhF4WnI/qpSpZ0//GGb6BAAFLVnkd4Vowg7im8=
2 | github.com/alexflint/go-restructure v0.0.0-20160131054339-a509d071de28/go.mod h1:8Mq15S+jJn5TWrSU0Ua7L8rFWmY06lu0UCbhJrrcGBY=
3 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
5 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
7 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
8 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
9 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
10 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
11 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
12 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
13 |
--------------------------------------------------------------------------------
/inflate.go:
--------------------------------------------------------------------------------
1 | package restructure
2 |
3 | import (
4 | "fmt"
5 | "reflect"
6 | "strconv"
7 | )
8 |
9 | var (
10 | posType = reflect.TypeOf(Pos(0))
11 |
12 | emptyType = reflect.TypeOf(struct{}{})
13 | stringType = reflect.TypeOf("")
14 | intType = reflect.TypeOf(1)
15 | byteSliceType = reflect.TypeOf([]byte{})
16 | submatchType = reflect.TypeOf(Submatch{})
17 | scalarTypes = []reflect.Type{
18 | emptyType,
19 | stringType,
20 | intType,
21 | byteSliceType,
22 | submatchType,
23 | }
24 | )
25 |
26 | // determines whether t is a scalar type or a pointer to a scalar type
27 | func isScalar(t reflect.Type) bool {
28 | if t.Kind() == reflect.Ptr {
29 | t = t.Elem()
30 | }
31 | for _, u := range scalarTypes {
32 | if t == u {
33 | return true
34 | }
35 | }
36 | return false
37 | }
38 |
// isStruct reports whether t is a struct type or a pointer to a struct type.
// Only one level of pointer is looked through.
func isStruct(t reflect.Type) bool {
	kind := t.Kind()
	if kind == reflect.Ptr {
		kind = t.Elem().Kind()
	}
	return kind == reflect.Struct
}
46 |
// ensureAlloc returns the value that dest refers to. A non-pointer dest is
// returned unchanged. A pointer dest is dereferenced, after first being set
// to a newly allocated element if it is nil.
func ensureAlloc(dest reflect.Value) reflect.Value {
	if dest.Kind() != reflect.Ptr {
		return dest
	}
	if dest.IsNil() {
		fresh := reflect.New(dest.Type().Elem())
		dest.Set(fresh)
	}
	return dest.Elem()
}
57 |
58 | // inflate the results of a match into a string
59 | func inflateScalar(dest reflect.Value, match *match, captureIndex int, role Role) error {
60 | if captureIndex == -1 {
61 | // This means the field generated a regex but we did not want the results
62 | return nil
63 | }
64 |
65 | // Get the subcapture for this field
66 | subcapture := match.captures[captureIndex]
67 | if !subcapture.wasMatched() {
68 | // This means the subcapture was optional and was not matched
69 | return nil
70 | }
71 |
72 | // Get the matched bytes
73 | buf := match.input[subcapture.begin:subcapture.end]
74 |
75 | // If dest is a nil pointer then allocate a new instance and assign the pointer to dest
76 | dest = ensureAlloc(dest)
77 |
78 | // Deal with each recognized type
79 | switch role {
80 | case StringScalarRole:
81 | dest.SetString(string(buf))
82 | return nil
83 | case IntScalarRole:
84 | if intVal, err := strconv.Atoi(string(buf)); err != nil {
85 | return fmt.Errorf("unable to capture into %s", dest.Type().String())
86 | } else {
87 | dest.SetInt(int64(intVal))
88 | return nil
89 | }
90 | case ByteSliceScalarRole:
91 | dest.SetBytes(buf)
92 | return nil
93 | case SubmatchScalarRole:
94 | submatch := dest.Addr().Interface().(*Submatch)
95 | submatch.Begin = Pos(subcapture.begin)
96 | submatch.End = Pos(subcapture.end)
97 | submatch.Bytes = buf
98 | return nil
99 | }
100 | return fmt.Errorf("unable to capture into %s", dest.Type().String())
101 | }
102 |
103 | // inflate the position of a match into a Pos
104 | func inflatePos(dest reflect.Value, match *match, captureIndex int) error {
105 | if captureIndex == -1 {
106 | // This means the field generated a regex but we did not want the results
107 | return nil
108 | }
109 |
110 | // Get the subcapture for this field
111 | subcapture := match.captures[captureIndex]
112 | if !subcapture.wasMatched() {
113 | // This means the subcapture was optional and was not matched
114 | return nil
115 | }
116 |
117 | // If dest is a nil pointer then allocate a new instance and assign the pointer to dest
118 | dest.SetInt(int64(subcapture.begin))
119 | return nil
120 | }
121 |
122 | // inflateStruct fills the fields of dest from the captures recorded in match,
// walking the field descriptions in structure and recursing into sub-structs.
// If the struct's own capture was not matched, dest is left untouched and nil
// is returned. Fields whose role is not handled by the switch below are
// skipped. The first error from a field helper stops the walk and is returned.
123 | func inflateStruct(dest reflect.Value, match *match, structure *Struct) error {
124 | // Get the subcapture for this field
125 | subcapture := match.captures[structure.capture]
126 | if !subcapture.wasMatched() {
127 | return nil
128 | }
129 |
130 | // If the field is a nil pointer then allocate an instance and assign pointer to dest
131 | dest = ensureAlloc(dest)
132 |
133 | // Inflate values into the struct fields
134 | for _, field := range structure.fields {
135 | switch field.role {
136 | case PosRole:
// Position fields receive only the start offset of their capture.
137 | val := dest.FieldByIndex(field.index)
138 | if err := inflatePos(val, match, field.capture); err != nil {
139 | return err
140 | }
141 | case StringScalarRole, ByteSliceScalarRole, SubmatchScalarRole, IntScalarRole:
// Scalar fields are converted from the captured bytes according to their role.
142 | val := dest.FieldByIndex(field.index)
143 | if err := inflateScalar(val, match, field.capture, field.role); err != nil {
144 | return err
145 | }
146 | case SubstructRole:
// Nested structs carry their own field list in field.child.
147 | val := dest.FieldByIndex(field.index)
148 | if err := inflateStruct(val, match, field.child); err != nil {
149 | return err
150 | }
151 | }
152 | }
153 | return nil
154 | }
155 |
--------------------------------------------------------------------------------
/regex/README.md:
--------------------------------------------------------------------------------
1 | This directory contains a slightly modified version of the Go 1.5.2 standard library `regexp` package.
--------------------------------------------------------------------------------
/regex/backtrack.go:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // backtrack is a regular expression search with submatch
6 | // tracking for small regular expressions and texts. It allocates
7 | // a bit vector with (length of input) * (length of prog) bits,
8 | // to make sure it never explores the same (character position, instruction)
9 | // state multiple times. This limits the search to run in time linear in
10 | // the length of the test.
11 | //
12 | // backtrack is a fast replacement for the NFA code on small
13 | // regexps when onepass cannot be used.
14 |
15 | package regex
16 |
17 | import "regexp/syntax"
18 |
19 | // A job is an entry on the backtracker's job stack. It holds
20 | // the instruction pc and the position in the input.
// arg is 0 for a fresh visit; push and tryBacktrack use a non-zero arg to mark
// a job that resumes an earlier visit of the same instruction.
21 | type job struct {
22 | pc uint32
23 | arg int
24 | pos int
25 | }
26 |
// Size limits and layout constants for the backtracker.
27 | const (
28 | visitedBits = 32 // width in bits of one element of bitState.visited
29 | maxBacktrackProg = 500 // len(prog.Inst) <= max
30 | maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
31 | )
32 |
33 | // bitState holds state for the backtracker.
34 | type bitState struct {
35 | prog *syntax.Prog // program being executed; set by newBitState
36 |
37 | end int // end position of the current search; set by reset
38 | cap []int // capture registers; reset fills them with -1
39 | input input
40 | jobs []job // pending work, used as a stack by push and tryBacktrack
41 | visited []uint32 // bit set of (pc, pos) pairs already explored; see shouldVisit
42 | }
43 |
// notBacktrack is the nil *bitState that newBitState returns when the program
// is too long for the backtracker.
44 | var notBacktrack *bitState = nil
45 |
46 | // maxBitStateLen returns the maximum length of a string to search with
47 | // the backtracker using prog.
// It returns 0 when shouldBacktrack rejects prog; otherwise the bit-vector
// budget maxBacktrackVector is divided by the number of instructions.
// NOTE(review): a prog with zero instructions would divide by zero here;
// presumably the compiler never produces one - confirm.
48 | func maxBitStateLen(prog *syntax.Prog) int {
49 | if !shouldBacktrack(prog) {
50 | return 0
51 | }
52 | return maxBacktrackVector / len(prog.Inst)
53 | }
54 |
55 | // newBitState returns a new bitState for the given prog,
56 | // or notBacktrack if the size of the prog exceeds the maximum size that
57 | // the backtracker will be run for.
// Only the prog field is initialised; the remaining fields are set up by reset.
58 | func newBitState(prog *syntax.Prog) *bitState {
59 | if !shouldBacktrack(prog) {
60 | return notBacktrack
61 | }
62 | return &bitState{
63 | prog: prog,
64 | }
65 | }
66 |
67 | // shouldBacktrack reports whether the program is short enough for the
68 | // backtracker to run, i.e. it has at most maxBacktrackProg instructions.
69 | func shouldBacktrack(prog *syntax.Prog) bool {
70 | return len(prog.Inst) <= maxBacktrackProg
71 | }
72 |
73 | // reset resets the state of the backtracker.
74 | // end is the end position in the input.
75 | // ncap is the number of captures.
// Existing backing arrays for jobs, visited and cap are reused when they are
// large enough; otherwise new ones are allocated.
76 | func (b *bitState) reset(end int, ncap int) {
77 | b.end = end
78 |
79 | if cap(b.jobs) == 0 {
80 | b.jobs = make([]job, 0, 256)
81 | } else {
82 | b.jobs = b.jobs[:0]
83 | }
84 |
// One bit per (instruction, position) pair with positions 0..end inclusive,
// rounded up to a whole number of visitedBits-wide words.
85 | visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
86 | if cap(b.visited) < visitedSize {
// A freshly made slice is already zeroed, so no clearing is needed here.
87 | b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
88 | } else {
89 | b.visited = b.visited[:visitedSize]
90 | for i := range b.visited {
91 | b.visited[i] = 0
92 | }
93 | }
94 |
95 | if cap(b.cap) < ncap {
96 | b.cap = make([]int, ncap)
97 | } else {
98 | b.cap = b.cap[:ncap]
99 | }
// -1 marks a capture register as unset.
100 | for i := range b.cap {
101 | b.cap[i] = -1
102 | }
103 | }
104 |
105 | // shouldVisit reports whether the combination of (pc, pos) has not
106 | // been visited yet.
// As a side effect it marks the pair as visited, so a second call with the
// same arguments returns false.
107 | func (b *bitState) shouldVisit(pc uint32, pos int) bool {
// Flatten (pc, pos) into a single bit index; each pc owns end+1 consecutive bits.
108 | n := uint(int(pc)*(b.end+1) + pos)
109 | if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
110 | return false
111 | }
112 | b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
113 | return true
114 | }
115 |
116 | // push pushes (pc, pos, arg) onto the job stack if it should be
117 | // visited.
// Jobs that target an InstFail instruction are dropped without being recorded.
118 | func (b *bitState) push(pc uint32, pos int, arg int) {
119 | if b.prog.Inst[pc].Op == syntax.InstFail {
120 | return
121 | }
122 |
123 | // Only check shouldVisit when arg == 0.
124 | // When arg > 0, we are continuing a previous visit.
125 | if arg == 0 && !b.shouldVisit(pc, pos) {
126 | return
127 | }
128 |
129 | b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
130 | }
131 |
132 | // tryBacktrack runs a backtracking search starting at pos.
// It reports whether a match was found and records the result in m.matched;
// on success the capture registers are copied into m.matchcap. The visited
// set in b is not cleared here, so states explored by earlier calls since the
// last reset are not explored again.
133 | func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
134 | longest := m.re.longest
135 | m.matched = false
136 |
137 | b.push(pc, pos, 0)
138 | for len(b.jobs) > 0 {
139 | l := len(b.jobs) - 1
140 | // Pop job off the stack.
141 | pc := b.jobs[l].pc
142 | pos := b.jobs[l].pos
143 | arg := b.jobs[l].arg
144 | b.jobs = b.jobs[:l]
145 |
146 | // Optimization: rather than push and pop,
147 | // code that is going to push and continue
148 | // the loop simply updates pc, pos, and arg
149 | // and jumps to CheckAndLoop. We have to
150 | // do the shouldVisit check that push
151 | // would have, but we avoid the stack
152 | // manipulation.
// A job popped from the stack was already checked by push, hence the jump
// over the check.
153 | goto Skip
154 | CheckAndLoop:
155 | if !b.shouldVisit(pc, pos) {
156 | continue
157 | }
158 | Skip:
159 |
160 | inst := b.prog.Inst[pc]
161 |
162 | switch inst.Op {
163 | default:
164 | panic("bad inst")
165 | case syntax.InstFail:
166 | panic("unexpected InstFail")
167 | case syntax.InstAlt:
168 | // Cannot just
169 | // b.push(inst.Out, pos, 0)
170 | // b.push(inst.Arg, pos, 0)
171 | // If during the processing of inst.Out, we encounter
172 | // inst.Arg via another path, we want to process it then.
173 | // Pushing it here will inhibit that. Instead, re-push
174 | // inst with arg==1 as a reminder to push inst.Arg out
175 | // later.
176 | switch arg {
177 | case 0:
178 | b.push(pc, pos, 1)
179 | pc = inst.Out
180 | goto CheckAndLoop
181 | case 1:
182 | // Finished inst.Out; try inst.Arg.
183 | arg = 0
184 | pc = inst.Arg
185 | goto CheckAndLoop
186 | }
187 | panic("bad arg in InstAlt")
188 |
189 | case syntax.InstAltMatch:
190 | // One opcode consumes runes; the other leads to match.
191 | switch b.prog.Inst[inst.Out].Op {
192 | case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
193 | // inst.Arg is the match.
194 | b.push(inst.Arg, pos, 0)
195 | pc = inst.Arg
196 | pos = b.end
197 | goto CheckAndLoop
198 | }
199 | // inst.Out is the match - non-greedy
200 | b.push(inst.Out, b.end, 0)
201 | pc = inst.Out
202 | goto CheckAndLoop
203 |
204 | case syntax.InstRune:
205 | r, width := i.step(pos)
206 | if !inst.MatchRune(r) {
207 | continue
208 | }
209 | pos += width
210 | pc = inst.Out
211 | goto CheckAndLoop
212 |
213 | case syntax.InstRune1:
214 | r, width := i.step(pos)
215 | if r != inst.Rune[0] {
216 | continue
217 | }
218 | pos += width
219 | pc = inst.Out
220 | goto CheckAndLoop
221 |
222 | case syntax.InstRuneAnyNotNL:
223 | r, width := i.step(pos)
224 | if r == '\n' || r == endOfText {
225 | continue
226 | }
227 | pos += width
228 | pc = inst.Out
229 | goto CheckAndLoop
230 |
231 | case syntax.InstRuneAny:
232 | r, width := i.step(pos)
233 | if r == endOfText {
234 | continue
235 | }
236 | pos += width
237 | pc = inst.Out
238 | goto CheckAndLoop
239 |
240 | case syntax.InstCapture:
241 | switch arg {
242 | case 0:
// inst.Arg is unsigned (it is compared against a uint32 below), so the
// 0 <= test is always true; only the upper bound filters anything.
243 | if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
244 | // Capture pos to register, but save old value.
245 | b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
246 | b.cap[inst.Arg] = pos
247 | }
248 | pc = inst.Out
249 | goto CheckAndLoop
250 | case 1:
251 | // Finished inst.Out; restore the old value.
// For this job pos is not an input position: it carries the saved register
// value that case 0 pushed.
252 | b.cap[inst.Arg] = pos
253 | continue
254 |
255 | }
256 | panic("bad arg in InstCapture")
257 | continue // unreachable: follows an unconditional panic
258 |
259 | case syntax.InstEmptyWidth:
// Give up on this path if any required empty-width condition is missing
// from the context at pos.
260 | if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
261 | continue
262 | }
263 | pc = inst.Out
264 | goto CheckAndLoop
265 |
266 | case syntax.InstNop:
267 | pc = inst.Out
268 | goto CheckAndLoop
269 |
270 | case syntax.InstMatch:
271 | // We found a match. If the caller doesn't care
272 | // where the match is, no point going further.
273 | if len(b.cap) == 0 {
274 | m.matched = true
275 | return m.matched
276 | }
277 |
278 | // Record best match so far.
279 | // Only need to check end point, because this entire
280 | // call is only considering one start position.
281 | if len(b.cap) > 1 {
282 | b.cap[1] = pos
283 | }
284 | if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
285 | copy(m.matchcap, b.cap)
286 | }
287 | m.matched = true
288 |
289 | // If going for first match, we're done.
290 | if !longest {
291 | return m.matched
292 | }
293 |
294 | // If we used the entire text, no longer match is possible.
295 | if pos == b.end {
296 | return m.matched
297 | }
298 |
299 | // Otherwise, continue on in hope of a longer match.
300 | continue
301 | }
302 | panic("unreachable")
303 | }
304 |
305 | return m.matched
306 | }
307 |
308 | // backtrack runs a backtracking search of prog on the input starting at pos.
// end is the end position handed to the bit state and ncap the number of
// capture registers to track. It reports whether a match was found; on
// success m.matchcap holds the captures. It panics if the input cannot
// check prefixes (a RuneReader-backed input).
309 | func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
310 | if !i.canCheckPrefix() {
311 | panic("backtrack called for a RuneReader")
312 | }
313 |
314 | startCond := m.re.cond
315 | if startCond == ^syntax.EmptyOp(0) { // impossible
316 | return false
317 | }
318 | if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
319 | // Anchored match, past beginning of text.
320 | return false
321 | }
322 |
323 | b := m.b
324 | b.reset(end, ncap)
325 |
326 | m.matchcap = m.matchcap[:ncap]
// The loop variable below shadows the input parameter i for the loop only.
327 | for i := range m.matchcap {
328 | m.matchcap[i] = -1
329 | }
330 |
331 | // Anchored search must start at the beginning of the input
332 | if startCond&syntax.EmptyBeginText != 0 {
333 | if len(b.cap) > 0 {
334 | b.cap[0] = pos
335 | }
336 | return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
337 | }
338 |
339 | // Unanchored search, starting from each possible text position.
340 | // Notice that we have to try the empty string at the end of
341 | // the text, so the loop condition is pos <= end, not pos < end.
342 | // This looks like it's quadratic in the size of the text,
343 | // but we are not clearing visited between calls to tryBacktrack,
344 | // so no work is duplicated and it ends up still being linear.
// width starts at -1 so the first iteration runs; a step of width 0 ends the loop.
345 | width := -1
346 | for ; pos <= end && width != 0; pos += width {
347 | if len(m.re.prefix) > 0 {
348 | // Match requires literal prefix; fast search for it.
349 | advance := i.index(m.re, pos)
350 | if advance < 0 {
351 | return false
352 | }
353 | pos += advance
354 | }
355 |
356 | if len(b.cap) > 0 {
357 | b.cap[0] = pos
358 | }
359 | if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
360 | // Match must be leftmost; done.
361 | return true
362 | }
363 | _, width = i.step(pos)
364 | }
365 | return false
366 | }
367 |
--------------------------------------------------------------------------------
/regex/machine.go:
--------------------------------------------------------------------------------
1 | // Copyright 2011 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package regex
6 |
7 | import (
8 | "io"
9 | "regexp/syntax"
10 | )
11 |
12 | // A queue is a 'sparse array' holding pending threads of execution.
13 | // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
// sparse is indexed by pc and holds an index into dense; an entry is present
// only when dense at that index names the same pc (see machine.add).
14 | type queue struct {
15 | sparse []uint32
16 | dense []entry
17 | }
18 |
19 | // An entry is an entry on a queue.
20 | // It holds both the instruction pc and the actual thread.
21 | // Some queue entries are just place holders so that the machine
22 | // knows it has considered that pc. Such entries have t == nil.
23 | type entry struct {
24 | pc uint32
25 | t *thread
26 | }
27 |
28 | // A thread is the state of a single path through the machine:
29 | // an instruction and a corresponding capture array.
30 | // See http://swtch.com/~rsc/regexp/regexp2.html
// Threads are recycled through machine.pool rather than allocated per use.
31 | type thread struct {
32 | inst *syntax.Inst
33 | cap []int
34 | }
35 |
36 | // A machine holds all the state during an NFA simulation for p.
// The same value also carries the backtracker state (b) and the cached input
// wrappers used by doExecute.
37 | type machine struct {
38 | re *Regexp // corresponding Regexp
39 | p *syntax.Prog // compiled program
40 | op *onePassProg // compiled onepass program, or notOnePass
41 | maxBitStateLen int // max length of string to search with bitstate
42 | b *bitState // state for backtracker, allocated lazily
43 | q0, q1 queue // two queues for runq, nextq
44 | pool []*thread // pool of available threads
45 | matched bool // whether a match was found
46 | matchcap []int // capture information for the match
47 |
48 | // cached inputs, to avoid allocation
49 | inputBytes inputBytes
50 | inputString inputString
51 | inputReader inputReader
52 | }
53 |
// newInputBytes points the machine's cached inputBytes at b and returns it
// as an input.
54 | func (m *machine) newInputBytes(b []byte) input {
55 | m.inputBytes.str = b
56 | return &m.inputBytes
57 | }
58 |
// newInputString points the machine's cached inputString at s and returns it
// as an input.
59 | func (m *machine) newInputString(s string) input {
60 | m.inputString.str = s
61 | return &m.inputString
62 | }
63 |
// newInputReader rebinds the machine's cached inputReader to r, resets its
// end-of-text flag and position, and returns it as an input.
64 | func (m *machine) newInputReader(r io.RuneReader) input {
65 | m.inputReader.r = r
66 | m.inputReader.atEOT = false
67 | m.inputReader.pos = 0
68 | return &m.inputReader
69 | }
70 |
71 | // progMachine returns a new machine running the prog p.
// Both queues are sized to the number of instructions. The re field is not
// set here; presumably the caller assigns it - verify against the caller.
72 | func progMachine(p *syntax.Prog, op *onePassProg) *machine {
73 | m := &machine{p: p, op: op}
74 | n := len(m.p.Inst)
75 | m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
76 | m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
// Always leave room for at least two capture slots (indices 0 and 1 are
// written by match and step).
77 | ncap := p.NumCap
78 | if ncap < 2 {
79 | ncap = 2
80 | }
// The backtracker is only considered when no one-pass program is available.
81 | if op == notOnePass {
82 | m.maxBitStateLen = maxBitStateLen(p)
83 | }
84 | m.matchcap = make([]int, ncap)
85 | return m
86 | }
87 |
// init reslices the capture arrays of every pooled thread, and m.matchcap,
// to length ncap so that they agree with the number of captures requested.
88 | func (m *machine) init(ncap int) {
89 | for _, t := range m.pool {
90 | t.cap = t.cap[:ncap]
91 | }
92 | m.matchcap = m.matchcap[:ncap]
93 | }
94 |
95 | // alloc allocates a new thread with the given instruction.
96 | // It uses the free pool if possible.
// A freshly created thread gets a capture slice with the same length and
// capacity as m.matchcap.
97 | func (m *machine) alloc(i *syntax.Inst) *thread {
98 | var t *thread
99 | if n := len(m.pool); n > 0 {
100 | t = m.pool[n-1]
101 | m.pool = m.pool[:n-1]
102 | } else {
103 | t = new(thread)
104 | t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
105 | }
106 | t.inst = i
107 | return t
108 | }
109 |
110 | // free returns t to the free pool.
// It also drops the references held by the cached input wrappers.
// NOTE: the call sites in clear and step have m.free commented out and append
// to m.pool directly, so the input references are not cleared on those paths.
111 | func (m *machine) free(t *thread) {
112 | m.inputBytes.str = nil
113 | m.inputString.str = ""
114 | m.inputReader.r = nil
115 | m.pool = append(m.pool, t)
116 | }
117 |
118 | // match runs the machine over the input starting at pos.
119 | // It reports whether a match was found.
120 | // If so, m.matchcap holds the submatch information.
// The two queues q0 and q1 alternate as run queue and next queue, swapping
// after every input rune.
121 | func (m *machine) match(i input, pos int) bool {
122 | startCond := m.re.cond
123 | if startCond == ^syntax.EmptyOp(0) { // impossible
124 | return false
125 | }
126 | m.matched = false
127 | for i := range m.matchcap {
128 | m.matchcap[i] = -1
129 | }
130 | runq, nextq := &m.q0, &m.q1
// r is the rune at pos and r1 the rune after it; r1 is needed to compute the
// empty-width flags that hold after r has been consumed.
131 | r, r1 := endOfText, endOfText
132 | width, width1 := 0, 0
133 | r, width = i.step(pos)
134 | if r != endOfText {
135 | r1, width1 = i.step(pos + width)
136 | }
137 | var flag syntax.EmptyOp
138 | if pos == 0 {
139 | flag = syntax.EmptyOpContext(-1, r)
140 | } else {
141 | flag = i.context(pos)
142 | }
143 | for {
144 | if len(runq.dense) == 0 {
145 | if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
146 | // Anchored match, past beginning of text.
147 | break
148 | }
149 | if m.matched {
150 | // Have match; finished exploring alternatives.
151 | break
152 | }
153 | if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
154 | // Match requires literal prefix; fast search for it.
155 | advance := i.index(m.re, pos)
156 | if advance < 0 {
157 | break
158 | }
159 | pos += advance
160 | r, width = i.step(pos)
161 | r1, width1 = i.step(pos + width)
162 | }
163 | }
// Until a match is found, seed a new thread at the current position so that
// later start positions are also tried.
164 | if !m.matched {
165 | if len(m.matchcap) > 0 {
166 | m.matchcap[0] = pos
167 | }
168 | m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
169 | }
170 | flag = syntax.EmptyOpContext(r, r1)
171 | m.step(runq, nextq, pos, pos+width, r, flag)
// A zero-width step ends the loop.
172 | if width == 0 {
173 | break
174 | }
175 | if len(m.matchcap) == 0 && m.matched {
176 | // Found a match and not paying attention
177 | // to where it is, so any match will do.
178 | break
179 | }
180 | pos += width
181 | r, width = r1, width1
182 | if r != endOfText {
183 | r1, width1 = i.step(pos + width)
184 | }
185 | runq, nextq = nextq, runq
186 | }
187 | m.clear(nextq)
188 | return m.matched
189 | }
190 |
191 | // clear frees all threads on the thread queue.
// The threads are appended straight to m.pool (m.free is deliberately not
// called) and the queue is then emptied.
192 | func (m *machine) clear(q *queue) {
193 | for _, d := range q.dense {
194 | if d.t != nil {
195 | // m.free(d.t)
196 | m.pool = append(m.pool, d.t)
197 | }
198 | }
199 | q.dense = q.dense[:0]
200 | }
201 |
202 | // step executes one step of the machine, running each of the threads
203 | // on runq and appending new threads to nextq.
204 | // The step processes the rune c (which may be endOfText),
205 | // which starts at position pos and ends at nextPos.
206 | // nextCond gives the setting for the empty-width flags after c.
// runq is empty when step returns; finished threads go back to m.pool.
207 | func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) {
208 | longest := m.re.longest
209 | for j := 0; j < len(runq.dense); j++ {
210 | d := &runq.dense[j]
211 | t := d.t
212 | if t == nil {
213 | continue
214 | }
// In longest mode, once a match exists, drop threads that started after the
// recorded match start.
215 | if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
216 | // m.free(t)
217 | m.pool = append(m.pool, t)
218 | continue
219 | }
220 | i := t.inst
221 | add := false
222 | switch i.Op {
223 | default:
224 | panic("bad inst")
225 |
226 | case syntax.InstMatch:
227 | if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
228 | t.cap[1] = pos
229 | copy(m.matchcap, t.cap)
230 | }
231 | if !longest {
232 | // First-match mode: cut off all lower-priority threads.
233 | for _, d := range runq.dense[j+1:] {
234 | if d.t != nil {
235 | // m.free(d.t)
236 | m.pool = append(m.pool, d.t)
237 | }
238 | }
// Truncating runq here also terminates the outer loop.
239 | runq.dense = runq.dense[:0]
240 | }
241 | m.matched = true
242 |
243 | case syntax.InstRune:
244 | add = i.MatchRune(c)
245 | case syntax.InstRune1:
246 | add = c == i.Rune[0]
247 | case syntax.InstRuneAny:
248 | add = true
249 | case syntax.InstRuneAnyNotNL:
250 | add = c != '\n'
251 | }
// add hands t over for reuse and returns nil if it kept the thread, in which
// case t must not be returned to the pool below.
252 | if add {
253 | t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
254 | }
255 | if t != nil {
256 | // m.free(t)
257 | m.pool = append(m.pool, t)
258 | }
259 | }
260 | runq.dense = runq.dense[:0]
261 | }
262 |
263 | // add adds an entry to q for pc, unless the q already has such an entry.
264 | // It also recursively adds an entry for all instructions reachable from pc by following
265 | // empty-width conditions satisfied by cond. pos gives the current position
266 | // in the input.
// t is an optional spare thread that add may reuse. The return value is the
// spare thread still available to the caller: t itself if it was not consumed,
// or nil if it was stored in the queue.
267 | func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread {
268 | if pc == 0 {
269 | return t
270 | }
// Sparse-set membership test: pc is present only if its sparse slot points at
// a live dense entry that names pc.
271 | if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
272 | return t
273 | }
274 |
// Reserve the dense slot first (with no thread) so that recursive calls see
// pc as already queued.
275 | j := len(q.dense)
276 | q.dense = q.dense[:j+1]
277 | d := &q.dense[j]
278 | d.t = nil
279 | d.pc = pc
280 | q.sparse[pc] = uint32(j)
281 |
282 | i := &m.p.Inst[pc]
283 | switch i.Op {
284 | default:
285 | panic("unhandled")
286 | case syntax.InstFail:
287 | // nothing
288 | case syntax.InstAlt, syntax.InstAltMatch:
289 | t = m.add(q, i.Out, pos, cap, cond, t)
290 | t = m.add(q, i.Arg, pos, cap, cond, t)
291 | case syntax.InstEmptyWidth:
292 | if syntax.EmptyOp(i.Arg)&^cond == 0 {
293 | t = m.add(q, i.Out, pos, cap, cond, t)
294 | }
295 | case syntax.InstNop:
296 | t = m.add(q, i.Out, pos, cap, cond, t)
297 | case syntax.InstCapture:
298 | if int(i.Arg) < len(cap) {
// Temporarily record pos in the capture slot while following i.Out, then
// restore the previous value. The spare thread is not offered to this call
// and its result is discarded.
299 | opos := cap[i.Arg]
300 | cap[i.Arg] = pos
301 | m.add(q, i.Out, pos, cap, cond, nil)
302 | cap[i.Arg] = opos
303 | } else {
304 | t = m.add(q, i.Out, pos, cap, cond, t)
305 | }
306 | case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
307 | if t == nil {
308 | t = m.alloc(i)
309 | } else {
310 | t.inst = i
311 | }
// Skip the copy when cap already is this thread's own capture array.
312 | if len(cap) > 0 && &t.cap[0] != &cap[0] {
313 | copy(t.cap, cap)
314 | }
315 | d.t = t
316 | t = nil
317 | }
318 | return t
319 | }
320 |
321 | // empty is a non-nil 0-element slice,
322 | // so doExecute can avoid an allocation
323 | // when 0 captures are requested from a successful match.
// Returning it lets callers tell "matched, no captures" apart from the nil
// that doExecute returns for "no match".
324 | var empty = make([]int, 0)
325 |
326 | // doExecute finds the leftmost match in the input and returns
327 | // the position of its subexpressions.
// Exactly one input source is used: r if it is non-nil, otherwise b if it is
// non-nil, otherwise s. The result is nil when there is no match, the shared
// empty slice when ncap is 0, and otherwise a fresh copy of m.matchcap.
// The machine obtained from re.get is handed back with re.put on every path.
328 | func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int {
329 | m := re.get()
330 | var i input
331 | var size int
332 | if r != nil {
333 | i = m.newInputReader(r)
334 | } else if b != nil {
335 | i = m.newInputBytes(b)
336 | size = len(b)
337 | } else {
338 | i = m.newInputString(s)
339 | size = len(s)
340 | }
// Use the backtracker for in-memory inputs shorter than the machine's limit;
// everything else goes through the NFA simulation in match.
341 | if size < m.maxBitStateLen && r == nil {
342 | if m.b == nil {
343 | m.b = newBitState(m.p)
344 | }
345 | if !m.backtrack(i, pos, size, ncap) {
346 | re.put(m)
347 | return nil
348 | }
349 | } else {
350 | m.init(ncap)
351 | if !m.match(i, pos) {
352 | re.put(m)
353 | return nil
354 | }
355 | }
356 | if ncap == 0 {
357 | re.put(m)
358 | return empty // empty but not nil
359 | }
// Copy the captures out before the machine is returned for reuse.
360 | cap := make([]int, len(m.matchcap))
361 | copy(cap, m.matchcap)
362 | re.put(m)
363 | return cap
364 | }
365 |
--------------------------------------------------------------------------------
/regex/onepass.go:
--------------------------------------------------------------------------------
1 | // Copyright 2014 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package regex
6 |
7 | import (
8 | "bytes"
9 | "regexp/syntax"
10 | "sort"
11 | "unicode"
12 | )
13 |
14 | // "One-pass" regexp execution.
15 | // Some regexps can be analyzed to determine that they never need
16 | // backtracking: they are guaranteed to run in one pass over the string
17 | // without bothering to save all the usual NFA state.
18 | // Detect those and execute them more quickly.
19 |
20 | // A onePassProg is a compiled one-pass regular expression program.
21 | // It is the same as syntax.Prog except for the use of onePassInst.
// onePassCopy builds one from a syntax.Prog, copying Start and NumCap as-is.
22 | type onePassProg struct {
23 | Inst []onePassInst
24 | Start int // index of start instruction
25 | NumCap int // number of InstCapture insts in re
26 | }
27 |
28 | // A onePassInst is a single instruction in a one-pass regular expression program.
29 | // It is the same as syntax.Inst except for the new 'Next' field.
// Next holds the target pcs that onePassNext selects from, indexed by the
// position returned by MatchRunePos.
30 | type onePassInst struct {
31 | syntax.Inst
32 | Next []uint32
33 | }
34 |
35 | // onePassPrefix returns a literal string that all matches for the
36 | // regexp must start with. Complete is true if the prefix
37 | // is the entire match. Pc is the index of the last rune instruction
38 | // in the string. The onePassPrefix skips over the mandatory
39 | // EmptyBeginText
// If the program does not start with an EmptyBeginText assertion, or no
// single-rune instruction follows it, the prefix is empty and pc is p.Start.
40 | func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
41 | i := &p.Inst[p.Start]
42 | if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
43 | return "", i.Op == syntax.InstMatch, uint32(p.Start)
44 | }
45 | pc = i.Out
46 | i = &p.Inst[pc]
47 | for i.Op == syntax.InstNop {
48 | pc = i.Out
49 | i = &p.Inst[pc]
50 | }
51 | // Avoid allocation of buffer if prefix is empty.
52 | if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
53 | return "", i.Op == syntax.InstMatch, uint32(p.Start)
54 | }
55 |
56 | // Have prefix; gather characters.
// Case-folding instructions end the prefix because they do not denote a
// single literal rune.
57 | var buf bytes.Buffer
58 | for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
59 | buf.WriteRune(i.Rune[0])
60 | pc, i = i.Out, &p.Inst[i.Out]
61 | }
// NOTE(review): complete is reported only when the instruction after the
// prefix is an empty-width EmptyBeginText assertion; an end-of-text check
// would be the expected condition for "prefix is the entire match" - confirm
// against upstream before relying on complete.
62 | return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
63 | }
64 |
65 | // onePassNext selects the next actionable state of the prog, based on the input character.
66 | // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
67 | // One of the alternates may ultimately lead without input to end of line. If the instruction
68 | // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
// It returns 0 when r matches no rune range and the instruction is not an
// InstAltMatch.
69 | func onePassNext(i *onePassInst, r rune) uint32 {
70 | next := i.MatchRunePos(r)
71 | if next >= 0 {
72 | return i.Next[next]
73 | }
74 | if i.Op == syntax.InstAltMatch {
75 | return i.Out
76 | }
77 | return 0
78 | }
79 |
// iop returns the opcode of i with the specialised rune opcodes (InstRune1,
// InstRuneAny, InstRuneAnyNotNL) folded into the general InstRune; every
// other opcode is returned unchanged.
80 | func iop(i *syntax.Inst) syntax.InstOp {
81 | op := i.Op
82 | switch op {
83 | case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
84 | op = syntax.InstRune
85 | }
86 | return op
87 | }
88 |
89 | // Sparse Array implementation is used as a queueOnePass.
// dense[0:size] lists the inserted values in insertion order, sparse[u] is
// the index of u in dense, and nextIndex is the read cursor used by next.
90 | type queueOnePass struct {
91 | sparse []uint32
92 | dense []uint32
93 | size, nextIndex uint32
94 | }
95 |
// empty reports whether the read cursor has passed every inserted value.
96 | func (q *queueOnePass) empty() bool {
97 | return q.nextIndex >= q.size
98 | }
99 |
// next returns the value under the read cursor and advances the cursor.
// It does not check empty; callers are expected to do so first.
100 | func (q *queueOnePass) next() (n uint32) {
101 | n = q.dense[q.nextIndex]
102 | q.nextIndex++
103 | return
104 | }
105 |
// clear removes all values and rewinds the read cursor.
106 | func (q *queueOnePass) clear() {
107 | q.size = 0
108 | q.nextIndex = 0
109 | }
110 |
// reset rewinds the read cursor while keeping the inserted values.
111 | func (q *queueOnePass) reset() {
112 | q.nextIndex = 0
113 | }
114 |
// contains reports whether u has been inserted. Values outside the range
// covered by sparse are never contained.
115 | func (q *queueOnePass) contains(u uint32) bool {
116 | if u >= uint32(len(q.sparse)) {
117 | return false
118 | }
119 | return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
120 | }
121 |
// insert adds u unless it is already present.
122 | func (q *queueOnePass) insert(u uint32) {
123 | if !q.contains(u) {
124 | q.insertNew(u)
125 | }
126 | }
127 |
// insertNew adds u without checking for duplicates. Values outside the range
// covered by sparse are silently ignored.
128 | func (q *queueOnePass) insertNew(u uint32) {
129 | if u >= uint32(len(q.sparse)) {
130 | return
131 | }
132 | q.sparse[u] = q.size
133 | q.dense[q.size] = u
134 | q.size++
135 | }
136 |
// newQueue returns an empty queueOnePass able to hold the values 0..size-1.
137 | func newQueue(size int) (q *queueOnePass) {
138 | return &queueOnePass{
139 | sparse: make([]uint32, size),
140 | dense: make([]uint32, size),
141 | }
142 | }
143 |
144 | // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
145 | // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
146 | // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
147 | // NextIp array with the single element mergeFailed is returned.
148 | // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
// mergeFailed is the sentinel pc that marks a failed merge.
149 | const mergeFailed = uint32(0xffffffff)
150 |
// noRune and noNext are the shared values returned for a failed merge.
151 | var (
152 | noRune = []rune{}
153 | noNext = []uint32{mergeFailed}
154 | )
155 |
// mergeRuneSets panics if either input has an odd number of runes, since the
// inputs are lists of (lo, hi) pairs. leftPC and rightPC are the targets
// recorded for pairs taken from the left and right set respectively.
156 | func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
157 | leftLen := len(*leftRunes)
158 | rightLen := len(*rightRunes)
159 | if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
160 | panic("mergeRuneSets odd length []rune")
161 | }
162 | var (
163 | lx, rx int
164 | )
165 | merged := make([]rune, 0)
166 | next := make([]uint32, 0)
167 | ok := true
// The results are unnamed, so this deferred closure only reassigns the local
// variables and cannot change what the function returns.
168 | defer func() {
169 | if !ok {
170 | merged = nil
171 | next = nil
172 | }
173 | }()
174 |
// ix is the index of the last rune appended to merged (-1 while empty).
// extend appends the pair at *newLow and advances *newLow past it; it fails
// when that pair starts at or before the end of the previous merged pair.
175 | ix := -1
176 | extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
177 | if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
178 | return false
179 | }
180 | merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
181 | *newLow += 2
182 | ix += 2
183 | next = append(next, pc)
184 | return true
185 | }
186 |
// Standard two-way merge: always take the pair with the smaller low bound.
187 | for lx < leftLen || rx < rightLen {
188 | switch {
189 | case rx >= rightLen:
190 | ok = extend(&lx, leftRunes, leftPC)
191 | case lx >= leftLen:
192 | ok = extend(&rx, rightRunes, rightPC)
193 | case (*rightRunes)[rx] < (*leftRunes)[lx]:
194 | ok = extend(&rx, rightRunes, rightPC)
195 | default:
196 | ok = extend(&lx, leftRunes, leftPC)
197 | }
198 | if !ok {
199 | return noRune, noNext
200 | }
201 | }
202 | return merged, next
203 | }
204 |
205 | // cleanupOnePass drops working memory, and restores certain shortcut instructions.
// Alt, AltMatch and Rune instructions keep their Next tables. Instructions
// that never dispatch on a rune have Next cleared. The specialised rune
// opcodes are replaced by a fresh copy of the original instruction.
206 | func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
207 | for ix, instOriginal := range original.Inst {
208 | switch instOriginal.Op {
209 | case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
210 | case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
211 | prog.Inst[ix].Next = nil
212 | case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
// The element is overwritten on the next line (leaving Next nil), so this
// assignment is redundant.
213 | prog.Inst[ix].Next = nil
214 | prog.Inst[ix] = onePassInst{Inst: instOriginal}
215 | }
216 | }
217 | }
218 |
219 | // onePassCopy creates a copy of the original Prog, as we'll be modifying it
// The instruction slice of the copy is new, so the rewrites below change only
// the Out and Arg fields of the copy.
220 | func onePassCopy(prog *syntax.Prog) *onePassProg {
221 | p := &onePassProg{
222 | Start: prog.Start,
223 | NumCap: prog.NumCap,
224 | }
225 | for _, inst := range prog.Inst {
226 | p.Inst = append(p.Inst, onePassInst{Inst: inst})
227 | }
228 |
229 | // rewrites one or more common Prog constructs that enable some otherwise
230 | // non-onepass Progs to be onepass. A:BC (for example) means an InstAlt at
231 | // ip A, that points to ips B & C.
232 | // A:BC + B:DA => A:BC + B:DC
233 | // A:BC + B:DC => A:DC + B:DC
234 | for pc := range p.Inst {
235 | switch p.Inst[pc].Op {
236 | default:
237 | continue
238 | case syntax.InstAlt, syntax.InstAltMatch:
239 | // A:Bx + B:Ay
// p_A_Alt ends up pointing at the leg of A that targets another Alt (B) and
// p_A_Other at A's remaining leg; the legs are swapped if Arg is not an Alt.
240 | p_A_Other := &p.Inst[pc].Out
241 | p_A_Alt := &p.Inst[pc].Arg
242 | // make sure a target is another Alt
243 | instAlt := p.Inst[*p_A_Alt]
244 | if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
245 | p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
246 | instAlt = p.Inst[*p_A_Alt]
247 | if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
248 | continue
249 | }
250 | }
251 | instOther := p.Inst[*p_A_Other]
252 | // Analyzing both legs pointing to Alts is for another day
253 | if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
254 | // too complicated
255 | continue
256 | }
257 | // simple empty transition loop
258 | // A:BC + B:DA => A:BC + B:DC
// p_B_Alt is the leg of B that points back at A (after the optional swap
// below); it is redirected to A's other target.
259 | p_B_Alt := &p.Inst[*p_A_Alt].Out
260 | p_B_Other := &p.Inst[*p_A_Alt].Arg
261 | patch := false
262 | if instAlt.Out == uint32(pc) {
263 | patch = true
264 | } else if instAlt.Arg == uint32(pc) {
265 | patch = true
266 | p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
267 | }
268 | if patch {
269 | *p_B_Alt = *p_A_Other
270 | }
271 |
272 | // empty transition to common target
273 | // A:BC + B:DC => A:DC + B:DC
274 | if *p_A_Other == *p_B_Alt {
275 | *p_A_Alt = *p_B_Other
276 | }
277 | }
278 | }
279 | return p
280 | }
281 |
282 | // runeSlice exists to permit sorting the case-folded rune sets.
// It implements sort.Interface with ascending rune order.
283 | type runeSlice []rune
284 |
285 | func (p runeSlice) Len() int { return len(p) }
286 | func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
287 | func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
288 |
289 | // Sort is a convenience method.
// It sorts p in place in ascending order.
290 | func (p runeSlice) Sort() {
291 | sort.Sort(p)
292 | }
293 |
// anyRuneNotNL is the list of (lo, hi) rune pairs covering every rune except
// '\n'; anyRune is the single pair covering every rune.
294 | var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
295 | var anyRune = []rune{0, unicode.MaxRune}
296 |
297 | // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
298 | // the match engine can always tell which branch to take. The routine may modify
299 | // p if it is turned into a onepass Prog. If it isn't possible for this to be a
300 | // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
301 | // to the size of the Prog.
302 | func makeOnePass(p *onePassProg) *onePassProg {
303 | // If the machine is very long, it's not worth the time to check if we can use one pass.
304 | if len(p.Inst) >= 1000 {
305 | return notOnePass
306 | }
307 |
308 | var (
309 | instQueue = newQueue(len(p.Inst))
310 | visitQueue = newQueue(len(p.Inst))
311 | build func(uint32, *queueOnePass)
312 | check func(uint32, map[uint32]bool) bool
313 | onePassRunes = make([][]rune, len(p.Inst))
314 | )
315 | build = func(pc uint32, q *queueOnePass) {
316 | if q.contains(pc) {
317 | return
318 | }
319 | inst := p.Inst[pc]
320 | switch inst.Op {
321 | case syntax.InstAlt, syntax.InstAltMatch:
322 | q.insert(inst.Out)
323 | build(inst.Out, q)
324 | q.insert(inst.Arg)
325 | case syntax.InstMatch, syntax.InstFail:
326 | default:
327 | q.insert(inst.Out)
328 | }
329 | }
330 |
331 | // check that paths from Alt instructions are unambiguous, and rebuild the new
332 | // program as a onepass program
333 | check = func(pc uint32, m map[uint32]bool) (ok bool) {
334 | ok = true
335 | inst := &p.Inst[pc]
336 | if visitQueue.contains(pc) {
337 | return
338 | }
339 | visitQueue.insert(pc)
340 | switch inst.Op {
341 | case syntax.InstAlt, syntax.InstAltMatch:
342 | ok = check(inst.Out, m) && check(inst.Arg, m)
343 | // check no-input paths to InstMatch
344 | matchOut := m[inst.Out]
345 | matchArg := m[inst.Arg]
346 | if matchOut && matchArg {
347 | ok = false
348 | break
349 | }
350 | // Match on empty goes in inst.Out
351 | if matchArg {
352 | inst.Out, inst.Arg = inst.Arg, inst.Out
353 | matchOut, matchArg = matchArg, matchOut
354 | }
355 | if matchOut {
356 | m[pc] = true
357 | inst.Op = syntax.InstAltMatch
358 | }
359 |
360 | // build a dispatch operator from the two legs of the alt.
361 | onePassRunes[pc], inst.Next = mergeRuneSets(
362 | &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
363 | if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
364 | ok = false
365 | break
366 | }
367 | case syntax.InstCapture, syntax.InstNop:
368 | ok = check(inst.Out, m)
369 | m[pc] = m[inst.Out]
370 | // pass matching runes back through these no-ops.
371 | onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
372 | inst.Next = []uint32{}
373 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
374 | inst.Next = append(inst.Next, inst.Out)
375 | }
376 | case syntax.InstEmptyWidth:
377 | ok = check(inst.Out, m)
378 | m[pc] = m[inst.Out]
379 | onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
380 | inst.Next = []uint32{}
381 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
382 | inst.Next = append(inst.Next, inst.Out)
383 | }
384 | case syntax.InstMatch, syntax.InstFail:
385 | m[pc] = inst.Op == syntax.InstMatch
386 | break
387 | case syntax.InstRune:
388 | ok = check(inst.Out, m)
389 | m[pc] = false
390 | if len(inst.Next) > 0 {
391 | break
392 | }
393 | if len(inst.Rune) == 0 {
394 | onePassRunes[pc] = []rune{}
395 | inst.Next = []uint32{inst.Out}
396 | break
397 | }
398 | runes := make([]rune, 0)
399 | if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
400 | r0 := inst.Rune[0]
401 | runes = append(runes, r0, r0)
402 | for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
403 | runes = append(runes, r1, r1)
404 | }
405 | sort.Sort(runeSlice(runes))
406 | } else {
407 | runes = append(runes, inst.Rune...)
408 | }
409 | onePassRunes[pc] = runes
410 | inst.Next = []uint32{}
411 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
412 | inst.Next = append(inst.Next, inst.Out)
413 | }
414 | inst.Op = syntax.InstRune
415 | case syntax.InstRune1:
416 | ok = check(inst.Out, m)
417 | m[pc] = false
418 | if len(inst.Next) > 0 {
419 | break
420 | }
421 | runes := []rune{}
422 | // expand case-folded runes
423 | if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
424 | r0 := inst.Rune[0]
425 | runes = append(runes, r0, r0)
426 | for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
427 | runes = append(runes, r1, r1)
428 | }
429 | sort.Sort(runeSlice(runes))
430 | } else {
431 | runes = append(runes, inst.Rune[0], inst.Rune[0])
432 | }
433 | onePassRunes[pc] = runes
434 | inst.Next = []uint32{}
435 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
436 | inst.Next = append(inst.Next, inst.Out)
437 | }
438 | inst.Op = syntax.InstRune
439 | case syntax.InstRuneAny:
440 | ok = check(inst.Out, m)
441 | m[pc] = false
442 | if len(inst.Next) > 0 {
443 | break
444 | }
445 | onePassRunes[pc] = append([]rune{}, anyRune...)
446 | inst.Next = []uint32{inst.Out}
447 | case syntax.InstRuneAnyNotNL:
448 | ok = check(inst.Out, m)
449 | m[pc] = false
450 | if len(inst.Next) > 0 {
451 | break
452 | }
453 | onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
454 | inst.Next = []uint32{}
455 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
456 | inst.Next = append(inst.Next, inst.Out)
457 | }
458 | }
459 | return
460 | }
461 |
462 | instQueue.clear()
463 | instQueue.insert(uint32(p.Start))
464 | m := make(map[uint32]bool, len(p.Inst))
465 | for !instQueue.empty() {
466 | pc := instQueue.next()
467 | inst := p.Inst[pc]
468 | visitQueue.clear()
469 | if !check(uint32(pc), m) {
470 | p = notOnePass
471 | break
472 | }
473 | switch inst.Op {
474 | case syntax.InstAlt, syntax.InstAltMatch:
475 | instQueue.insert(inst.Out)
476 | instQueue.insert(inst.Arg)
477 | case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
478 | instQueue.insert(inst.Out)
479 | case syntax.InstMatch:
480 | case syntax.InstFail:
481 | case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
482 | default:
483 | }
484 | }
485 | if p != notOnePass {
486 | for i := range p.Inst {
487 | p.Inst[i].Rune = onePassRunes[i]
488 | }
489 | }
490 | return p
491 | }
492 |
493 | // walk visits each Inst in the prog once, and applies the argument
494 | // function(ip, next), in pre-order.
495 | func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
496 | var walk1 func(uint32)
497 | progQueue := newQueue(len(prog.Inst))
498 | walk1 = func(ip uint32) {
499 | if progQueue.contains(ip) {
500 | return
501 | }
502 | progQueue.insert(ip)
503 | inst := prog.Inst[ip]
504 | switch inst.Op {
505 | case syntax.InstAlt, syntax.InstAltMatch:
506 | for _, f := range funcs {
507 | f(ip, inst.Out)
508 | f(ip, inst.Arg)
509 | }
510 | walk1(inst.Out)
511 | walk1(inst.Arg)
512 | default:
513 | for _, f := range funcs {
514 | f(ip, inst.Out)
515 | }
516 | walk1(inst.Out)
517 | }
518 | }
519 | walk1(uint32(prog.Start))
520 | }
521 |
// find returns, in ascending order, the index of every Inst in prog for which
// f(prog, index) reports true. The result is empty but never nil when nothing
// matches.
func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
	matches = make([]uint32, 0)

	n := len(prog.Inst)
	for ip := 0; ip < n; ip++ {
		if !f(prog, ip) {
			continue
		}
		matches = append(matches, uint32(ip))
	}
	return matches
}
533 |
534 | var notOnePass *onePassProg = nil
535 |
536 | // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
537 | // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
538 | // Prog cannot be converted. For a one pass prog, the fundamental condition that must
539 | // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
540 | func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
541 | if prog.Start == 0 {
542 | return notOnePass
543 | }
544 | // onepass regexp is anchored
545 | if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
546 | syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
547 | return notOnePass
548 | }
549 | // every instruction leading to InstMatch must be EmptyEndText
550 | for _, inst := range prog.Inst {
551 | opOut := prog.Inst[inst.Out].Op
552 | switch inst.Op {
553 | default:
554 | if opOut == syntax.InstMatch {
555 | return notOnePass
556 | }
557 | case syntax.InstAlt, syntax.InstAltMatch:
558 | if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
559 | return notOnePass
560 | }
561 | case syntax.InstEmptyWidth:
562 | if opOut == syntax.InstMatch {
563 | if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
564 | continue
565 | }
566 | return notOnePass
567 | }
568 | }
569 | }
570 | // Creates a slightly optimized copy of the original Prog
571 | // that cleans up some Prog idioms that block valid onepass programs
572 | p = onePassCopy(prog)
573 |
574 | // checkAmbiguity on InstAlts, build onepass Prog if possible
575 | p = makeOnePass(p)
576 |
577 | if p != notOnePass {
578 | cleanupOnePass(p, prog)
579 | }
580 | return p
581 | }
582 |
--------------------------------------------------------------------------------
/regex/regexp.go:
--------------------------------------------------------------------------------
1 | // Copyright 2009 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Package regex is a fork of the standard library regexp package.
6 | // It contains a few small additions that make it possible to
7 | // interact more directly with the underlying DFA machinery.
8 | //
9 | // Package regexp implements regular expression search.
10 | //
11 | // The syntax of the regular expressions accepted is the same
12 | // general syntax used by Perl, Python, and other languages.
13 | // More precisely, it is the syntax accepted by RE2 and described at
14 | // https://golang.org/s/re2syntax, except for \C.
15 | // For an overview of the syntax, run
16 | // go doc regexp/syntax
17 | //
18 | // The regexp implementation provided by this package is
19 | // guaranteed to run in time linear in the size of the input.
20 | // (This is a property not guaranteed by most open source
21 | // implementations of regular expressions.) For more information
22 | // about this property, see
23 | // http://swtch.com/~rsc/regexp/regexp1.html
24 | // or any book about automata theory.
25 | //
26 | // All characters are UTF-8-encoded code points.
27 | //
28 | // There are 16 methods of Regexp that match a regular expression and identify
29 | // the matched text. Their names are matched by this regular expression:
30 | //
31 | // Find(All)?(String)?(Submatch)?(Index)?
32 | //
33 | // If 'All' is present, the routine matches successive non-overlapping
34 | // matches of the entire expression. Empty matches abutting a preceding
35 | // match are ignored. The return value is a slice containing the successive
36 | // return values of the corresponding non-'All' routine. These routines take
37 | // an extra integer argument, n; if n >= 0, the function returns at most n
38 | // matches/submatches.
39 | //
40 | // If 'String' is present, the argument is a string; otherwise it is a slice
41 | // of bytes; return values are adjusted as appropriate.
42 | //
43 | // If 'Submatch' is present, the return value is a slice identifying the
44 | // successive submatches of the expression. Submatches are matches of
45 | // parenthesized subexpressions (also known as capturing groups) within the
46 | // regular expression, numbered from left to right in order of opening
47 | // parenthesis. Submatch 0 is the match of the entire expression, submatch 1
48 | // the match of the first parenthesized subexpression, and so on.
49 | //
50 | // If 'Index' is present, matches and submatches are identified by byte index
51 | // pairs within the input string: result[2*n:2*n+2] identifies the indexes of
52 | // the nth submatch. The pair for n==0 identifies the match of the entire
53 | // expression. If 'Index' is not present, the match is identified by the
54 | // text of the match/submatch. If an index is negative, it means that
55 | // subexpression did not match any string in the input.
56 | //
57 | // There is also a subset of the methods that can be applied to text read
58 | // from a RuneReader:
59 | //
60 | // MatchReader, FindReaderIndex, FindReaderSubmatchIndex
61 | //
62 | // This set may grow. Note that regular expression matches may need to
63 | // examine text beyond the text returned by a match, so the methods that
64 | // match text from a RuneReader may read arbitrarily far into the input
65 | // before returning.
66 | //
67 | // (There are a few other methods that do not match this pattern.)
68 | //
69 | package regex
70 |
71 | import (
72 | "bytes"
73 | "io"
74 | "regexp/syntax"
75 | "strconv"
76 | "strings"
77 | "sync"
78 | "unicode"
79 | "unicode/utf8"
80 | )
81 |
// debug is a package-level switch, off by default.
// NOTE(review): nothing in this part of the file reads it; presumably it
// gates diagnostic output elsewhere in the package — confirm.
var debug = false
83 |
// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// read-only after Compile
	expr           string         // source text, as returned by String
	prog           *syntax.Prog   // compiled program
	onepass        *onePassProg   // onepass program or nil
	prefix         string         // required prefix in unanchored matches
	prefixBytes    []byte         // prefix, as a []byte
	prefixComplete bool           // prefix is the entire regexp
	prefixRune     rune           // first rune in prefix
	prefixEnd      uint32         // pc for last rune in prefix
	cond           syntax.EmptyOp // empty-width conditions required at start of match
	numSubexp      int            // number of capturing groups, as returned by NumSubexp
	subexpNames    []string       // capture group names, as returned by SubexpNames
	longest        bool           // prefer leftmost-longest matches; set at compile time or by Longest

	// cache of machines for running regexp
	mu      sync.Mutex // guards machine
	machine []*machine // idle machines, used as a stack by get and put
}
105 |
// String returns the source text used to compile the regular expression.
// For a Regexp built by CompileSyntax, which has no pattern string, this is
// the text produced by calling String on the syntax tree.
func (re *Regexp) String() string {
	return re.expr
}
110 |
// Compile parses a regular expression and returns, if successful,
// a Regexp object that can be used to match against text.
//
// When matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses the one that a backtracking search would have found first.
// This so-called leftmost-first matching is the same semantics
// that Perl, Python, and other implementations use, although this
// package implements it without the expense of backtracking.
// For POSIX leftmost-longest matching, see CompilePOSIX.
//
// expr is parsed with the syntax.Perl flags. If parsing or compilation
// fails, the returned Regexp is nil and the error is non-nil.
func Compile(expr string) (*Regexp, error) {
	return compile(expr, syntax.Perl, false)
}
124 |
125 | // CompileSyntax is like Compile but takes a syntax tree as input.
126 | func CompileSyntax(ast *syntax.Regexp) (*Regexp, error) {
127 | return compileSyntax(ast, ast.String(), true)
128 | }
129 |
// CompilePOSIX is like Compile but restricts the regular expression
// to POSIX ERE (egrep) syntax and changes the match semantics to
// leftmost-longest.
//
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
// This so-called leftmost-longest matching is the same semantics
// that early regular expression implementations used and that POSIX
// specifies.
//
// However, there can be multiple leftmost-longest matches, with different
// submatch choices, and here this package diverges from POSIX.
// Among the possible leftmost-longest matches, this package chooses
// the one that a backtracking search would have found first, while POSIX
// specifies that the match be chosen to maximize the length of the first
// subexpression, then the second, and so on from left to right.
// The POSIX rule is computationally prohibitive and not even well-defined.
// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
//
// expr is parsed with the syntax.POSIX flags. If parsing or compilation
// fails, the returned Regexp is nil and the error is non-nil.
func CompilePOSIX(expr string) (*Regexp, error) {
	return compile(expr, syntax.POSIX, true)
}
152 |
// Longest makes future searches prefer the leftmost-longest match.
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
//
// NOTE(review): the field is written without taking re.mu, so presumably
// Longest must be called before re is shared between goroutines — confirm.
func (re *Regexp) Longest() {
	re.longest = true
}
160 |
161 | func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
162 | re, err := syntax.Parse(expr, mode)
163 | if err != nil {
164 | return nil, err
165 | }
166 | return compileSyntax(re, expr, longest)
167 | }
168 |
169 | func compileSyntax(re *syntax.Regexp, expr string, longest bool) (*Regexp, error) {
170 | maxCap := re.MaxCap()
171 | capNames := re.CapNames()
172 |
173 | re = re.Simplify()
174 | prog, err := syntax.Compile(re)
175 | if err != nil {
176 | return nil, err
177 | }
178 | regexp := &Regexp{
179 | expr: expr,
180 | prog: prog,
181 | onepass: compileOnePass(prog),
182 | numSubexp: maxCap,
183 | subexpNames: capNames,
184 | cond: prog.StartCond(),
185 | longest: longest,
186 | }
187 | if regexp.onepass == notOnePass {
188 | regexp.prefix, regexp.prefixComplete = prog.Prefix()
189 | } else {
190 | regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
191 | }
192 | if regexp.prefix != "" {
193 | // TODO(rsc): Remove this allocation by adding
194 | // IndexString to package bytes.
195 | regexp.prefixBytes = []byte(regexp.prefix)
196 | regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
197 | }
198 | return regexp, nil
199 | }
200 |
201 | // get returns a machine to use for matching re.
202 | // It uses the re's machine cache if possible, to avoid
203 | // unnecessary allocation.
204 | func (re *Regexp) get() *machine {
205 | re.mu.Lock()
206 | if n := len(re.machine); n > 0 {
207 | z := re.machine[n-1]
208 | re.machine = re.machine[:n-1]
209 | re.mu.Unlock()
210 | return z
211 | }
212 | re.mu.Unlock()
213 | z := progMachine(re.prog, re.onepass)
214 | z.re = re
215 | return z
216 | }
217 |
218 | // put returns a machine to the re's machine cache.
219 | // There is no attempt to limit the size of the cache, so it will
220 | // grow to the maximum number of simultaneous matches
221 | // run using re. (The cache empties when re gets garbage collected.)
222 | func (re *Regexp) put(z *machine) {
223 | re.mu.Lock()
224 | re.machine = append(re.machine, z)
225 | re.mu.Unlock()
226 | }
227 |
228 | // MustCompile is like Compile but panics if the expression cannot be parsed.
229 | // It simplifies safe initialization of global variables holding compiled regular
230 | // expressions.
231 | func MustCompile(str string) *Regexp {
232 | regexp, error := Compile(str)
233 | if error != nil {
234 | panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
235 | }
236 | return regexp
237 | }
238 |
239 | // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
240 | // It simplifies safe initialization of global variables holding compiled regular
241 | // expressions.
242 | func MustCompilePOSIX(str string) *Regexp {
243 | regexp, error := CompilePOSIX(str)
244 | if error != nil {
245 | panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
246 | }
247 | return regexp
248 | }
249 |
// quote returns s as a Go string literal: back-quoted when s can be written
// as a raw string, otherwise double-quoted with escapes.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
256 |
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
// The count is the one recorded when the expression was compiled.
func (re *Regexp) NumSubexp() int {
	return re.numSubexp
}
261 |
// SubexpNames returns the names of the parenthesized subexpressions
// in this Regexp. The name for the first sub-expression is names[1],
// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
// Since the Regexp as a whole cannot be named, names[0] is always
// the empty string. The slice should not be modified: it is the Regexp's
// own storage, not a copy.
func (re *Regexp) SubexpNames() []string {
	return re.subexpNames
}
270 |
// endOfText is the rune value the input implementations return from step,
// together with width 0, when no further rune is available.
const endOfText rune = -1
272 |
// input abstracts different representations of the input text. It provides
// one-character lookahead.
type input interface {
	// step returns the rune starting at byte offset pos and its width in
	// bytes, or (endOfText, 0) when no rune is available there.
	step(pos int) (r rune, width int) // advance one rune
	canCheckPrefix() bool             // can we look ahead without losing info?
	// hasPrefix reports whether the input begins with re's literal prefix.
	hasPrefix(re *Regexp) bool
	// index returns the offset, relative to pos, of the first occurrence of
	// re's literal prefix at or after pos, or -1 if there is none.
	index(re *Regexp, pos int) int
	// context returns the empty-width conditions that hold at pos, computed
	// from the runes on either side of it.
	context(pos int) syntax.EmptyOp
}
282 |
283 | // inputString scans a string.
284 | type inputString struct {
285 | str string
286 | }
287 |
288 | func (i *inputString) step(pos int) (rune, int) {
289 | if pos < len(i.str) {
290 | c := i.str[pos]
291 | if c < utf8.RuneSelf {
292 | return rune(c), 1
293 | }
294 | return utf8.DecodeRuneInString(i.str[pos:])
295 | }
296 | return endOfText, 0
297 | }
298 |
299 | func (i *inputString) canCheckPrefix() bool {
300 | return true
301 | }
302 |
303 | func (i *inputString) hasPrefix(re *Regexp) bool {
304 | return strings.HasPrefix(i.str, re.prefix)
305 | }
306 |
307 | func (i *inputString) index(re *Regexp, pos int) int {
308 | return strings.Index(i.str[pos:], re.prefix)
309 | }
310 |
311 | func (i *inputString) context(pos int) syntax.EmptyOp {
312 | r1, r2 := endOfText, endOfText
313 | if pos > 0 && pos <= len(i.str) {
314 | r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
315 | }
316 | if pos < len(i.str) {
317 | r2, _ = utf8.DecodeRuneInString(i.str[pos:])
318 | }
319 | return syntax.EmptyOpContext(r1, r2)
320 | }
321 |
322 | // inputBytes scans a byte slice.
323 | type inputBytes struct {
324 | str []byte
325 | }
326 |
327 | func (i *inputBytes) step(pos int) (rune, int) {
328 | if pos < len(i.str) {
329 | c := i.str[pos]
330 | if c < utf8.RuneSelf {
331 | return rune(c), 1
332 | }
333 | return utf8.DecodeRune(i.str[pos:])
334 | }
335 | return endOfText, 0
336 | }
337 |
338 | func (i *inputBytes) canCheckPrefix() bool {
339 | return true
340 | }
341 |
342 | func (i *inputBytes) hasPrefix(re *Regexp) bool {
343 | return bytes.HasPrefix(i.str, re.prefixBytes)
344 | }
345 |
346 | func (i *inputBytes) index(re *Regexp, pos int) int {
347 | return bytes.Index(i.str[pos:], re.prefixBytes)
348 | }
349 |
350 | func (i *inputBytes) context(pos int) syntax.EmptyOp {
351 | r1, r2 := endOfText, endOfText
352 | if pos > 0 && pos <= len(i.str) {
353 | r1, _ = utf8.DecodeLastRune(i.str[:pos])
354 | }
355 | if pos < len(i.str) {
356 | r2, _ = utf8.DecodeRune(i.str[pos:])
357 | }
358 | return syntax.EmptyOpContext(r1, r2)
359 | }
360 |
361 | // inputReader scans a RuneReader.
362 | type inputReader struct {
363 | r io.RuneReader
364 | atEOT bool
365 | pos int
366 | }
367 |
368 | func (i *inputReader) step(pos int) (rune, int) {
369 | if !i.atEOT && pos != i.pos {
370 | return endOfText, 0
371 |
372 | }
373 | r, w, err := i.r.ReadRune()
374 | if err != nil {
375 | i.atEOT = true
376 | return endOfText, 0
377 | }
378 | i.pos += w
379 | return r, w
380 | }
381 |
382 | func (i *inputReader) canCheckPrefix() bool {
383 | return false
384 | }
385 |
386 | func (i *inputReader) hasPrefix(re *Regexp) bool {
387 | return false
388 | }
389 |
390 | func (i *inputReader) index(re *Regexp, pos int) int {
391 | return -1
392 | }
393 |
394 | func (i *inputReader) context(pos int) syntax.EmptyOp {
395 | return 0
396 | }
397 |
// LiteralPrefix returns a literal string that must begin any match
// of the regular expression re. It returns the boolean true if the
// literal string comprises the entire regular expression.
// Both values are the ones computed when re was compiled.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
	return re.prefix, re.prefixComplete
}
404 |
405 | // MatchReader reports whether the Regexp matches the text read by the
406 | // RuneReader.
407 | func (re *Regexp) MatchReader(r io.RuneReader) bool {
408 | return re.doExecute(r, nil, "", 0, 0) != nil
409 | }
410 |
411 | // MatchString reports whether the Regexp matches the string s.
412 | func (re *Regexp) MatchString(s string) bool {
413 | return re.doExecute(nil, nil, s, 0, 0) != nil
414 | }
415 |
416 | // Match reports whether the Regexp matches the byte slice b.
417 | func (re *Regexp) Match(b []byte) bool {
418 | return re.doExecute(nil, b, "", 0, 0) != nil
419 | }
420 |
421 | // MatchReader checks whether a textual regular expression matches the text
422 | // read by the RuneReader. More complicated queries need to use Compile and
423 | // the full Regexp interface.
424 | func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
425 | re, err := Compile(pattern)
426 | if err != nil {
427 | return false, err
428 | }
429 | return re.MatchReader(r), nil
430 | }
431 |
432 | // MatchString checks whether a textual regular expression
433 | // matches a string. More complicated queries need
434 | // to use Compile and the full Regexp interface.
435 | func MatchString(pattern string, s string) (matched bool, err error) {
436 | re, err := Compile(pattern)
437 | if err != nil {
438 | return false, err
439 | }
440 | return re.MatchString(s), nil
441 | }
442 |
443 | // Match checks whether a textual regular expression
444 | // matches a byte slice. More complicated queries need
445 | // to use Compile and the full Regexp interface.
446 | func Match(pattern string, b []byte) (matched bool, err error) {
447 | re, err := Compile(pattern)
448 | if err != nil {
449 | return false, err
450 | }
451 | return re.Match(b), nil
452 | }
453 |
454 | // ReplaceAllString returns a copy of src, replacing matches of the Regexp
455 | // with the replacement string repl. Inside repl, $ signs are interpreted as
456 | // in Expand, so for instance $1 represents the text of the first submatch.
457 | func (re *Regexp) ReplaceAllString(src, repl string) string {
458 | n := 2
459 | if strings.Index(repl, "$") >= 0 {
460 | n = 2 * (re.numSubexp + 1)
461 | }
462 | b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
463 | return re.expand(dst, repl, nil, src, match)
464 | })
465 | return string(b)
466 | }
467 |
468 | // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
469 | // with the replacement string repl. The replacement repl is substituted directly,
470 | // without using Expand.
471 | func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
472 | return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
473 | return append(dst, repl...)
474 | }))
475 | }
476 |
477 | // ReplaceAllStringFunc returns a copy of src in which all matches of the
478 | // Regexp have been replaced by the return value of function repl applied
479 | // to the matched substring. The replacement returned by repl is substituted
480 | // directly, without using Expand.
481 | func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
482 | b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
483 | return append(dst, repl(src[match[0]:match[1]])...)
484 | })
485 | return string(b)
486 | }
487 |
488 | func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
489 | lastMatchEnd := 0 // end position of the most recent match
490 | searchPos := 0 // position where we next look for a match
491 | var buf []byte
492 | var endPos int
493 | if bsrc != nil {
494 | endPos = len(bsrc)
495 | } else {
496 | endPos = len(src)
497 | }
498 | for searchPos <= endPos {
499 | a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
500 | if len(a) == 0 {
501 | break // no more matches
502 | }
503 |
504 | // Copy the unmatched characters before this match.
505 | if bsrc != nil {
506 | buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
507 | } else {
508 | buf = append(buf, src[lastMatchEnd:a[0]]...)
509 | }
510 |
511 | // Now insert a copy of the replacement string, but not for a
512 | // match of the empty string immediately after another match.
513 | // (Otherwise, we get double replacement for patterns that
514 | // match both empty and nonempty strings.)
515 | if a[1] > lastMatchEnd || a[0] == 0 {
516 | buf = repl(buf, a)
517 | }
518 | lastMatchEnd = a[1]
519 |
520 | // Advance past this match; always advance at least one character.
521 | var width int
522 | if bsrc != nil {
523 | _, width = utf8.DecodeRune(bsrc[searchPos:])
524 | } else {
525 | _, width = utf8.DecodeRuneInString(src[searchPos:])
526 | }
527 | if searchPos+width > a[1] {
528 | searchPos += width
529 | } else if searchPos+1 > a[1] {
530 | // This clause is only needed at the end of the input
531 | // string. In that case, DecodeRuneInString returns width=0.
532 | searchPos++
533 | } else {
534 | searchPos = a[1]
535 | }
536 | }
537 |
538 | // Copy the unmatched characters after the last match.
539 | if bsrc != nil {
540 | buf = append(buf, bsrc[lastMatchEnd:]...)
541 | } else {
542 | buf = append(buf, src[lastMatchEnd:]...)
543 | }
544 |
545 | return buf
546 | }
547 |
548 | // ReplaceAll returns a copy of src, replacing matches of the Regexp
549 | // with the replacement text repl. Inside repl, $ signs are interpreted as
550 | // in Expand, so for instance $1 represents the text of the first submatch.
551 | func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
552 | n := 2
553 | if bytes.IndexByte(repl, '$') >= 0 {
554 | n = 2 * (re.numSubexp + 1)
555 | }
556 | srepl := ""
557 | b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
558 | if len(srepl) != len(repl) {
559 | srepl = string(repl)
560 | }
561 | return re.expand(dst, srepl, src, "", match)
562 | })
563 | return b
564 | }
565 |
566 | // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
567 | // with the replacement bytes repl. The replacement repl is substituted directly,
568 | // without using Expand.
569 | func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
570 | return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
571 | return append(dst, repl...)
572 | })
573 | }
574 |
575 | // ReplaceAllFunc returns a copy of src in which all matches of the
576 | // Regexp have been replaced by the return value of function repl applied
577 | // to the matched byte slice. The replacement returned by repl is substituted
578 | // directly, without using Expand.
579 | func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
580 | return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
581 | return append(dst, repl(src[match[0]:match[1]])...)
582 | })
583 | }
584 |
// specialBytes lists the bytes that QuoteMeta escapes.
var specialBytes = []byte(`\.+*?()|[]{}^$`)

// special reports whether b is one of the bytes in specialBytes.
func special(b byte) bool {
	return bytes.IndexByte(specialBytes, b) != -1
}

// QuoteMeta returns a string that quotes all regular expression metacharacters
// inside the argument text; the returned string is a regular expression matching
// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
func QuoteMeta(s string) string {
	// In the worst case every byte needs a backslash in front of it.
	quoted := make([]byte, 0, 2*len(s))

	// A byte loop is correct because all metacharacters are ASCII.
	for i := 0; i < len(s); i++ {
		c := s[i]
		if special(c) {
			quoted = append(quoted, '\\')
		}
		quoted = append(quoted, c)
	}
	return string(quoted)
}
609 |
610 | // The number of capture values in the program may correspond
611 | // to fewer capturing expressions than are in the regexp.
612 | // For example, "(a){0}" turns into an empty program, so the
613 | // maximum capture in the program is 0 but we need to return
614 | // an expression for \1. Pad appends -1s to the slice a as needed.
615 | func (re *Regexp) pad(a []int) []int {
616 | if a == nil {
617 | // No match.
618 | return nil
619 | }
620 | n := (1 + re.numSubexp) * 2
621 | for len(a) < n {
622 | a = append(a, -1)
623 | }
624 | return a
625 | }
626 |
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
//
// Each accepted match is padded with re.pad and handed to deliver. At most n
// matches are delivered. An empty match that starts exactly where the
// previous match ended is skipped rather than delivered.
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
	// end is the length of whichever input is in use.
	var end int
	if b == nil {
		end = len(s)
	} else {
		end = len(b)
	}

	// pos is where the next search starts, i counts delivered matches and
	// prevMatchEnd is the end offset of the previous match (-1 before any).
	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
		matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
		if len(matches) == 0 {
			break
		}

		accept := true
		if matches[1] == pos {
			// We've found an empty match.
			if matches[0] == prevMatchEnd {
				// We don't allow an empty match right
				// after a previous match, so ignore it.
				accept = false
			}
			// Step over one rune so the next search makes progress.
			var width int
			// TODO: use step()
			if b == nil {
				_, width = utf8.DecodeRuneInString(s[pos:end])
			} else {
				_, width = utf8.DecodeRune(b[pos:end])
			}
			if width > 0 {
				pos += width
			} else {
				// Nothing left to decode: move past end so the loop
				// condition pos <= end fails.
				pos = end + 1
			}
		} else {
			pos = matches[1]
		}
		prevMatchEnd = matches[1]

		if accept {
			deliver(re.pad(matches))
			i++
		}
	}
}
673 |
674 | // Find returns a slice holding the text of the leftmost match in b of the regular expression.
675 | // A return value of nil indicates no match.
676 | func (re *Regexp) Find(b []byte) []byte {
677 | a := re.doExecute(nil, b, "", 0, 2)
678 | if a == nil {
679 | return nil
680 | }
681 | return b[a[0]:a[1]]
682 | }
683 |
684 | // FindIndex returns a two-element slice of integers defining the location of
685 | // the leftmost match in b of the regular expression. The match itself is at
686 | // b[loc[0]:loc[1]].
687 | // A return value of nil indicates no match.
688 | func (re *Regexp) FindIndex(b []byte) (loc []int) {
689 | a := re.doExecute(nil, b, "", 0, 2)
690 | if a == nil {
691 | return nil
692 | }
693 | return a[0:2]
694 | }
695 |
696 | // FindString returns a string holding the text of the leftmost match in s of the regular
697 | // expression. If there is no match, the return value is an empty string,
698 | // but it will also be empty if the regular expression successfully matches
699 | // an empty string. Use FindStringIndex or FindStringSubmatch if it is
700 | // necessary to distinguish these cases.
701 | func (re *Regexp) FindString(s string) string {
702 | a := re.doExecute(nil, nil, s, 0, 2)
703 | if a == nil {
704 | return ""
705 | }
706 | return s[a[0]:a[1]]
707 | }
708 |
709 | // FindStringIndex returns a two-element slice of integers defining the
710 | // location of the leftmost match in s of the regular expression. The match
711 | // itself is at s[loc[0]:loc[1]].
712 | // A return value of nil indicates no match.
713 | func (re *Regexp) FindStringIndex(s string) (loc []int) {
714 | a := re.doExecute(nil, nil, s, 0, 2)
715 | if a == nil {
716 | return nil
717 | }
718 | return a[0:2]
719 | }
720 |
721 | // FindReaderIndex returns a two-element slice of integers defining the
722 | // location of the leftmost match of the regular expression in text read from
723 | // the RuneReader. The match text was found in the input stream at
724 | // byte offset loc[0] through loc[1]-1.
725 | // A return value of nil indicates no match.
726 | func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
727 | a := re.doExecute(r, nil, "", 0, 2)
728 | if a == nil {
729 | return nil
730 | }
731 | return a[0:2]
732 | }
733 |
734 | // FindSubmatch returns a slice of slices holding the text of the leftmost
735 | // match of the regular expression in b and the matches, if any, of its
736 | // subexpressions, as defined by the 'Submatch' descriptions in the package
737 | // comment.
738 | // A return value of nil indicates no match.
739 | func (re *Regexp) FindSubmatch(b []byte) [][]byte {
740 | a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
741 | if a == nil {
742 | return nil
743 | }
744 | ret := make([][]byte, 1+re.numSubexp)
745 | for i := range ret {
746 | if 2*i < len(a) && a[2*i] >= 0 {
747 | ret[i] = b[a[2*i]:a[2*i+1]]
748 | }
749 | }
750 | return ret
751 | }
752 |
753 | // Expand appends template to dst and returns the result; during the
754 | // append, Expand replaces variables in the template with corresponding
755 | // matches drawn from src. The match slice should have been returned by
756 | // FindSubmatchIndex.
757 | //
758 | // In the template, a variable is denoted by a substring of the form
759 | // $name or ${name}, where name is a non-empty sequence of letters,
760 | // digits, and underscores. A purely numeric name like $1 refers to
761 | // the submatch with the corresponding index; other names refer to
762 | // capturing parentheses named with the (?P...) syntax. A
763 | // reference to an out of range or unmatched index or a name that is not
764 | // present in the regular expression is replaced with an empty slice.
765 | //
766 | // In the $name form, name is taken to be as long as possible: $1x is
767 | // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
768 | //
769 | // To insert a literal $ in the output, use $$ in the template.
770 | func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
771 | return re.expand(dst, string(template), src, "", match)
772 | }
773 |
774 | // ExpandString is like Expand but the template and source are strings.
775 | // It appends to and returns a byte slice in order to give the calling
776 | // code control over allocation.
777 | func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
778 | return re.expand(dst, template, nil, src, match)
779 | }
780 |
781 | func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
782 | for len(template) > 0 {
783 | i := strings.Index(template, "$")
784 | if i < 0 {
785 | break
786 | }
787 | dst = append(dst, template[:i]...)
788 | template = template[i:]
789 | if len(template) > 1 && template[1] == '$' {
790 | // Treat $$ as $.
791 | dst = append(dst, '$')
792 | template = template[2:]
793 | continue
794 | }
795 | name, num, rest, ok := extract(template)
796 | if !ok {
797 | // Malformed; treat $ as raw text.
798 | dst = append(dst, '$')
799 | template = template[1:]
800 | continue
801 | }
802 | template = rest
803 | if num >= 0 {
804 | if 2*num+1 < len(match) && match[2*num] >= 0 {
805 | if bsrc != nil {
806 | dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
807 | } else {
808 | dst = append(dst, src[match[2*num]:match[2*num+1]]...)
809 | }
810 | }
811 | } else {
812 | for i, namei := range re.subexpNames {
813 | if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
814 | if bsrc != nil {
815 | dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
816 | } else {
817 | dst = append(dst, src[match[2*i]:match[2*i+1]]...)
818 | }
819 | break
820 | }
821 | }
822 | }
823 | }
824 | dst = append(dst, template...)
825 | return dst
826 | }
827 |
// extract parses a leading "$name" or "${name}" reference in str.
//
// On success ok is true, name is the referenced name and rest is the text
// following the reference. num is the numeric value of the name when it
// consists only of digits, has no leading zero and is not too large;
// otherwise num is -1. ok is false when str does not start with a
// well-formed reference.
func extract(str string) (name string, num int, rest string, ok bool) {
	if len(str) < 2 || str[0] != '$' {
		return "", 0, "", false
	}
	braced := str[1] == '{'
	if braced {
		str = str[2:]
	} else {
		str = str[1:]
	}

	// Consume the longest run of letters, digits and underscores.
	end := 0
	for end < len(str) {
		r, width := utf8.DecodeRuneInString(str[end:])
		if r != '_' && !unicode.IsLetter(r) && !unicode.IsDigit(r) {
			break
		}
		end += width
	}
	if end == 0 {
		// empty name is not okay
		return "", 0, "", false
	}
	name = str[:end]
	if braced {
		if end >= len(str) || str[end] != '}' {
			// missing closing brace
			return name, 0, "", false
		}
		end++
	}

	// Parse number. Any non-digit, or a value that has already reached
	// 1e8, makes the name non-numeric.
	num = 0
	for k := 0; k < len(name); k++ {
		c := name[k]
		if c < '0' || c > '9' || num >= 1e8 {
			num = -1
			break
		}
		num = num*10 + int(c) - '0'
	}
	// Disallow leading zeros.
	if len(name) > 1 && name[0] == '0' {
		num = -1
	}

	return name, num, str[end:], true
}
880 |
881 | // FindSubmatchIndex returns a slice holding the index pairs identifying the
882 | // leftmost match of the regular expression in b and the matches, if any, of
883 | // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
884 | // in the package comment.
885 | // A return value of nil indicates no match.
886 | func (re *Regexp) FindSubmatchIndex(b []byte) []int {
887 | return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
888 | }
889 |
890 | // FindStringSubmatch returns a slice of strings holding the text of the
891 | // leftmost match of the regular expression in s and the matches, if any, of
892 | // its subexpressions, as defined by the 'Submatch' description in the
893 | // package comment.
894 | // A return value of nil indicates no match.
895 | func (re *Regexp) FindStringSubmatch(s string) []string {
896 | a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
897 | if a == nil {
898 | return nil
899 | }
900 | ret := make([]string, 1+re.numSubexp)
901 | for i := range ret {
902 | if 2*i < len(a) && a[2*i] >= 0 {
903 | ret[i] = s[a[2*i]:a[2*i+1]]
904 | }
905 | }
906 | return ret
907 | }
908 |
909 | // FindStringSubmatchIndex returns a slice holding the index pairs
910 | // identifying the leftmost match of the regular expression in s and the
911 | // matches, if any, of its subexpressions, as defined by the 'Submatch' and
912 | // 'Index' descriptions in the package comment.
913 | // A return value of nil indicates no match.
914 | func (re *Regexp) FindStringSubmatchIndex(s string) []int {
915 | return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
916 | }
917 |
918 | // FindReaderSubmatchIndex returns a slice holding the index pairs
919 | // identifying the leftmost match of the regular expression of text read by
920 | // the RuneReader, and the matches, if any, of its subexpressions, as defined
921 | // by the 'Submatch' and 'Index' descriptions in the package comment. A
922 | // return value of nil indicates no match.
923 | func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
924 | return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
925 | }
926 |
// startSize is the initial capacity of the result slice allocated by each
// of the 'All' routines.
const startSize = 10 // The size at which to start a slice in the 'All' routines.
928 |
929 | // FindAll is the 'All' version of Find; it returns a slice of all successive
930 | // matches of the expression, as defined by the 'All' description in the
931 | // package comment.
932 | // A return value of nil indicates no match.
933 | func (re *Regexp) FindAll(b []byte, n int) [][]byte {
934 | if n < 0 {
935 | n = len(b) + 1
936 | }
937 | result := make([][]byte, 0, startSize)
938 | re.allMatches("", b, n, func(match []int) {
939 | result = append(result, b[match[0]:match[1]])
940 | })
941 | if len(result) == 0 {
942 | return nil
943 | }
944 | return result
945 | }
946 |
947 | // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
948 | // successive matches of the expression, as defined by the 'All' description
949 | // in the package comment.
950 | // A return value of nil indicates no match.
951 | func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
952 | if n < 0 {
953 | n = len(b) + 1
954 | }
955 | result := make([][]int, 0, startSize)
956 | re.allMatches("", b, n, func(match []int) {
957 | result = append(result, match[0:2])
958 | })
959 | if len(result) == 0 {
960 | return nil
961 | }
962 | return result
963 | }
964 |
965 | // FindAllString is the 'All' version of FindString; it returns a slice of all
966 | // successive matches of the expression, as defined by the 'All' description
967 | // in the package comment.
968 | // A return value of nil indicates no match.
969 | func (re *Regexp) FindAllString(s string, n int) []string {
970 | if n < 0 {
971 | n = len(s) + 1
972 | }
973 | result := make([]string, 0, startSize)
974 | re.allMatches(s, nil, n, func(match []int) {
975 | result = append(result, s[match[0]:match[1]])
976 | })
977 | if len(result) == 0 {
978 | return nil
979 | }
980 | return result
981 | }
982 |
983 | // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
984 | // slice of all successive matches of the expression, as defined by the 'All'
985 | // description in the package comment.
986 | // A return value of nil indicates no match.
987 | func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
988 | if n < 0 {
989 | n = len(s) + 1
990 | }
991 | result := make([][]int, 0, startSize)
992 | re.allMatches(s, nil, n, func(match []int) {
993 | result = append(result, match[0:2])
994 | })
995 | if len(result) == 0 {
996 | return nil
997 | }
998 | return result
999 | }
1000 |
1001 | // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
1002 | // of all successive matches of the expression, as defined by the 'All'
1003 | // description in the package comment.
1004 | // A return value of nil indicates no match.
1005 | func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
1006 | if n < 0 {
1007 | n = len(b) + 1
1008 | }
1009 | result := make([][][]byte, 0, startSize)
1010 | re.allMatches("", b, n, func(match []int) {
1011 | slice := make([][]byte, len(match)/2)
1012 | for j := range slice {
1013 | if match[2*j] >= 0 {
1014 | slice[j] = b[match[2*j]:match[2*j+1]]
1015 | }
1016 | }
1017 | result = append(result, slice)
1018 | })
1019 | if len(result) == 0 {
1020 | return nil
1021 | }
1022 | return result
1023 | }
1024 |
1025 | // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
1026 | // a slice of all successive matches of the expression, as defined by the
1027 | // 'All' description in the package comment.
1028 | // A return value of nil indicates no match.
1029 | func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
1030 | if n < 0 {
1031 | n = len(b) + 1
1032 | }
1033 | result := make([][]int, 0, startSize)
1034 | re.allMatches("", b, n, func(match []int) {
1035 | result = append(result, match)
1036 | })
1037 | if len(result) == 0 {
1038 | return nil
1039 | }
1040 | return result
1041 | }
1042 |
1043 | // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1044 | // returns a slice of all successive matches of the expression, as defined by
1045 | // the 'All' description in the package comment.
1046 | // A return value of nil indicates no match.
1047 | func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1048 | if n < 0 {
1049 | n = len(s) + 1
1050 | }
1051 | result := make([][]string, 0, startSize)
1052 | re.allMatches(s, nil, n, func(match []int) {
1053 | slice := make([]string, len(match)/2)
1054 | for j := range slice {
1055 | if match[2*j] >= 0 {
1056 | slice[j] = s[match[2*j]:match[2*j+1]]
1057 | }
1058 | }
1059 | result = append(result, slice)
1060 | })
1061 | if len(result) == 0 {
1062 | return nil
1063 | }
1064 | return result
1065 | }
1066 |
1067 | // FindAllStringSubmatchIndex is the 'All' version of
1068 | // FindStringSubmatchIndex; it returns a slice of all successive matches of
1069 | // the expression, as defined by the 'All' description in the package
1070 | // comment.
1071 | // A return value of nil indicates no match.
1072 | func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1073 | if n < 0 {
1074 | n = len(s) + 1
1075 | }
1076 | result := make([][]int, 0, startSize)
1077 | re.allMatches(s, nil, n, func(match []int) {
1078 | result = append(result, match)
1079 | })
1080 | if len(result) == 0 {
1081 | return nil
1082 | }
1083 | return result
1084 | }
1085 |
--------------------------------------------------------------------------------
/restructure.go:
--------------------------------------------------------------------------------
1 | package restructure
2 |
3 | import (
4 | "fmt"
5 | "reflect"
6 | "regexp/syntax"
7 |
8 | "github.com/alexflint/go-restructure/regex"
9 | )
10 |
// Style determines whether we are in Perl or POSIX or custom mode
type Style int

// The available styles. Perl is the zero value, so a zero Options selects
// Perl mode.
const (
	// Perl makes CompileType use syntax.Perl as the syntax flags.
	Perl Style = iota
	// POSIX makes CompileType use syntax.POSIX as the syntax flags.
	POSIX
	// CustomStyle leaves Options.SyntaxFlags exactly as the caller set them.
	CustomStyle
)
19 |
// Options represents optional parameters for compilation
type Options struct {
	Style Style // Style can be set to Perl, POSIX, or CustomStyle
	// SyntaxFlags is overwritten by CompileType when Style is Perl or
	// POSIX; it is only honoured as given when Style is CustomStyle.
	SyntaxFlags syntax.Flags
}
25 |
// subcapture is the [begin, end) byte range of one capture within the
// input. Either bound is -1 when the capture was not matched.
type subcapture struct {
	begin, end int
}

// wasMatched reports whether the capture took part in the match, i.e.
// neither bound is -1.
func (r subcapture) wasMatched() bool {
	return r.begin != -1 && r.end != -1
}

// match pairs the input that was searched with the captures found in it.
type match struct {
	input    []byte
	captures []subcapture
}

// matchFromIndices builds a match from a flat slice of (begin, end) index
// pairs and the input those indices refer to. indices must have even
// length. captures is left nil when indices is empty.
func matchFromIndices(indices []int, input []byte) *match {
	m := &match{input: input}
	if len(indices) == 0 {
		return m
	}
	// The number of captures is known up front, so allocate once.
	m.captures = make([]subcapture, 0, len(indices)/2)
	for i := 0; i < len(indices); i += 2 {
		m.captures = append(m.captures, subcapture{begin: indices[i], end: indices[i+1]})
	}
	return m
}
48 |
// Pos represents a position within a matched region. If a matched struct
// contains a field of type Pos then that field is assigned a position in
// the input string, determined by where the Pos field sits among the
// struct's other fields.
type Pos int
53 |
// Submatch represents a matched region. It is used to determine the begin
// and end position of the match corresponding to a field. This library
// treats fields of type `Submatch` just like `string` or `[]byte` fields,
// except that the matched bytes are inserted into `Submatch.Bytes` and their
// begin and end position are inserted into `Submatch.Begin` and
// `Submatch.End`.
type Submatch struct {
	Begin Pos
	End   Pos
	Bytes []byte
}

// String gets the matched substring
func (r *Submatch) String() string {
	return string(r.Bytes)
}
69 |
// Regexp is a regular expression that captures submatches into struct fields.
type Regexp struct {
	st   *Struct        // struct description produced by the builder in CompileType
	re   *regex.Regexp  // compiled regular expression
	t    reflect.Type   // struct type the expression was compiled from (never a pointer type)
	opts Options        // options the expression was compiled with
}
77 |
78 | // Find attempts to match the regular expression against the input string. It
79 | // returns true if there was a match, and also populates the fields of the provided
80 | // struct with the contents of each submatch.
81 | func (r *Regexp) Find(dest interface{}, s string) bool {
82 | v := reflect.ValueOf(dest)
83 | input := []byte(s)
84 |
85 | // Check the type
86 | expected := reflect.PtrTo(r.t)
87 | if v.Type() != expected {
88 | panic(fmt.Errorf("expected destination to be *%s but got %T", r.t.String(), dest))
89 | }
90 |
91 | // Execute the regular expression
92 | indices := r.re.FindSubmatchIndex(input)
93 | if indices == nil {
94 | return false
95 | }
96 |
97 | // Inflate matches into original struct
98 | match := matchFromIndices(indices, input)
99 |
100 | err := inflateStruct(v, match, r.st)
101 | if err != nil {
102 | panic(err)
103 | }
104 | return true
105 | }
106 |
107 | // FindAll attempts to match the regular expression against the input string. It returns true
108 | // if there was at least one match.
109 | func (r *Regexp) FindAll(dest interface{}, s string, limit int) {
110 | // Check the type
111 | v := reflect.ValueOf(dest)
112 | t := v.Type()
113 | if t.Kind() != reflect.Ptr {
114 | panic(fmt.Errorf("parameter to FindAll should be a pointer to a slice but got %T", dest))
115 | }
116 |
117 | sliceType := t.Elem()
118 | if sliceType.Kind() != reflect.Slice {
119 | panic(fmt.Errorf("parameter to FindAll should be a pointer to a slice but got %T", dest))
120 | }
121 |
122 | itemType := sliceType.Elem()
123 | if itemType != r.t && itemType != reflect.PtrTo(r.t) {
124 | panic(fmt.Errorf("expected the slice element to be %s or *%s but it was %s", r.t, r.t, t))
125 | }
126 |
127 | // Execute the regular expression
128 | input := []byte(s)
129 | matches := r.re.FindAllSubmatchIndex(input, limit)
130 |
131 | // Allocate a slice with the desired length
132 | v.Elem().Set(reflect.MakeSlice(sliceType, len(matches), len(matches)))
133 |
134 | // Inflate the matches into the slice elements
135 | for i, indices := range matches {
136 | // Get the i-th element of the slice
137 | destItem := v.Elem().Index(i)
138 | if itemType.Kind() != reflect.Ptr {
139 | destItem = destItem.Addr()
140 | }
141 |
142 | // Create the match object
143 | match := matchFromIndices(indices, input)
144 |
145 | // Inflate the match into the dest item
146 | err := inflateStruct(destItem, match, r.st)
147 | if err != nil {
148 | panic(err)
149 | }
150 | }
151 | }
152 |
153 | // String returns a string representation of the regular expression
154 | func (r *Regexp) String() string {
155 | return r.re.String()
156 | }
157 |
158 | // Compile constructs a regular expression from the struct fields on the
159 | // provided struct.
160 | func Compile(proto interface{}, opts Options) (*Regexp, error) {
161 | return CompileType(reflect.TypeOf(proto), opts)
162 | }
163 |
164 | // CompileType is like Compile but takes a reflect.Type instead.
165 | func CompileType(t reflect.Type, opts Options) (*Regexp, error) {
166 | // We do this so that the zero value for Options gives us Perl mode,
167 | // which is also the default used by the standard library regexp package
168 | switch opts.Style {
169 | case Perl:
170 | opts.SyntaxFlags = syntax.Perl
171 | case POSIX:
172 | opts.SyntaxFlags = syntax.POSIX
173 | }
174 |
175 | if t.Kind() == reflect.Ptr {
176 | t = t.Elem()
177 | }
178 |
179 | // Traverse the struct
180 | b := newBuilder(opts)
181 | st, expr, err := b.structure(t)
182 | if err != nil {
183 | return nil, err
184 | }
185 |
186 | // Compile regular expression
187 | re, err := regex.CompileSyntax(expr)
188 | if err != nil {
189 | return nil, err
190 | }
191 |
192 | // Return
193 | return &Regexp{
194 | st: st,
195 | re: re,
196 | t: t,
197 | opts: opts,
198 | }, nil
199 | }
200 |
201 | // MustCompile is like Compile but panics if there is a compilation error
202 | func MustCompile(proto interface{}, opts Options) *Regexp {
203 | re, err := Compile(proto, opts)
204 | if err != nil {
205 | panic(err)
206 | }
207 | return re
208 | }
209 |
210 | // MustCompileType is like CompileType but panics if there is a compilation error
211 | func MustCompileType(t reflect.Type, opts Options) *Regexp {
212 | re, err := CompileType(t, opts)
213 | if err != nil {
214 | panic(err)
215 | }
216 | return re
217 | }
218 |
219 | // Find constructs a regular expression from the given struct and executes it on the
220 | // given string, placing submatches into the fields of the struct. The first parameter
221 | // must be a non-nil struct pointer. It returns true if the match succeeded. The only
222 | // errors that are returned are compilation errors.
223 | func Find(dest interface{}, s string) (bool, error) {
224 | re, err := Compile(dest, Options{})
225 | if err != nil {
226 | return false, err
227 | }
228 | return re.Find(dest, s), nil
229 | }
230 |
--------------------------------------------------------------------------------
/restructure_test.go:
--------------------------------------------------------------------------------
1 | package restructure
2 |
3 | import (
4 | "encoding/json"
5 | "testing"
6 |
7 | "github.com/stretchr/testify/assert"
8 | "github.com/stretchr/testify/require"
9 | )
10 |
11 | func assertRegion(t *testing.T, s string, begin int, end int, r *Submatch) {
12 | assert.NotNil(t, r)
13 | assert.Equal(t, s, string(r.Bytes))
14 | assert.EqualValues(t, begin, r.Begin)
15 | assert.EqualValues(t, end, r.End)
16 | }
17 |
// DotName matches a literal dot followed by a word.
type DotName struct {
	Dot  string `regexp:"\\."`
	Name string `regexp:"\\w+"`
}

// DotExpr matches a whole input consisting of a word optionally followed by
// a DotName. Tail is tagged "?" and is left nil when the input has no
// dotted suffix (see TestMatchNameDotNameHeadOnly).
type DotExpr struct {
	_    struct{} `regexp:"^"`
	Head string   `regexp:"\\w+"`
	Tail *DotName `regexp:"?"`
	_    struct{} `regexp:"$"`
}
29 |
30 | func TestMatchNameDotName(t *testing.T) {
31 | pattern, err := Compile(DotExpr{}, Options{})
32 | require.NoError(t, err)
33 |
34 | var v DotExpr
35 | assert.True(t, pattern.Find(&v, "foo.bar"))
36 | assert.Equal(t, "foo", v.Head)
37 | require.NotNil(t, v.Tail)
38 | assert.Equal(t, ".", v.Tail.Dot)
39 | assert.Equal(t, "bar", v.Tail.Name)
40 | }
41 |
42 | func TestMatchNameDotNameHeadOnly(t *testing.T) {
43 | pattern, err := Compile(DotExpr{}, Options{})
44 | require.NoError(t, err)
45 |
46 | var v DotExpr
47 | assert.True(t, pattern.Find(&v, "head"))
48 | assert.Equal(t, "head", v.Head)
49 | assert.Nil(t, v.Tail)
50 | }
51 |
52 | func TestMatchNameDotNameFails(t *testing.T) {
53 | pattern, err := Compile(DotExpr{}, Options{})
54 | require.NoError(t, err)
55 |
56 | var v DotExpr
57 | assert.False(t, pattern.Find(&v, ".oops"))
58 | }
59 |
// URL matches "<scheme>://<host>" over the whole input. The json tags sit
// alongside the regexp tags so the struct can also be marshalled (see
// TestCombinationWithJSONTags).
type URL struct {
	_      string `regexp:"^"`
	Scheme string `regexp:"[[:alpha:]]+" json:"scheme"`
	_      string `regexp:"://"`
	Host   string `regexp:".*" json:"host"`
	_      string `regexp:"$"`
}
67 |
68 | func TestMatchURL(t *testing.T) {
69 | pattern, err := Compile(URL{}, Options{})
70 | require.NoError(t, err)
71 |
72 | var v URL
73 | require.True(t, pattern.Find(&v, "http://example.com"))
74 | assert.Equal(t, "http", v.Scheme)
75 | assert.Equal(t, "example.com", v.Host)
76 | }
77 |
78 | func TestCombinationWithJSONTags(t *testing.T) {
79 | pattern, err := Compile(URL{}, Options{})
80 | require.NoError(t, err)
81 |
82 | var v URL
83 | require.True(t, pattern.Find(&v, "http://example.com"))
84 |
85 | js, err := json.Marshal(&v)
86 | require.NoError(t, err)
87 |
88 | assert.Equal(t, "{\"scheme\":\"http\",\"host\":\"example.com\"}", string(js))
89 | }
90 |
// PtrURL is the URL pattern with *string capture fields. The fields stay
// nil when the input does not match (see TestMatchPtrURLFailed).
type PtrURL struct {
	_      struct{} `regexp:"^"`
	Scheme *string  `regexp:"[[:alpha:]]+"`
	_      struct{} `regexp:"://"`
	Host   *string  `regexp:".*"`
	_      struct{} `regexp:"$"`
}
98 |
99 | func TestMatchPtrURL(t *testing.T) {
100 | pattern, err := Compile(PtrURL{}, Options{})
101 | require.NoError(t, err)
102 |
103 | var v PtrURL
104 | require.True(t, pattern.Find(&v, "http://example.com"))
105 | require.NotNil(t, v.Scheme)
106 | require.NotNil(t, v.Host)
107 | assert.Equal(t, "http", *v.Scheme)
108 | assert.Equal(t, "example.com", *v.Host)
109 | }
110 |
111 | func TestMatchPtrURLFailed(t *testing.T) {
112 | pattern, err := Compile(PtrURL{}, Options{})
113 | require.NoError(t, err)
114 |
115 | var v PtrURL
116 | require.False(t, pattern.Find(&v, "oops"))
117 | assert.Nil(t, v.Scheme)
118 | assert.Nil(t, v.Host)
119 | }
120 |
// NakedURL is the URL pattern written with "naked" tags: each tag is the
// bare regular expression, without the regexp:"..." key.
type NakedURL struct {
	_      string `^`
	Scheme string `[[:alpha:]]+`
	_      string `://`
	Host   string `.*`
	_      string `$`
}
128 |
129 | func TestMatchNakedURL(t *testing.T) {
130 | pattern, err := Compile(NakedURL{}, Options{})
131 | require.NoError(t, err)
132 |
133 | var v NakedURL
134 | require.True(t, pattern.Find(&v, "http://example.com"))
135 | assert.Equal(t, "http", v.Scheme)
136 | assert.Equal(t, "example.com", v.Host)
137 | }
138 |
// Nothing has a single field with no tag at all; TestEmptyPattern uses it
// to check that such a struct still compiles and matches.
type Nothing struct {
	X string
}
142 |
143 | func TestEmptyPattern(t *testing.T) {
144 | pattern, err := Compile(Nothing{}, Options{})
145 | require.NoError(t, err)
146 |
147 | var v Nothing
148 | require.True(t, pattern.Find(&v, "abc"))
149 | }
150 |
// Malformed carries a deliberately invalid struct tag; Compile is expected
// to report an error for it (see TestErrorOnMalformedTag).
type Malformed struct {
	X string `regexp:"\w"` // this is malformed because \w is not a valid escape sequence
}
154 |
155 | func TestErrorOnMalformedTag(t *testing.T) {
156 | _, err := Compile(Malformed{}, Options{})
157 | assert.Error(t, err)
158 | }
159 |
// HasSubcaptures has a field whose own expression contains a capturing
// group; TestRemoveSubcaptures checks that the field still receives the
// whole match.
type HasSubcaptures struct {
	Name string `a(bc)?d`
}
163 |
164 | func TestRemoveSubcaptures(t *testing.T) {
165 | pattern, err := Compile(HasSubcaptures{}, Options{})
166 | require.NoError(t, err)
167 |
168 | var v HasSubcaptures
169 | require.True(t, pattern.Find(&v, "abcd"))
170 | assert.Equal(t, "abcd", v.Name)
171 | }
172 |
// DotNameRegion is DotName with *Submatch fields, so each capture also
// records its begin and end position.
type DotNameRegion struct {
	Dot  *Submatch `regexp:"\\."`
	Name *Submatch `regexp:"\\w+"`
}

// DotExprRegion is DotExpr with a Submatch (by value) head and an optional
// DotNameRegion tail.
type DotExprRegion struct {
	_    struct{}       `regexp:"^"`
	Head Submatch       `regexp:"\\w+"`
	Tail *DotNameRegion `regexp:"?"`
	_    struct{}       `regexp:"$"`
}
184 |
185 | func TestMatchNameDotNameRegion(t *testing.T) {
186 | pattern, err := Compile(DotExprRegion{}, Options{})
187 | require.NoError(t, err)
188 |
189 | var v DotExprRegion
190 | assert.True(t, pattern.Find(&v, "foo.bar"))
191 | assertRegion(t, "foo", 0, 3, &v.Head)
192 | assert.NotNil(t, v.Tail)
193 | assertRegion(t, ".", 3, 4, v.Tail.Dot)
194 | assertRegion(t, "bar", 4, 7, v.Tail.Name)
195 | }
196 |
// DotNamePos is DotName with Pos fields placed before, between and after
// the captures.
type DotNamePos struct {
	Begin  Pos
	Dot    string `regexp:"\\."`
	Middle Pos
	Name   string `regexp:"\\w+"`
	End    Pos
}

// DotExprPos is DotExpr with Pos fields placed around the captures; the
// expected offsets for "foo.bar" are asserted in TestMatchNameDotNamePos.
type DotExprPos struct {
	Begin  Pos
	_      struct{}    `regexp:"^"`
	Head   string      `regexp:"\\w+"`
	Middle Pos
	Tail   *DotNamePos `regexp:"?"`
	_      struct{}    `regexp:"$"`
	End    Pos
}
214 |
215 | func TestMatchNameDotNamePos(t *testing.T) {
216 | pattern, err := Compile(DotExprPos{}, Options{})
217 | require.NoError(t, err)
218 |
219 | var v DotExprPos
220 | assert.True(t, pattern.Find(&v, "foo.bar"))
221 | assert.EqualValues(t, 0, v.Begin)
222 | assert.EqualValues(t, 3, v.Middle)
223 | assert.EqualValues(t, 3, v.Tail.Begin)
224 | assert.EqualValues(t, 4, v.Tail.Middle)
225 | assert.EqualValues(t, 7, v.Tail.End)
226 | assert.EqualValues(t, 7, v.End)
227 | }
228 |
// DegeneratePos contains nothing but Pos fields, i.e. position captures
// with no pattern between them.
type DegeneratePos struct {
	X Pos
	Y Pos
}
233 |
234 | func TestDegeneratePos(t *testing.T) {
235 | // This tests what happens if there are degenerate position captures
236 | pattern, err := Compile(DegeneratePos{}, Options{})
237 | require.NoError(t, err)
238 | var v DegeneratePos
239 | assert.True(t, pattern.Find(&v, "abc"))
240 | assert.EqualValues(t, 0, v.X)
241 | assert.EqualValues(t, 0, v.Y)
242 | }
243 |
// UnexportedPos mixes an exported and an unexported Pos field in front of
// an end-of-input anchor; the unexported one is expected to be ignored.
type UnexportedPos struct {
	Exported   Pos
	unexported Pos
	_          struct{} `regexp:"$"`
}
249 |
250 | func TestUnexportedPos(t *testing.T) {
251 | // This tests what happens if there are non-exported Pos fields
252 | pattern, err := Compile(UnexportedPos{}, Options{})
253 | require.NoError(t, err)
254 | var v UnexportedPos
255 | assert.True(t, pattern.Find(&v, "abc"))
256 | assert.EqualValues(t, 3, v.Exported)
257 | assert.EqualValues(t, 0, v.unexported) // should be ignored
258 | }
259 |
// Word matches a single run of word characters; it is the element type
// used by the FindAll tests.
type Word struct {
	S string `\w+`
}
263 |
264 | func TestFindAllWords_Simple(t *testing.T) {
265 | pattern := MustCompile(Word{}, Options{})
266 | var words []Word
267 | pattern.FindAll(&words, "ham is spam", -1)
268 | require.Len(t, words, 3)
269 | assert.EqualValues(t, "ham", words[0].S)
270 | assert.EqualValues(t, "is", words[1].S)
271 | assert.EqualValues(t, "spam", words[2].S)
272 | }
273 |
274 | func TestFindAllWords_Ptr(t *testing.T) {
275 | pattern := MustCompile(Word{}, Options{})
276 | var words []*Word
277 | pattern.FindAll(&words, "ham is spam", -1)
278 | require.Len(t, words, 3)
279 | assert.EqualValues(t, "ham", words[0].S)
280 | assert.EqualValues(t, "is", words[1].S)
281 | assert.EqualValues(t, "spam", words[2].S)
282 | }
283 |
284 | func TestFindAllWords_NoMatches(t *testing.T) {
285 | pattern := MustCompile(Word{}, Options{})
286 | var words []*Word
287 | pattern.FindAll(&words, "*&!", -1)
288 | require.Empty(t, words)
289 | }
290 |
291 | func TestFindAllWords_ByValueSlicePanics(t *testing.T) {
292 | pattern := MustCompile(Word{}, Options{})
293 | var words []*Word
294 | // This should panic because words is passed by value not by pointer:
295 | assert.Panics(t, func() { pattern.FindAll(words, "*&!", -1) })
296 | }
297 |
// WordSubmatch is Word with a *Submatch field, so each match also records
// its begin and end position.
type WordSubmatch struct {
	S *Submatch `\w+`
}
301 |
302 | func TestFindAllWords_Regions(t *testing.T) {
303 | pattern := MustCompile(WordSubmatch{}, Options{})
304 | var words []*WordSubmatch
305 | pattern.FindAll(&words, "ham is spam", -1)
306 | require.Len(t, words, 3)
307 | assertRegion(t, "ham", 0, 3, words[0].S)
308 | assertRegion(t, "is", 4, 6, words[1].S)
309 | assertRegion(t, "spam", 7, 11, words[2].S)
310 | }
311 |
// ExprWithInt matches "<digits><whitespace><word>" over the whole input and
// captures the digits into an int field.
type ExprWithInt struct {
	Number int    `regexp:"^\\d+"`
	_      string `regexp:"\\s+"`
	Animal string `regexp:"\\w+$"`
}
317 |
318 | func TestMatchWithInt(t *testing.T) {
319 | pattern, err := Compile(ExprWithInt{}, Options{})
320 | require.NoError(t, err)
321 |
322 | var v ExprWithInt
323 | assert.True(t, pattern.Find(&v, "4 wombats"))
324 | assert.Equal(t, 4, v.Number)
325 | assert.Equal(t, "wombats", v.Animal)
326 | }
327 |
--------------------------------------------------------------------------------
/samples/email-address/email-address.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/alexflint/go-restructure"
7 | )
8 |
// Hostname matches "<domain>.<tld>": a run of word characters, a literal dot
// (not captured), and another run of word characters.
type Hostname struct {
	Domain string   `\w+`
	_      struct{} `\.`
	TLD    string   `\w+`
}
14 |
// EmailAddress matches a whole string of the form "<user>@<host>", where the
// host part is described by the nested Hostname struct. The ^ and $ fields
// anchor the match to the start and end of the input.
type EmailAddress struct {
	_    struct{} `^`
	User string   `[a-zA-Z0-9._%+-]+`
	_    struct{} `@`
	Host *Hostname
	_    struct{} `$`
}
22 |
23 | func main() {
24 | var addr EmailAddress
25 | success, _ := restructure.Find(&addr, "joe@example.com")
26 | if success {
27 | fmt.Println(addr.User) // prints "joe"
28 | fmt.Println(addr.Host.Domain) // prints "example"
29 | fmt.Println(addr.Host.TLD) // prints "com"
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/samples/find-all-floats/find-all-floats.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/alexflint/go-restructure"
7 | )
8 |
// src is the sample text that main scans for numbers.
var src = `
The US economy went through an economic downturn following the financial
crisis of 2007–08, with output as late as 2013 still below potential
according to the Congressional Budget Office.[57] The economy, however,
began to recover in the second half of 2009, and as of November 2015,
unemployment had declined from a high of 10% to 5%; the government's
broader U-6 unemployment rate, which includes the part-time underemployed,
was 9.8% (it had reached 16% in 2009).[13] At 11.3%, the U.S. has one of
the lowest labor union participation rates in the OECD.[58] Households
living on less than $2 per day before government benefits, doubled from
1996 levels to 1.5 million households in 2011, including 2.8 million
children.[59] The gap in income between rich and poor is greater in the
United States than in any other developed country.[60] Total public and
private debt was $50 trillion at the end of the first quarter of 2010,
or 3.5 times GDP.[61] In December 2014, public debt was slightly more
than 100% of GDP.[62] Domestic financial assets totaled $131 trillion
and domestic financial liabilities totaled $106 trillion.[63]
`
27 |
// floatRegexp is the pattern built from Float, compiled once at package
// initialization.
var floatRegexp = restructure.MustCompile(Float{}, restructure.Options{})
29 |
// Float matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123".
//
// Begin and End are restructure.Pos fields; main uses them as byte offsets
// to slice the matched text back out of the input.
type Float struct {
	Begin    restructure.Pos
	Sign     *Sign     `?`
	Whole    string    `[0-9]*`
	Period   struct{}  `\.?`
	Frac     string    `[0-9]+`
	Exponent *Exponent `?`
	End      restructure.Pos
}
40 |
// Sign matches "+" or "-" and stores the character in Ch.
type Sign struct {
	Ch string `[+-]`
}
45 |
// Exponent matches "e+4", "E6", "e-03": an "e" or "E" marker (not captured),
// a sign tagged `?`, and one or more digits.
type Exponent struct {
	_    struct{} `[eE]`
	Sign *Sign    `?`
	Num  string   `[0-9]+`
}
52 |
53 | func main() {
54 | var floats []Float
55 | floatRegexp.FindAll(&floats, src, -1)
56 | for _, f := range floats {
57 | fmt.Println(src[f.Begin:f.End])
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/samples/floating-point/floating-point.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 |
7 | "github.com/alexflint/go-restructure"
8 | )
9 |
// floatRegexp is the pattern built from Float, compiled once at package
// initialization.
var floatRegexp = restructure.MustCompile(Float{}, restructure.Options{})
11 |
// Float matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123".
//
// Sign and Exponent are pointers to nested structs and are tagged `?`.
type Float struct {
	Sign     *Sign     `?`
	Whole    string    `[0-9]*`
	Period   struct{}  `\.?`
	Frac     string    `[0-9]+`
	Exponent *Exponent `?`
}
20 |
// Sign matches "+" or "-" and stores the character in Ch.
type Sign struct {
	Ch string `[+-]`
}
25 |
// Exponent matches "e+4", "E6", "e-03": an "e" or "E" marker (not captured),
// a sign tagged `?`, and one or more digits.
type Exponent struct {
	_    struct{} `[eE]`
	Sign *Sign    `?`
	Num  string   `[0-9]+`
}
32 |
// prettyPrint renders x as indented JSON. If x cannot be marshalled, the
// marshalling error's message is returned in place of the JSON text.
func prettyPrint(x interface{}) string {
	encoded, err := json.MarshalIndent(x, "", " ")
	if err == nil {
		return string(encoded)
	}
	return err.Error()
}
40 |
41 | func main() {
42 | var f Float
43 | for _, str := range []string{"1.23", "1.23e+45", ".123", "12e3"} {
44 | floatRegexp.Find(&f, str)
45 | fmt.Printf("\"%s\" -> %s\n\n", str, prettyPrint(f))
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/samples/name-dot-name/name-dot-name.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "log"
7 |
8 | "github.com/alexflint/go-arg"
9 | "github.com/alexflint/go-restructure"
10 | )
11 |
// DotName matches a literal dot followed by a run of word characters,
// capturing both parts as strings.
type DotName struct {
	Dot  string `\.`
	Name string `\w+`
}
16 |
// DotExpr matches a whole string consisting of "foo" followed by a DotName
// tail that is tagged `?`. The ^ and $ fields anchor the match to the start
// and end of the input.
type DotExpr struct {
	_    struct{} `^`
	Head string   `foo`
	Tail *DotName `?`
	_    struct{} `$`
}
23 |
// prettyPrint renders x as indented JSON. If x cannot be marshalled, the
// marshalling error's message is returned in place of the JSON text.
func prettyPrint(x interface{}) string {
	encoded, err := json.MarshalIndent(x, "", " ")
	if err == nil {
		return string(encoded)
	}
	return err.Error()
}
31 |
32 | func main() {
33 | var args struct {
34 | Str string `arg:"positional"`
35 | }
36 | arg.MustParse(&args)
37 |
38 | // Construct the regular expression
39 | pattern, err := restructure.Compile(&DotExpr{}, restructure.Options{})
40 | if err != nil {
41 | log.Fatal(err)
42 | }
43 |
44 | // Match
45 | var v DotExpr
46 | fmt.Println(pattern.Find(&v, args.Str))
47 | }
48 |
--------------------------------------------------------------------------------
/samples/python-import/python-import.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/alexflint/go-restructure"
7 | )
8 |
// importRegexp is the pattern built from Import, compiled once at package
// initialization.
var importRegexp = restructure.MustCompile(Import{}, restructure.Options{})
10 |
// Import matches "import foo" and "import foo as bar".
//
// Package is a restructure.Submatch, so main can print its Begin and End
// values in addition to the matched text. Alias is a pointer tagged `?`.
type Import struct {
	_       struct{}             `^import\s+`
	Package restructure.Submatch `\w+`
	Alias   *AsName              `?`
	_       struct{}             `$`
}
18 |
// AsName matches "as xyz", including the whitespace on either side of "as",
// and captures the name as a restructure.Submatch.
type AsName struct {
	_    struct{}             `\s+as\s+`
	Name restructure.Submatch `\w+`
}
24 |
25 | func main() {
26 | var imp Import
27 | importRegexp.Find(&imp, "import foo as bar")
28 | fmt.Printf("IMPORT %s (bytes %d...%d)\n", imp.Package.String(), imp.Package.Begin, imp.Package.End)
29 | fmt.Printf(" AS %s (bytes %d...%d)\n", imp.Alias.Name.String(), imp.Alias.Name.Begin, imp.Alias.Name.End)
30 | }
31 |
--------------------------------------------------------------------------------
/samples/quaternion-in-json/quaternion-in-json.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 |
7 | "github.com/alexflint/go-restructure"
8 | )
9 |
// quaternionRegexp is the pattern built from QuotedQuaternion, compiled once
// at package initialization and used by QuotedQuaternion.UnmarshalJSON.
var quaternionRegexp = restructure.MustCompile(QuotedQuaternion{}, restructure.Options{})
11 |
// RealPart matches an integer whose sign is optional: Sign holds "+", "-" or
// the empty string, and Real holds the digits.
type RealPart struct {
	Sign string `regexp:"[+-]?"`
	Real string `regexp:"[0-9]+"`
}
16 |
// SignedInt matches an integer with a mandatory sign: Sign holds "+" or "-",
// and Real holds the digits.
type SignedInt struct {
	Sign string `regexp:"[+-]"`
	Real string `regexp:"[0-9]+"`
}
21 |
// IPart matches a signed integer followed by the literal "i".
type IPart struct {
	Magnitude SignedInt
	_         struct{} `regexp:"i"`
}
26 |
// JPart matches a signed integer followed by the literal "j".
type JPart struct {
	Magnitude SignedInt
	_         struct{} `regexp:"j"`
}
31 |
// KPart matches a signed integer followed by the literal "k".
type KPart struct {
	Magnitude SignedInt
	_         struct{} `regexp:"k"`
}
36 |
// Quaternion matches "1+2i+3j+4k", "-1+2k", "-1", etc: a real part followed
// by i, j and k parts, each of which is tagged `?`.
type Quaternion struct {
	Real *RealPart
	I    *IPart `regexp:"?"`
	J    *JPart `regexp:"?"`
	K    *KPart `regexp:"?"`
}
44 |
// QuotedQuaternion matches the quoted strings `"-1+2i+3j+4k"`, `"3-4k"`,
// `"12+34i"`, etc: a Quaternion wrapped in double quotes, anchored to the
// start and end of the input.
type QuotedQuaternion struct {
	_          struct{} `regexp:"^"`
	_          struct{} `regexp:"\""`
	Quaternion *Quaternion
	_          struct{} `regexp:"\""`
	_          struct{} `regexp:"$"`
}
53 |
54 | func (c *QuotedQuaternion) UnmarshalJSON(b []byte) error {
55 | if !quaternionRegexp.Find(c, string(b)) {
56 | return fmt.Errorf("%s is not a quaternion number", string(b))
57 | }
58 | return nil
59 | }
60 |
// Var is the struct that main decodes with encoding/json. Its Value field is
// a *QuotedQuaternion, which defines its own UnmarshalJSON method.
type Var struct {
	Name  string
	Value *QuotedQuaternion
}
66 |
// prettyPrint renders x as indented JSON. If x cannot be marshalled, the
// marshalling error's message is returned in place of the JSON text.
func prettyPrint(x interface{}) string {
	encoded, err := json.MarshalIndent(x, "", " ")
	if err == nil {
		return string(encoded)
	}
	return err.Error()
}
74 |
75 | func main() {
76 | src := `{"name": "foo", "value": "1+2i+3j+4k"}`
77 | var v Var
78 | err := json.Unmarshal([]byte(src), &v)
79 | if err != nil {
80 | fmt.Println(err)
81 | }
82 | fmt.Println(prettyPrint(v))
83 | }
84 |
--------------------------------------------------------------------------------
/samples/simple-email/simple-email.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/alexflint/go-restructure"
7 | )
8 |
// EmailAddress matches a whole string of the form "<user>@<host>": a run of
// word characters, a literal "@", and a run of characters other than "@".
// The ^ and $ fields anchor the match to the start and end of the input.
type EmailAddress struct {
	_    struct{} `^`
	User string   `\w+`
	_    struct{} `@`
	Host string   `[^@]+`
	_    struct{} `$`
}
16 |
17 | func main() {
18 | var addr EmailAddress
19 | success, err := restructure.Find(&addr, "joe@example.com")
20 | if err != nil {
21 | fmt.Println(err)
22 | }
23 | if success {
24 | fmt.Println(addr.User) // prints "joe"
25 | fmt.Println(addr.Host) // prints "example.com"
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/transform.go:
--------------------------------------------------------------------------------
1 | package restructure
2 |
3 | import "regexp/syntax"
4 |
// transformer maps a single regex AST node to the list of nodes that should
// take its place in the parent's child list. The list may be empty (the node
// is dropped) or hold several nodes.
type transformer func(expr *syntax.Regexp) ([]*syntax.Regexp, error)

// transform rewrites the subtree below expr in place. Every descendant is
// first transformed recursively and then handed to f, and the concatenated
// results become the new child list of its parent. expr itself is never
// passed to f; it is returned with its Sub field replaced. The first error
// from f stops the traversal and is returned with a nil node.
func transform(expr *syntax.Regexp, f transformer) (*syntax.Regexp, error) {
	children := expr.Sub
	var rewritten []*syntax.Regexp
	for i := range children {
		// Depth-first: finish the grandchildren before rewriting the child.
		sub, err := transform(children[i], f)
		if err != nil {
			return nil, err
		}
		out, err := f(sub)
		if err != nil {
			return nil, err
		}
		rewritten = append(rewritten, out...)
	}
	expr.Sub = rewritten
	return expr, nil
}
25 |
--------------------------------------------------------------------------------