├── .github
    └── workflows
    │   └── go.yml
├── BENCHMARKS.md
├── LICENSE.md
├── README.md
├── TODO.md
├── benchmark_test.go
├── builder.go
├── go.mod
├── go.sum
├── inflate.go
├── regex
    ├── README.md
    ├── backtrack.go
    ├── machine.go
    ├── onepass.go
    └── regexp.go
├── restructure.go
├── restructure_test.go
├── samples
    ├── email-address
    │   └── email-address.go
    ├── find-all-floats
    │   └── find-all-floats.go
    ├── floating-point
    │   └── floating-point.go
    ├── name-dot-name
    │   └── name-dot-name.go
    ├── python-import
    │   └── python-import.go
    ├── quaternion-in-json
    │   └── quaternion-in-json.go
    └── simple-email
    │   └── simple-email.go
└── transform.go


/.github/workflows/go.yml:
--------------------------------------------------------------------------------
 1 | name: Go
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 | 
 7 |   test:
 8 |     name: Test
 9 |     runs-on: ubuntu-latest
10 | 
11 |     strategy:
12 |       fail-fast: false
13 |       matrix:
14 |         go: ['1.22', '1.23', 'stable']
15 | 
16 |     steps:
17 |     - name: Checkout
18 |       uses: actions/checkout@v4
19 | 
20 |     - id: go
21 |       name: Setup
22 |       uses: actions/setup-go@v5
23 |       with:
24 |         go-version: ${{ matrix.go }}
25 | 
26 |     - name: Build
27 |       run: go build -v .
28 | 
29 |     - name: Test
30 |       run: go test -v -coverprofile=profile.cov .
31 | 
32 |     - name: Coverage
33 |       run: bash <(curl -s https://codecov.io/bash) -f profile.cov
34 | 


--------------------------------------------------------------------------------
/BENCHMARKS.md:
--------------------------------------------------------------------------------
 1 | ## Performance Benchmarks
 2 | 
 3 | There are three benchmarks in `benchmark_test.go` that compare the performance of `go-restructure` to that of the standard library `regexp` package. `go-restructure` uses a very slightly modified version of the `regexp` package so the performance of the core regular expression evaluator is very similar; most of the difference is therefore associated with the overhead of reflection.
 4 | 
 5 | These benchmarks were computed using `go test -bench=.` on a 2.8 GHz Intel Core i7 processor running OSX 10.10.5.
 6 | 
 7 | The first benchmark involves finding the first floating point number in a string of a few thousand characters. `go-restructure` takes around 8% longer than the standard library:
 8 | 
 9 | ```
10 | go-restructure		32428 ns/op
11 | stdlib/regexp		30060 ns/op
12 | ```
13 | 
14 | The second benchmark involves parsing a short email address. `go-restructure` takes around 
15 | 40% longer than the standard library:
16 | 
17 | ```
18 | go-restructure		1188 ns/op
19 | stdlib/regexp		844 ns/op
20 | ```
21 | 
22 | The third benchmark involves finding all python import statements in a file of around one hundred lines of python source. `go-restructure` takes around 2x longer than the standard library:
23 | 
24 | ```
25 | go-restructure		695 ns/op
26 | stdlib/regexp		337 ns/op
27 | ```
28 | 
29 | The high overhead for `go-restructure` on the last benchmark is probably due to `go-restructure` allocating a struct to hold the results of each match found by `FindAll`. In most cases this performance overhead will be a small price to pay for composable, inspectable regular expressions, particularly when it amounts to the difference between one third of a microsecond and two thirds of a microsecond. However, applications that execute a very large number of regular expressions for which performance is critical may be well advised to use the standard library `regexp` package directly.
30 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Alex Flint
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h4 align="center">Struct-based regular expressions for Go</h4>
  2 | <p align="center">
  3 |   <a href="https://pkg.go.dev/github.com/alexflint/go-restructure"><img src="https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white&style=flat-square" alt="Documentation"></a>
  4 |   <a href="https://github.com/alexflint/go-restructure/actions"><img src="https://github.com/alexflint/go-restructure/workflows/Go/badge.svg" alt="Build Status"></a>
  5 |   <a href="https://codecov.io/gh/alexflint/go-restructure"><img src="https://codecov.io/gh/alexflint/go-restructure/branch/master/graph/badge.svg" alt="Coverage Status"></a>
  6 |   <a href="https://goreportcard.com/report/github.com/alexflint/go-restructure"><img src="https://goreportcard.com/badge/github.com/alexflint/go-restructure" alt="Go Report Card"></a>
  7 | </p>
  8 | <br>
  9 | 
 10 | ## Match regular expressions into struct fields
 11 | 
 12 | ```shell
 13 | go get github.com/alexflint/go-restructure
 14 | ```
 15 | 
 16 | This package allows you to express regular expressions by defining a struct, and then capture matched sub-expressions into struct fields. Here is a very simple email address parser:
 17 | 
 18 | ```go
 19 | import "github.com/alexflint/go-restructure"
 20 | 
 21 | type EmailAddress struct {
 22 | 	_    struct{} `^`
 23 | 	User string   `\w+`
 24 | 	_    struct{} `@`
 25 | 	Host string   `[^@]+`
 26 | 	_    struct{} `$`
 27 | }
 28 | 
 29 | func main() {
 30 | 	var addr EmailAddress
 31 | 	restructure.Find(&addr, "joe@example.com")
 32 | 	fmt.Println(addr.User) // prints "joe"
 33 | 	fmt.Println(addr.Host) // prints "example.com"
 34 | }
 35 | ```
 36 | (Note that the above is far too simplistic to be used as a serious email address validator.)
 37 | 
 38 | The regular expression that was executed was the concatenation of the struct tags:
 39 | 
 40 | ```
 41 | ^(\w+)@([^@]+)$
 42 | ```
 43 | 
 44 | The first submatch was inserted into the `User` field and the second into the `Host` field.
 45 | 
 46 | You may also use the `regexp:` tag key, but keep in mind that you must escape quotes and backslashes:
 47 | 
 48 | ```go
 49 | type EmailAddress struct {
 50 | 	_    string `regexp:"^"`
 51 | 	User string `regexp:"\\w+"`
 52 | 	_    string `regexp:"@"`
 53 | 	Host string `regexp:"[^@]+"`
 54 | 	_    string `regexp:"$"`
 55 | }
 56 | ```
 57 | 
 58 | ### Nested Structs
 59 | 
 60 | Here is a slightly more sophisticated email address parser that uses nested structs:
 61 | 
 62 | ```go
 63 | type Hostname struct {
 64 | 	Domain string   `\w+`
 65 | 	_      struct{} `\.`
 66 | 	TLD    string   `\w+`
 67 | }
 68 | 
 69 | type EmailAddress struct {
 70 | 	_    struct{} `^`
 71 | 	User string   `[a-zA-Z0-9._%+-]+`
 72 | 	_    struct{} `@`
 73 | 	Host *Hostname
 74 | 	_    struct{} `$`
 75 | }
 76 | 
 77 | func main() {
 78 | 	var addr EmailAddress
 79 | 	success, _ := restructure.Find(&addr, "joe@example.com")
 80 | 	if success {
 81 | 		fmt.Println(addr.User)        // prints "joe"
 82 | 		fmt.Println(addr.Host.Domain) // prints "example"
 83 | 		fmt.Println(addr.Host.TLD)    // prints "com"
 84 | 	}
 85 | }
 86 | ```
 87 | 
 88 | Compare this to using the standard library `regexp.FindStringSubmatchIndex` directly:
 89 | 
 90 | ```go
 91 | func main() {
 92 | 	content := "joe@example.com"
 93 | 	expr := regexp.MustCompile(`^([a-zA-Z0-9._%+-]+)@((\w+)\.(\w+))$`)
 94 | 	indices := expr.FindStringSubmatchIndex(content)
 95 | 	if len(indices) > 0 {
 96 | 		userBegin, userEnd := indices[2], indices[3]
 97 | 		var user string
 98 | 		if userBegin != -1 && userEnd != -1 {
 99 | 			user = content[userBegin:userEnd]
100 | 		}
101 | 
102 | 		domainBegin, domainEnd := indices[6], indices[7]
103 | 		var domain string
104 | 		if domainBegin != -1 && domainEnd != -1 {
105 | 			domain = content[domainBegin:domainEnd]
106 | 		}
107 | 
108 | 		tldBegin, tldEnd := indices[8], indices[9]
109 | 		var tld string
110 | 		if tldBegin != -1 && tldEnd != -1 {
111 | 			tld = content[tldBegin:tldEnd]
112 | 		}
113 | 
114 | 		fmt.Println(user)   // prints "joe"
115 | 		fmt.Println(domain) // prints "example"
116 | 		fmt.Println(tld)    // prints "com"
117 | 	}
118 | }
119 | ```
120 | 
121 | ### Ints
122 | 
123 | It is also possible to set struct fields as `int` to get the string automatically converted.
124 | 
125 | ```go
126 | // Matches "12 wombats" and "1 wombat", and stores the number as an int
127 | type Wisdom struct {
128 | 	Number   int       `^\d+`
129 | 	_   	 string    `\s+`
130 | 	Animal   string    `\w+`
131 | }
132 | ```
133 | 
134 | ### Optional fields
135 | 
136 | When nesting one struct within another, you can make the nested struct optional by marking it with `?`. The following example parses floating point numbers with optional sign and exponent:
137 | 
138 | ```go
139 | // Matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123"
140 | type Float struct {
141 | 	Sign     *Sign     `?`      // sign is optional
142 | 	Whole    string    `[0-9]*`
143 | 	Period   struct{}  `\.?`
144 | 	Frac     string    `[0-9]+`
145 | 	Exponent *Exponent `?`      // exponent is optional
146 | }
147 | 
148 | // Matches "e+4", "E6", "e-03"
149 | type Exponent struct {
150 | 	_    struct{} `[eE]`
151 | 	Sign *Sign    `?`         // sign is optional
152 | 	Num  string   `[0-9]+`
153 | }
154 | 
155 | // Matches "+" or "-"
156 | type Sign struct {
157 | 	Ch string `[+-]`
158 | }
159 | ```
160 | 
161 | When an optional sub-struct is not matched, it will be set to nil:
162 | 
163 | ```javascript
164 | "1.23" -> {
165 |   "Sign": nil,
166 |   "Whole": "1",
167 |   "Frac": "23",
168 |   "Exponent": nil
169 | }
170 | 
171 | "1.23e+45" -> {
172 |   "Sign": nil,
173 |   "Whole": "1",
174 |   "Frac": "23",
175 |   "Exponent": {
176 |     "Sign": {
177 |       "Ch": "+"
178 |     },
179 |     "Num": "45"
180 |   }
181 | }
182 | ```
183 | 
184 | ### Finding multiple matches
185 | 
186 | The following example uses `Regexp.FindAll` to extract all floating point numbers from
187 | a string, using the same `Float` struct as in the example above.
188 | 
189 | ```go
190 | src := "There are 10.4 cats for every 100 dogs in the United States."
191 | floatRegexp := restructure.MustCompile(Float{}, restructure.Options{})
192 | var floats []Float
193 | floatRegexp.FindAll(&floats, src, -1)
194 | ```
195 | 
196 | To limit the number of matches set the third parameter to a positive number.
197 | 
198 | ### Getting begin and end positions for submatches
199 | 
200 | To get the begin and end position of submatches, use the `restructure.Submatch` struct in place of `string`.
201 | 
202 | Here is an example of matching python imports such as `import foo as bar`:
203 | 
204 | ```go
205 | type Import struct {
206 | 	_       struct{}             `^import\s+`
207 | 	Package restructure.Submatch `\w+`
208 | 	_       struct{}             `\s+as\s+`
209 | 	Alias   restructure.Submatch `\w+`
210 | }
211 | 
212 | var importRegexp = restructure.MustCompile(Import{}, restructure.Options{})
213 | 
214 | func main() {
215 | 	var imp Import
216 | 	importRegexp.Find(&imp, "import foo as bar")
217 | 	fmt.Printf("IMPORT %s (bytes %d...%d)\n", imp.Package.String(), imp.Package.Begin, imp.Package.End)
218 | 	fmt.Printf("    AS %s (bytes %d...%d)\n", imp.Alias.String(), imp.Alias.Begin, imp.Alias.End)
219 | }
220 | ```
221 | Output:
222 | ```
223 | IMPORT foo (bytes 7...10)
224 |     AS bar (bytes 14...17)
225 | ```
226 | 
227 | ### Regular expressions inside JSON
228 | 
229 | To run a regular expression as part of a json unmarshal, just implement the `json.Unmarshaler` interface. Here is an example that parses the following JSON string containing a quaternion:
230 | 
231 | ```javascript
232 | {
233 | 	"Name": "foo",
234 | 	"Value": "1+2i+3j+4k"
235 | }
236 | ```
237 | 
238 | First we define the expressions for matching quaternions in the form `1+2i+3j+4k`:
239 | 
240 | ```go
241 | // Matches "1", "-12", "+12"
242 | type RealPart struct {
243 | 	Sign string `regexp:"[+-]?"`
244 | 	Real string `regexp:"[0-9]+"`
245 | }
246 | 
247 | // Matches "+123", "-1"
248 | type SignedInt struct {
249 | 	Sign string `regexp:"[+-]"`
250 | 	Real string `regexp:"[0-9]+"`
251 | }
252 | 
253 | // Matches "+12i", "-123i"
254 | type IPart struct {
255 | 	Magnitude SignedInt
256 | 	_         struct{} `regexp:"i"`
257 | }
258 | 
259 | // Matches "+12j", "-123j"
260 | type JPart struct {
261 | 	Magnitude SignedInt
262 | 	_         struct{} `regexp:"j"`
263 | }
264 | 
265 | // Matches "+12k", "-123k"
266 | type KPart struct {
267 | 	Magnitude SignedInt
268 | 	_         struct{} `regexp:"k"`
269 | }
270 | 
271 | // matches "1+2i+3j+4k", "-1+2k", "-1", etc
272 | type Quaternion struct {
273 | 	Real *RealPart
274 | 	I    *IPart `regexp:"?"`
275 | 	J    *JPart `regexp:"?"`
276 | 	K    *KPart `regexp:"?"`
277 | }
278 | 
279 | // matches the quoted strings `"-1+2i"`, `"3-4i"`, `"12+34i"`, etc
280 | type QuotedQuaternion struct {
281 | 	_          struct{} `regexp:"^"`
282 | 	_          struct{} `regexp:"\""`
283 | 	Quaternion *Quaternion
284 | 	_          struct{} `regexp:"\""`
285 | 	_          struct{} `regexp:"$"`
286 | }
287 | ```
288 | 
289 | Next we implement `UnmarshalJSON` for the `QuotedQuaternion` type:
290 | ```go
291 | var quaternionRegexp = restructure.MustCompile(QuotedQuaternion{}, restructure.Options{})
292 | 
293 | func (c *QuotedQuaternion) UnmarshalJSON(b []byte) error {
294 | 	if !quaternionRegexp.Find(c, string(b)) {
295 | 		return fmt.Errorf("%s is not a quaternion", string(b))
296 | 	}
297 | 	return nil
298 | }
299 | 
300 | ```
301 | 
302 | Now we can define a struct and unmarshal JSON into it:
303 | ```go
304 | type Var struct {
305 | 	Name  string
306 | 	Value *QuotedQuaternion
307 | }
308 | 
309 | func main() {
310 | 	src := `{"name": "foo", "value": "1+2i+3j+4k"}`
311 | 	var v Var
312 | 	json.Unmarshal([]byte(src), &v)
313 | }
314 | ```
315 | The result is:
316 | ```javascript
317 | {
318 |   "Name": "foo",
319 |   "Value": {
320 |     "Quaternion": {
321 |       "Real": {
322 |         "Sign": "",
323 |         "Real": "1"
324 |       },
325 |       "I": {
326 |         "Magnitude": {
327 |           "Sign": "+",
328 |           "Real": "2"
329 |         }
330 |       },
331 |       "J": {
332 |         "Magnitude": {
333 |           "Sign": "+",
334 |           "Real": "3"
335 |         }
336 |       },
337 |       "K": {
338 |         "Magnitude": {
339 |           "Sign": "+",
340 |           "Real": "4"
341 |         }
342 |       }
343 |     }
344 |   }
345 | }
346 | ```
347 | 
348 | ### Index of examples
349 | 
350 | - [Parse an email address](samples/simple-email/simple-email.go)
351 | - [Parse an email address using nested structs](samples/email-address/email-address.go)
352 | - [Parse a floating point number](samples/floating-point/floating-point.go)
353 | - [Find all floats in a string](samples/find-all-floats/find-all-floats.go)
354 | - [Parse a dotted name](samples/name-dot-name/name-dot-name.go)
355 | - [Parse a python import statement](samples/python-import/python-import.go)
356 | - [Regular expression inside a JSON struct](samples/quaternion-in-json/quaternion-in-json.go)
357 | 
358 | ### Benchmarks
359 | 
360 | See [benchmarks document](BENCHMARKS.md)
361 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | ## TODO
2 | - repeated subexpressions
3 | - optional terminal matches (look at top node in AST)
4 | - remove OpCaptures from terminals
5 | 


--------------------------------------------------------------------------------
/benchmark_test.go:
--------------------------------------------------------------------------------
  1 | package restructure
  2 | 
  3 | import (
  4 | 	"io/ioutil"
  5 | 	"os"
  6 | 	"regexp"
  7 | 	"testing"
  8 | )
  9 | 
 10 | var src = `
 11 | The US economy went through an economic downturn following the financial 
 12 | crisis of 2007–08, with output as late as 2013 still below potential
 13 | according to the Congressional Budget Office.[57] The economy, however,
 14 | began to recover in the second half of 2009, and as of November 2015,
 15 | unemployment had declined from a high of 10% to 5%; the government's
 16 | broader U-6 unemployment rate, which includes the part-time underemployed,
 17 | was 9.8% (it had reached 16% in 2009).[13] At 11.3%, the U.S. has one of
 18 | the lowest labor union participation rates in the OECD.[58] Households
 19 | living on less than $2 per day before government benefits, doubled from
 20 | 1996 levels to 1.5 million households in 2011, including 2.8 million
 21 | children.[59] The gap in income between rich and poor is greater in the
 22 | United States than in any other developed country.[60] Total public and
 23 | private debt was $50 trillion at the end of the first quarter of 2010,
 24 | or 3.5 times GDP.[61] In December 2014, public debt was slightly more
 25 | than 100% of GDP.[62] Domestic financial assets totaled $131 trillion
 26 | and domestic financial liabilities totaled $106 trillion.[63]
 27 | `
 28 | 
 29 | // Matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123"
 30 | type Float struct {
 31 | 	Sign     *Sign     `?`
 32 | 	Whole    string    `[0-9]*`
 33 | 	Period   struct{}  `\.?`
 34 | 	Frac     string    `[0-9]+`
 35 | 	Exponent *Exponent `?`
 36 | }
 37 | 
 38 | // Matches "+" or "-"
 39 | type Sign struct {
 40 | 	Ch string `[+-]`
 41 | }
 42 | 
 43 | // Matches "e+4", "E6", "e-03"
 44 | type Exponent struct {
 45 | 	_    struct{} `[eE]`
 46 | 	Sign *Sign    `?`
 47 | 	Num  string   `[0-9]+`
 48 | }
 49 | 
 50 | func BenchmarkFindFloat(b *testing.B) {
 51 | 	pattern := MustCompile(Float{}, Options{})
 52 | 	var f Float
 53 | 	b.ResetTimer()
 54 | 	for i := 0; i < b.N; i++ {
 55 | 		pattern.Find(&f, src)
 56 | 	}
 57 | }
 58 | 
 59 | func BenchmarkFindFloatStdlib(b *testing.B) {
 60 | 	pattern := regexp.MustCompile(`((?P<Sign>((?P<Ch>[\+\-]))?)(?P<Whole>[0-9]*)(?P<Period>\.?)(?P<Frac>[0-9]+)(?P<Exponent>((?i:E)(?P<Sign>((?P<Ch>[\+\-]))?)(?P<Num>[0-9]+))?))`)
 61 | 	b.ResetTimer()
 62 | 	for i := 0; i < b.N; i++ {
 63 | 		pattern.FindSubmatch([]byte(src))
 64 | 	}
 65 | }
 66 | 
// EmailAddress matches an entire string of the form "user@host". The blank
// fields contribute the anchors and the "@" separator to the pattern; only
// User and Host receive matched text.
type EmailAddress struct {
	_    struct{} `^`
	User string   `[a-zA-Z0-9._%+-]+`
	_    struct{} `@`
	Host string   `.+`
	_    struct{} `$`
}
 74 | 
 75 | func BenchmarkParseEmail(b *testing.B) {
 76 | 	var addr EmailAddress
 77 | 	pattern := MustCompile(EmailAddress{}, Options{})
 78 | 	b.ResetTimer()
 79 | 	for i := 0; i < b.N; i++ {
 80 | 		pattern.Find(&addr, "joe@example.com")
 81 | 	}
 82 | }
 83 | 
 84 | func BenchmarkParseEmailStdlib(b *testing.B) {
 85 | 	//pattern := regexp.MustCompile(`(\A(?P<User>[%\+\--\.0-9A-Z_a-z]+)@(?P<Host>((?P<Domain>[0-9A-Z_a-z]+)\.(?P<TLD>[0-9A-Z_a-z]+)))(?-m:$))`)
 86 | 	pattern := regexp.MustCompile(`(\A(?P<User>[%\+\--\.0-9A-Z_a-z]+)@(?P<Host>.+)(?-m:$))`)
 87 | 	b.ResetTimer()
 88 | 	for i := 0; i < b.N; i++ {
 89 | 		pattern.FindStringSubmatch("joe@example.com")
 90 | 	}
 91 | }
 92 | 
 93 | // Import matches "import foo" and "import foo as bar"
 94 | type Import struct {
 95 | 	_       struct{} `^import\s+`
 96 | 	Package Submatch `\w+`
 97 | 	Alias   *AsName  `?`
 98 | 	_       struct{} `$`
 99 | }
100 | 
101 | // AsName matches "as xyz"
102 | type AsName struct {
103 | 	_    struct{} `\s+as\s+`
104 | 	Name Submatch `\w+`
105 | }
106 | 
107 | func BenchmarkFindAllImports(b *testing.B) {
108 | 	path := os.Getenv("TESTDATA")
109 | 	if path == "" {
110 | 		b.Skip("skipping because TESTDATA environment var was not set")
111 | 	}
112 | 	buf, err := ioutil.ReadFile(path)
113 | 	if err != nil {
114 | 		b.Error(err)
115 | 	}
116 | 	pattern := MustCompile(Import{}, Options{})
117 | 	var imports []Import
118 | 	b.ResetTimer()
119 | 	for i := 0; i < b.N; i++ {
120 | 		pattern.FindAll(&imports, string(buf), -1)
121 | 	}
122 | }
123 | 
// BenchmarkFindAllImportsStdlib is the standard-library counterpart of
// BenchmarkFindAllImports. It reads the file named by the TESTDATA
// environment variable, skipping when the variable is unset and aborting
// when the file cannot be read, then times regexp.FindAllSubmatchIndex over
// its contents.
func BenchmarkFindAllImportsStdlib(b *testing.B) {
	path := os.Getenv("TESTDATA")
	if path == "" {
		b.Skip("skipping because TESTDATA environment var was not set")
	}
	buf, err := ioutil.ReadFile(path)
	if err != nil {
		// Fatal rather than Error: there is nothing meaningful to measure
		// without the input file.
		b.Fatal(err)
	}
	pattern := regexp.MustCompile(`(\Aimport[\t-\n\f-\r ]+(?P<Package>[0-9A-Z_a-z]+)(?P<Alias>([\t-\n\f-\r ]+as[\t-\n\f-\r ]+(?P<Name>[0-9A-Z_a-z]+))?)(?-m:$))`)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		pattern.FindAllSubmatchIndex(buf, -1)
	}
}
139 | 


--------------------------------------------------------------------------------
/builder.go:
--------------------------------------------------------------------------------
  1 | package restructure
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"reflect"
  7 | 	"regexp/syntax"
  8 | 	"strings"
  9 | )
 10 | 
// A Role determines how a struct field is inflated
type Role int

// The roles a field can have. The builder picks the role from the field's
// type: struct{} gets EmptyRole, Pos gets PosRole, nested structs get
// SubstructRole, and string, int, []byte and Submatch fields get the
// corresponding scalar role. EmptyRole is also the zero value of Role.
const (
	EmptyRole Role = iota
	PosRole
	SubstructRole
	StringScalarRole
	IntScalarRole
	ByteSliceScalarRole
	SubmatchScalarRole
)
 23 | 
// A Struct describes how to inflate a match into a struct
type Struct struct {
	// capture is the index of the capture group that wraps the whole struct.
	capture int
	// fields holds one descriptor per struct field that contributed to the
	// expression; fields the builder ignored are absent.
	fields  []*Field
}
 29 | 
// A Field describes how to inflate a match into a field
type Field struct {
	capture int     // index of the capture for this field
	index   []int   // index of this field within its parent struct
	child   *Struct // descendant struct; nil for terminals
	// role selects how the captured text is stored into the field.
	role    Role
}
 37 | 
 38 | func isExported(f reflect.StructField) bool {
 39 | 	return f.PkgPath == ""
 40 | }
 41 | 
// A builder builds stencils from structs using reflection
type builder struct {
	// numCaptures counts the capture indices handed out so far; it is also
	// the next index to be assigned.
	numCaptures int
	// opts supplies the syntax flags used when parsing field patterns.
	opts        Options
}
 47 | 
 48 | func newBuilder(opts Options) *builder {
 49 | 	return &builder{
 50 | 		opts: opts,
 51 | 	}
 52 | }
 53 | 
 54 | func (b *builder) nextCaptureIndex() int {
 55 | 	k := b.numCaptures
 56 | 	b.numCaptures++
 57 | 	return k
 58 | }
 59 | 
 60 | func (b *builder) extractTag(tag reflect.StructTag) (string, error) {
 61 | 	// Allow tags that look like either `regexp:"\\w+"` or just `\w+`
 62 | 	if s := tag.Get("regexp"); s != "" {
 63 | 		return s, nil
 64 | 	} else if strings.Contains(string(tag), `regexp:"`) {
 65 | 		return "", errors.New("incorrectly escaped struct tag")
 66 | 	} else {
 67 | 		return string(tag), nil
 68 | 	}
 69 | }
 70 | 
 71 | func removeCaptures(expr *syntax.Regexp) ([]*syntax.Regexp, error) {
 72 | 	if expr.Op == syntax.OpCapture {
 73 | 		return expr.Sub, nil
 74 | 	}
 75 | 	return []*syntax.Regexp{expr}, nil
 76 | }
 77 | 
 78 | func (b *builder) terminal(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) {
 79 | 	pattern, err := b.extractTag(f.Tag)
 80 | 	if err != nil {
 81 | 		return nil, nil, fmt.Errorf("%s: %v", fullName, err)
 82 | 	}
 83 | 	if pattern == "" {
 84 | 		return nil, nil, nil
 85 | 	}
 86 | 
 87 | 	// Parse the pattern
 88 | 	expr, err := syntax.Parse(pattern, b.opts.SyntaxFlags)
 89 | 	if err != nil {
 90 | 		return nil, nil, fmt.Errorf(`%s: %v (pattern was "%s")`, fullName, err, f.Tag)
 91 | 	}
 92 | 
 93 | 	// Remove capture nodes within the AST
 94 | 	expr, err = transform(expr, removeCaptures)
 95 | 	if err != nil {
 96 | 		return nil, nil, fmt.Errorf(`failed to remove captures from "%s": %v`, pattern, err)
 97 | 	}
 98 | 
 99 | 	// Determine the kind
100 | 	t := f.Type
101 | 	if t.Kind() == reflect.Ptr {
102 | 		t = t.Elem()
103 | 	}
104 | 	var role Role
105 | 	switch t {
106 | 	case emptyType:
107 | 		role = EmptyRole
108 | 	case stringType:
109 | 		role = StringScalarRole
110 | 	case intType:
111 | 		role = IntScalarRole
112 | 	case byteSliceType:
113 | 		role = ByteSliceScalarRole
114 | 	case submatchType:
115 | 		role = SubmatchScalarRole
116 | 	}
117 | 
118 | 	captureIndex := -1
119 | 	if isExported(f) {
120 | 		captureIndex = b.nextCaptureIndex()
121 | 		expr = &syntax.Regexp{
122 | 			Op:   syntax.OpCapture,
123 | 			Sub:  []*syntax.Regexp{expr},
124 | 			Name: f.Name,
125 | 			Cap:  captureIndex,
126 | 		}
127 | 	}
128 | 	field := &Field{
129 | 		index:   f.Index,
130 | 		capture: captureIndex,
131 | 		role:    role,
132 | 	}
133 | 
134 | 	return field, expr, nil
135 | }
136 | 
137 | func (b *builder) pos(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) {
138 | 	if !isExported(f) {
139 | 		return nil, nil, nil
140 | 	}
141 | 	captureIndex := b.nextCaptureIndex()
142 | 	empty := &syntax.Regexp{
143 | 		Op: syntax.OpEmptyMatch,
144 | 	}
145 | 	expr := &syntax.Regexp{
146 | 		Op:   syntax.OpCapture,
147 | 		Sub:  []*syntax.Regexp{empty},
148 | 		Name: f.Name,
149 | 		Cap:  captureIndex,
150 | 	}
151 | 	field := &Field{
152 | 		index:   f.Index,
153 | 		capture: captureIndex,
154 | 		role:    PosRole,
155 | 	}
156 | 
157 | 	return field, expr, nil
158 | }
159 | 
160 | func (b *builder) nonterminal(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) {
161 | 	opstr, err := b.extractTag(f.Tag)
162 | 	if err != nil {
163 | 		return nil, nil, err
164 | 	}
165 | 	child, expr, err := b.structure(f.Type)
166 | 	if err != nil {
167 | 		return nil, nil, err
168 | 	}
169 | 
170 | 	switch opstr {
171 | 	case "?":
172 | 		if f.Type.Kind() != reflect.Ptr {
173 | 			return nil, nil, fmt.Errorf(`%s is marked with "?" but is not a pointer`, fullName)
174 | 		}
175 | 		expr = &syntax.Regexp{
176 | 			Sub: []*syntax.Regexp{expr},
177 | 			Op:  syntax.OpQuest,
178 | 		}
179 | 	case "":
180 | 		// nothing to do
181 | 	default:
182 | 		return nil, nil, fmt.Errorf("invalid op \"%s\" for non-slice field on %s", opstr, fullName)
183 | 	}
184 | 
185 | 	captureIndex := b.nextCaptureIndex()
186 | 	expr = &syntax.Regexp{
187 | 		Op:   syntax.OpCapture,
188 | 		Sub:  []*syntax.Regexp{expr},
189 | 		Name: f.Name,
190 | 		Cap:  captureIndex,
191 | 	}
192 | 	field := &Field{
193 | 		index:   f.Index,
194 | 		capture: captureIndex,
195 | 		child:   child,
196 | 		role:    SubstructRole,
197 | 	}
198 | 
199 | 	return field, expr, nil
200 | }
201 | 
202 | func (b *builder) field(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) {
203 | 	if isScalar(f.Type) {
204 | 		return b.terminal(f, fullName)
205 | 	} else if isStruct(f.Type) {
206 | 		return b.nonterminal(f, fullName)
207 | 	} else if f.Type == posType {
208 | 		return b.pos(f, fullName)
209 | 	}
210 | 	return nil, nil, nil
211 | }
212 | 
213 | func (b *builder) structure(t reflect.Type) (*Struct, *syntax.Regexp, error) {
214 | 	if t.Kind() == reflect.Ptr {
215 | 		t = t.Elem()
216 | 	}
217 | 
218 | 	// Select a capture index first so that the struct comes before its fields
219 | 	captureIndex := b.nextCaptureIndex()
220 | 
221 | 	var exprs []*syntax.Regexp
222 | 	var fields []*Field
223 | 	for i := 0; i < t.NumField(); i++ {
224 | 		f := t.Field(i)
225 | 		field, expr, err := b.field(f, t.Name()+"."+f.Name)
226 | 		if err != nil {
227 | 			return nil, nil, err
228 | 		}
229 | 		if field != nil {
230 | 			exprs = append(exprs, expr)
231 | 			fields = append(fields, field)
232 | 		}
233 | 	}
234 | 
235 | 	// Wrap in a concat
236 | 	expr := &syntax.Regexp{
237 | 		Sub: exprs,
238 | 		Op:  syntax.OpConcat,
239 | 	}
240 | 
241 | 	// Wrap in a capture
242 | 	expr = &syntax.Regexp{
243 | 		Sub: []*syntax.Regexp{expr},
244 | 		Op:  syntax.OpCapture,
245 | 		Cap: captureIndex,
246 | 	}
247 | 
248 | 	st := &Struct{
249 | 		fields:  fields,
250 | 		capture: captureIndex,
251 | 	}
252 | 
253 | 	return st, expr, nil
254 | }
255 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/alexflint/go-restructure
2 | 
3 | go 1.15
4 | 
5 | require (
6 | 	github.com/stretchr/testify v1.7.0
7 | )
8 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/alexflint/go-restructure v0.0.0-20160131054339-a509d071de28 h1:p32gFVhF4WnI/qpSpZ0//GGb6BAAFLVnkd4Vowg7im8=
 2 | github.com/alexflint/go-restructure v0.0.0-20160131054339-a509d071de28/go.mod h1:8Mq15S+jJn5TWrSU0Ua7L8rFWmY06lu0UCbhJrrcGBY=
 3 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 5 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 7 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 8 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
 9 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
10 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
11 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
12 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
13 | 


--------------------------------------------------------------------------------
/inflate.go:
--------------------------------------------------------------------------------
  1 | package restructure
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"reflect"
  6 | 	"strconv"
  7 | )
  8 | 
// Reflected types that field types are compared against when building and
// inflating.
var (
	// posType is the reflected type of Pos.
	posType = reflect.TypeOf(Pos(0))

	// The types below are the scalar types; scalarTypes collects them into
	// the list that isScalar searches.
	emptyType     = reflect.TypeOf(struct{}{})
	stringType    = reflect.TypeOf("")
	intType       = reflect.TypeOf(1)
	byteSliceType = reflect.TypeOf([]byte{})
	submatchType  = reflect.TypeOf(Submatch{})
	scalarTypes   = []reflect.Type{
		emptyType,
		stringType,
		intType,
		byteSliceType,
		submatchType,
	}
)
 25 | 
 26 | // determines whether t is a scalar type or a pointer to a scalar type
 27 | func isScalar(t reflect.Type) bool {
 28 | 	if t.Kind() == reflect.Ptr {
 29 | 		t = t.Elem()
 30 | 	}
 31 | 	for _, u := range scalarTypes {
 32 | 		if t == u {
 33 | 			return true
 34 | 		}
 35 | 	}
 36 | 	return false
 37 | }
 38 | 
 39 | // determines whether t is a struct type or a pointer to a struct type
 40 | func isStruct(t reflect.Type) bool {
 41 | 	if t.Kind() == reflect.Ptr {
 42 | 		t = t.Elem()
 43 | 	}
 44 | 	return t.Kind() == reflect.Struct
 45 | }
 46 | 
 47 | // ensureAlloc replaces nil pointers with newly allocated objects
 48 | func ensureAlloc(dest reflect.Value) reflect.Value {
 49 | 	if dest.Kind() == reflect.Ptr {
 50 | 		if dest.IsNil() {
 51 | 			dest.Set(reflect.New(dest.Type().Elem()))
 52 | 		}
 53 | 		return dest.Elem()
 54 | 	}
 55 | 	return dest
 56 | }
 57 | 
 58 | // inflate the results of a match into a string
 59 | func inflateScalar(dest reflect.Value, match *match, captureIndex int, role Role) error {
 60 | 	if captureIndex == -1 {
 61 | 		// This means the field generated a regex but we did not want the results
 62 | 		return nil
 63 | 	}
 64 | 
 65 | 	// Get the subcapture for this field
 66 | 	subcapture := match.captures[captureIndex]
 67 | 	if !subcapture.wasMatched() {
 68 | 		// This means the subcapture was optional and was not matched
 69 | 		return nil
 70 | 	}
 71 | 
 72 | 	// Get the matched bytes
 73 | 	buf := match.input[subcapture.begin:subcapture.end]
 74 | 
 75 | 	// If dest is a nil pointer then allocate a new instance and assign the pointer to dest
 76 | 	dest = ensureAlloc(dest)
 77 | 
 78 | 	// Deal with each recognized type
 79 | 	switch role {
 80 | 	case StringScalarRole:
 81 | 		dest.SetString(string(buf))
 82 | 		return nil
 83 | 	case IntScalarRole:
 84 | 		if intVal, err := strconv.Atoi(string(buf)); err != nil {
 85 | 			return fmt.Errorf("unable to capture into %s", dest.Type().String())
 86 | 		} else {
 87 | 			dest.SetInt(int64(intVal))
 88 | 			return nil
 89 | 		}
 90 | 	case ByteSliceScalarRole:
 91 | 		dest.SetBytes(buf)
 92 | 		return nil
 93 | 	case SubmatchScalarRole:
 94 | 		submatch := dest.Addr().Interface().(*Submatch)
 95 | 		submatch.Begin = Pos(subcapture.begin)
 96 | 		submatch.End = Pos(subcapture.end)
 97 | 		submatch.Bytes = buf
 98 | 		return nil
 99 | 	}
100 | 	return fmt.Errorf("unable to capture into %s", dest.Type().String())
101 | }
102 | 
103 | // inflate the position of a match into a Pos
104 | func inflatePos(dest reflect.Value, match *match, captureIndex int) error {
105 | 	if captureIndex == -1 {
106 | 		// This means the field generated a regex but we did not want the results
107 | 		return nil
108 | 	}
109 | 
110 | 	// Get the subcapture for this field
111 | 	subcapture := match.captures[captureIndex]
112 | 	if !subcapture.wasMatched() {
113 | 		// This means the subcapture was optional and was not matched
114 | 		return nil
115 | 	}
116 | 
117 | 	// If dest is a nil pointer then allocate a new instance and assign the pointer to dest
118 | 	dest.SetInt(int64(subcapture.begin))
119 | 	return nil
120 | }
121 | 
122 | // inflate the results of a match into a struct
123 | func inflateStruct(dest reflect.Value, match *match, structure *Struct) error {
124 | 	// Get the subcapture for this field
125 | 	subcapture := match.captures[structure.capture]
126 | 	if !subcapture.wasMatched() {
127 | 		return nil
128 | 	}
129 | 
130 | 	// If the field is a nil pointer then allocate an instance and assign pointer to dest
131 | 	dest = ensureAlloc(dest)
132 | 
133 | 	// Inflate values into the struct fields
134 | 	for _, field := range structure.fields {
135 | 		switch field.role {
136 | 		case PosRole:
137 | 			val := dest.FieldByIndex(field.index)
138 | 			if err := inflatePos(val, match, field.capture); err != nil {
139 | 				return err
140 | 			}
141 | 		case StringScalarRole, ByteSliceScalarRole, SubmatchScalarRole, IntScalarRole:
142 | 			val := dest.FieldByIndex(field.index)
143 | 			if err := inflateScalar(val, match, field.capture, field.role); err != nil {
144 | 				return err
145 | 			}
146 | 		case SubstructRole:
147 | 			val := dest.FieldByIndex(field.index)
148 | 			if err := inflateStruct(val, match, field.child); err != nil {
149 | 				return err
150 | 			}
151 | 		}
152 | 	}
153 | 	return nil
154 | }
155 | 


--------------------------------------------------------------------------------
/regex/README.md:
--------------------------------------------------------------------------------
1 | This directory contains a slightly modified version of the Go 1.5.2 standard library `regexp` package.


--------------------------------------------------------------------------------
/regex/backtrack.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // backtrack is a regular expression search with submatch
  6 | // tracking for small regular expressions and texts. It allocates
  7 | // a bit vector with (length of input) * (length of prog) bits,
  8 | // to make sure it never explores the same (character position, instruction)
  9 | // state multiple times. This limits the search to run in time linear in
 10 | // the length of the test.
 11 | //
 12 | // backtrack is a fast replacement for the NFA code on small
 13 | // regexps when onepass cannot be used.
 14 | 
 15 | package regex
 16 | 
 17 | import "regexp/syntax"
 18 | 
// A job is an entry on the backtracker's job stack. It holds
// the instruction pc and the position in the input, plus an arg
// that distinguishes a first visit from the continuation of an
// earlier one.
type job struct {
	pc  uint32 // index into prog.Inst of the instruction to run
	arg int    // 0 on a first visit; nonzero when resuming a previous visit
	pos int    // input position; for a resumed InstCapture job, the saved capture value to restore
}
 26 | 
// Size limits for the backtracker.
const (
	visitedBits        = 32         // bits per element of bitState.visited
	maxBacktrackProg   = 500        // len(prog.Inst) <= max
	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
)
 32 | 
// bitState holds state for the backtracker.
type bitState struct {
	prog *syntax.Prog // program being executed

	end     int      // end position in the input, set by reset
	cap     []int    // capture registers for the search in progress; all -1 after reset
	input   input    // NOTE(review): not referenced by the backtracker code in this file
	jobs    []job    // stack of pending jobs
	visited []uint32 // bit set of (pc, pos) pairs already explored, see shouldVisit
}
 43 | 
// notBacktrack is the nil *bitState that newBitState returns when the
// program is too large for the backtracker.
var notBacktrack *bitState = nil
 45 | 
 46 | // maxBitStateLen returns the maximum length of a string to search with
 47 | // the backtracker using prog.
 48 | func maxBitStateLen(prog *syntax.Prog) int {
 49 | 	if !shouldBacktrack(prog) {
 50 | 		return 0
 51 | 	}
 52 | 	return maxBacktrackVector / len(prog.Inst)
 53 | }
 54 | 
 55 | // newBitState returns a new bitState for the given prog,
 56 | // or notBacktrack if the size of the prog exceeds the maximum size that
 57 | // the backtracker will be run for.
 58 | func newBitState(prog *syntax.Prog) *bitState {
 59 | 	if !shouldBacktrack(prog) {
 60 | 		return notBacktrack
 61 | 	}
 62 | 	return &bitState{
 63 | 		prog: prog,
 64 | 	}
 65 | }
 66 | 
 67 | // shouldBacktrack reports whether the program is too
 68 | // long for the backtracker to run.
 69 | func shouldBacktrack(prog *syntax.Prog) bool {
 70 | 	return len(prog.Inst) <= maxBacktrackProg
 71 | }
 72 | 
 73 | // reset resets the state of the backtracker.
 74 | // end is the end position in the input.
 75 | // ncap is the number of captures.
 76 | func (b *bitState) reset(end int, ncap int) {
 77 | 	b.end = end
 78 | 
 79 | 	if cap(b.jobs) == 0 {
 80 | 		b.jobs = make([]job, 0, 256)
 81 | 	} else {
 82 | 		b.jobs = b.jobs[:0]
 83 | 	}
 84 | 
 85 | 	visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
 86 | 	if cap(b.visited) < visitedSize {
 87 | 		b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
 88 | 	} else {
 89 | 		b.visited = b.visited[:visitedSize]
 90 | 		for i := range b.visited {
 91 | 			b.visited[i] = 0
 92 | 		}
 93 | 	}
 94 | 
 95 | 	if cap(b.cap) < ncap {
 96 | 		b.cap = make([]int, ncap)
 97 | 	} else {
 98 | 		b.cap = b.cap[:ncap]
 99 | 	}
100 | 	for i := range b.cap {
101 | 		b.cap[i] = -1
102 | 	}
103 | }
104 | 
105 | // shouldVisit reports whether the combination of (pc, pos) has not
106 | // been visited yet.
107 | func (b *bitState) shouldVisit(pc uint32, pos int) bool {
108 | 	n := uint(int(pc)*(b.end+1) + pos)
109 | 	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
110 | 		return false
111 | 	}
112 | 	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
113 | 	return true
114 | }
115 | 
116 | // push pushes (pc, pos, arg) onto the job stack if it should be
117 | // visited.
118 | func (b *bitState) push(pc uint32, pos int, arg int) {
119 | 	if b.prog.Inst[pc].Op == syntax.InstFail {
120 | 		return
121 | 	}
122 | 
123 | 	// Only check shouldVisit when arg == 0.
124 | 	// When arg > 0, we are continuing a previous visit.
125 | 	if arg == 0 && !b.shouldVisit(pc, pos) {
126 | 		return
127 | 	}
128 | 
129 | 	b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
130 | }
131 | 
// tryBacktrack runs a backtracking search starting at pos.
//
// It seeds b's job stack with (pc, pos) and then pops and executes jobs
// until the stack is empty or a match ends the search early. It resets
// m.matched on entry and returns its final value. When a match is
// recorded, the capture registers in b.cap are copied into m.matchcap.
// i supplies the input runes; b must already have been reset for this
// input.
func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
	longest := m.re.longest
	m.matched = false

	b.push(pc, pos, 0)
	for len(b.jobs) > 0 {
		l := len(b.jobs) - 1
		// Pop job off the stack.
		pc := b.jobs[l].pc
		pos := b.jobs[l].pos
		arg := b.jobs[l].arg
		b.jobs = b.jobs[:l]

		// Optimization: rather than push and pop,
		// code that is going to push and continue
		// the loop simply updates pc, pos, and arg
		// and jumps to CheckAndLoop.  We have to
		// do the shouldVisit check that push
		// would have, but we avoid the stack
		// manipulation.
		// A freshly popped job was already checked by push, so it
		// jumps straight past the check.
		goto Skip
	CheckAndLoop:
		if !b.shouldVisit(pc, pos) {
			continue
		}
	Skip:

		inst := b.prog.Inst[pc]

		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstFail:
			// push never enqueues InstFail.
			panic("unexpected InstFail")
		case syntax.InstAlt:
			// Cannot just
			//   b.push(inst.Out, pos, 0)
			//   b.push(inst.Arg, pos, 0)
			// If during the processing of inst.Out, we encounter
			// inst.Arg via another path, we want to process it then.
			// Pushing it here will inhibit that. Instead, re-push
			// inst with arg==1 as a reminder to push inst.Arg out
			// later.
			switch arg {
			case 0:
				b.push(pc, pos, 1)
				pc = inst.Out
				goto CheckAndLoop
			case 1:
				// Finished inst.Out; try inst.Arg.
				arg = 0
				pc = inst.Arg
				goto CheckAndLoop
			}
			panic("bad arg in InstAlt")

		case syntax.InstAltMatch:
			// One opcode consumes runes; the other leads to match.
			switch b.prog.Inst[inst.Out].Op {
			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
				// inst.Arg is the match.
				b.push(inst.Arg, pos, 0)
				pc = inst.Arg
				pos = b.end
				goto CheckAndLoop
			}
			// inst.Out is the match - non-greedy
			b.push(inst.Out, b.end, 0)
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRune:
			r, width := i.step(pos)
			if !inst.MatchRune(r) {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRune1:
			r, width := i.step(pos)
			if r != inst.Rune[0] {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRuneAnyNotNL:
			r, width := i.step(pos)
			if r == '\n' || r == endOfText {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRuneAny:
			r, width := i.step(pos)
			if r == endOfText {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstCapture:
			switch arg {
			case 0:
				// Registers beyond len(b.cap) were not requested by the
				// caller and are not tracked.
				if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
					// Capture pos to register, but save old value.
					// The old value travels in the job's pos field.
					b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
					b.cap[inst.Arg] = pos
				}
				pc = inst.Out
				goto CheckAndLoop
			case 1:
				// Finished inst.Out; restore the old value.
				b.cap[inst.Arg] = pos
				continue

			}
			panic("bad arg in InstCapture")
			// NOTE(review): the continue below is unreachable because it
			// follows a panic.
			continue

		case syntax.InstEmptyWidth:
			// Fail unless every assertion bit required by the instruction
			// holds at pos.
			if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
				continue
			}
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstNop:
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstMatch:
			// We found a match. If the caller doesn't care
			// where the match is, no point going further.
			if len(b.cap) == 0 {
				m.matched = true
				return m.matched
			}

			// Record best match so far.
			// Only need to check end point, because this entire
			// call is only considering one start position.
			if len(b.cap) > 1 {
				b.cap[1] = pos
			}
			if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
				copy(m.matchcap, b.cap)
			}
			m.matched = true

			// If going for first match, we're done.
			if !longest {
				return m.matched
			}

			// If we used the entire text, no longer match is possible.
			if pos == b.end {
				return m.matched
			}

			// Otherwise, continue on in hope of a longer match.
			continue
		}
		panic("unreachable")
	}

	return m.matched
}
307 | 
308 | // backtrack runs a backtracking search of prog on the input starting at pos.
309 | func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
310 | 	if !i.canCheckPrefix() {
311 | 		panic("backtrack called for a RuneReader")
312 | 	}
313 | 
314 | 	startCond := m.re.cond
315 | 	if startCond == ^syntax.EmptyOp(0) { // impossible
316 | 		return false
317 | 	}
318 | 	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
319 | 		// Anchored match, past beginning of text.
320 | 		return false
321 | 	}
322 | 
323 | 	b := m.b
324 | 	b.reset(end, ncap)
325 | 
326 | 	m.matchcap = m.matchcap[:ncap]
327 | 	for i := range m.matchcap {
328 | 		m.matchcap[i] = -1
329 | 	}
330 | 
331 | 	// Anchored search must start at the beginning of the input
332 | 	if startCond&syntax.EmptyBeginText != 0 {
333 | 		if len(b.cap) > 0 {
334 | 			b.cap[0] = pos
335 | 		}
336 | 		return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
337 | 	}
338 | 
339 | 	// Unanchored search, starting from each possible text position.
340 | 	// Notice that we have to try the empty string at the end of
341 | 	// the text, so the loop condition is pos <= end, not pos < end.
342 | 	// This looks like it's quadratic in the size of the text,
343 | 	// but we are not clearing visited between calls to TrySearch,
344 | 	// so no work is duplicated and it ends up still being linear.
345 | 	width := -1
346 | 	for ; pos <= end && width != 0; pos += width {
347 | 		if len(m.re.prefix) > 0 {
348 | 			// Match requires literal prefix; fast search for it.
349 | 			advance := i.index(m.re, pos)
350 | 			if advance < 0 {
351 | 				return false
352 | 			}
353 | 			pos += advance
354 | 		}
355 | 
356 | 		if len(b.cap) > 0 {
357 | 			b.cap[0] = pos
358 | 		}
359 | 		if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
360 | 			// Match must be leftmost; done.
361 | 			return true
362 | 		}
363 | 		_, width = i.step(pos)
364 | 	}
365 | 	return false
366 | }
367 | 


--------------------------------------------------------------------------------
/regex/machine.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regex
  6 | 
  7 | import (
  8 | 	"io"
  9 | 	"regexp/syntax"
 10 | )
 11 | 
// A queue is a 'sparse array' holding pending threads of execution.
// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
type queue struct {
	sparse []uint32 // sparse[pc] is the index in dense of the entry for pc, if any
	dense  []entry  // entries in insertion order
}
 18 | 
// An entry is an entry on a queue.
// It holds both the instruction pc and the actual thread.
// Some queue entries are just place holders so that the machine
// knows it has considered that pc.  Such entries have t == nil.
type entry struct {
	pc uint32
	t  *thread
}
 27 | 
// A thread is the state of a single path through the machine:
// an instruction and a corresponding capture array.
// See http://swtch.com/~rsc/regexp/regexp2.html
type thread struct {
	inst *syntax.Inst // instruction the thread is waiting to execute
	cap  []int        // capture positions recorded along this path
}
 35 | 
// A machine holds all the state during an NFA simulation for p.
// The same machine also carries the backtracker state (b) and the
// limit (maxBitStateLen) used by doExecute to choose between the
// backtracker and the NFA.
type machine struct {
	re             *Regexp      // corresponding Regexp
	p              *syntax.Prog // compiled program
	op             *onePassProg // compiled onepass program, or notOnePass
	maxBitStateLen int          // max length of string to search with bitstate
	b              *bitState    // state for backtracker, allocated lazily
	q0, q1         queue        // two queues for runq, nextq
	pool           []*thread    // pool of available threads
	matched        bool         // whether a match was found
	matchcap       []int        // capture information for the match

	// cached inputs, to avoid allocation
	// (the newInput* methods fill one of these in and return its address)
	inputBytes  inputBytes
	inputString inputString
	inputReader inputReader
}
 53 | 
 54 | func (m *machine) newInputBytes(b []byte) input {
 55 | 	m.inputBytes.str = b
 56 | 	return &m.inputBytes
 57 | }
 58 | 
 59 | func (m *machine) newInputString(s string) input {
 60 | 	m.inputString.str = s
 61 | 	return &m.inputString
 62 | }
 63 | 
 64 | func (m *machine) newInputReader(r io.RuneReader) input {
 65 | 	m.inputReader.r = r
 66 | 	m.inputReader.atEOT = false
 67 | 	m.inputReader.pos = 0
 68 | 	return &m.inputReader
 69 | }
 70 | 
 71 | // progMachine returns a new machine running the prog p.
 72 | func progMachine(p *syntax.Prog, op *onePassProg) *machine {
 73 | 	m := &machine{p: p, op: op}
 74 | 	n := len(m.p.Inst)
 75 | 	m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
 76 | 	m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
 77 | 	ncap := p.NumCap
 78 | 	if ncap < 2 {
 79 | 		ncap = 2
 80 | 	}
 81 | 	if op == notOnePass {
 82 | 		m.maxBitStateLen = maxBitStateLen(p)
 83 | 	}
 84 | 	m.matchcap = make([]int, ncap)
 85 | 	return m
 86 | }
 87 | 
 88 | func (m *machine) init(ncap int) {
 89 | 	for _, t := range m.pool {
 90 | 		t.cap = t.cap[:ncap]
 91 | 	}
 92 | 	m.matchcap = m.matchcap[:ncap]
 93 | }
 94 | 
 95 | // alloc allocates a new thread with the given instruction.
 96 | // It uses the free pool if possible.
 97 | func (m *machine) alloc(i *syntax.Inst) *thread {
 98 | 	var t *thread
 99 | 	if n := len(m.pool); n > 0 {
100 | 		t = m.pool[n-1]
101 | 		m.pool = m.pool[:n-1]
102 | 	} else {
103 | 		t = new(thread)
104 | 		t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
105 | 	}
106 | 	t.inst = i
107 | 	return t
108 | }
109 | 
110 | // free returns t to the free pool.
111 | func (m *machine) free(t *thread) {
112 | 	m.inputBytes.str = nil
113 | 	m.inputString.str = ""
114 | 	m.inputReader.r = nil
115 | 	m.pool = append(m.pool, t)
116 | }
117 | 
// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
//
// The loop advances one rune at a time, keeping a one-rune lookahead
// (r1, width1) so the empty-width context after the current rune can be
// computed, and swaps runq and nextq after every step.
func (m *machine) match(i input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	runq, nextq := &m.q0, &m.q1
	// r is the rune at pos, r1 the rune after it.
	r, r1 := endOfText, endOfText
	width, width1 := 0, 0
	r, width = i.step(pos)
	if r != endOfText {
		r1, width1 = i.step(pos + width)
	}
	// flag is the empty-width context at pos.
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyOpContext(-1, r)
	} else {
		flag = i.context(pos)
	}
	for {
		if len(runq.dense) == 0 {
			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
				// Anchored match, past beginning of text.
				break
			}
			if m.matched {
				// Have match; finished exploring alternatives.
				break
			}
			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
				// Match requires literal prefix; fast search for it.
				advance := i.index(m.re, pos)
				if advance < 0 {
					break
				}
				pos += advance
				r, width = i.step(pos)
				r1, width1 = i.step(pos + width)
			}
		}
		if !m.matched {
			// No match yet: start a new thread at the current position.
			if len(m.matchcap) > 0 {
				m.matchcap[0] = pos
			}
			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
		}
		flag = syntax.EmptyOpContext(r, r1)
		m.step(runq, nextq, pos, pos+width, r, flag)
		if width == 0 {
			// The step just processed had zero width, so there is nothing
			// further to advance over.
			break
		}
		if len(m.matchcap) == 0 && m.matched {
			// Found a match and not paying attention
			// to where it is, so any match will do.
			break
		}
		pos += width
		r, width = r1, width1
		if r != endOfText {
			r1, width1 = i.step(pos + width)
		}
		runq, nextq = nextq, runq
	}
	// Return any threads still queued for the next step to the pool.
	m.clear(nextq)
	return m.matched
}
190 | 
191 | // clear frees all threads on the thread queue.
192 | func (m *machine) clear(q *queue) {
193 | 	for _, d := range q.dense {
194 | 		if d.t != nil {
195 | 			// m.free(d.t)
196 | 			m.pool = append(m.pool, d.t)
197 | 		}
198 | 	}
199 | 	q.dense = q.dense[:0]
200 | }
201 | 
// step executes one step of the machine, running each of the threads
// on runq and appending new threads to nextq.
// The step processes the rune c (which may be endOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
//
// On return runq is empty. Threads that did not advance are returned to
// m.pool directly (the m.free calls are commented out, so the cached
// inputs are not cleared here).
func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) {
	longest := m.re.longest
	for j := 0; j < len(runq.dense); j++ {
		d := &runq.dense[j]
		t := d.t
		if t == nil {
			// Placeholder entry; no thread to run.
			continue
		}
		if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
			// In longest mode, drop threads that started after the
			// recorded match began.
			// m.free(t)
			m.pool = append(m.pool, t)
			continue
		}
		i := t.inst
		add := false
		switch i.Op {
		default:
			panic("bad inst")

		case syntax.InstMatch:
			if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
				t.cap[1] = pos
				copy(m.matchcap, t.cap)
			}
			if !longest {
				// First-match mode: cut off all lower-priority threads.
				for _, d := range runq.dense[j+1:] {
					if d.t != nil {
						// m.free(d.t)
						m.pool = append(m.pool, d.t)
					}
				}
				// Truncating runq ends the outer loop after this iteration.
				runq.dense = runq.dense[:0]
			}
			m.matched = true

		case syntax.InstRune:
			add = i.MatchRune(c)
		case syntax.InstRune1:
			add = c == i.Rune[0]
		case syntax.InstRuneAny:
			add = true
		case syntax.InstRuneAnyNotNL:
			add = c != '\n'
		}
		if add {
			// add returns nil when it has taken ownership of t, or t
			// itself when the thread was not needed.
			t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
		}
		if t != nil {
			// m.free(t)
			m.pool = append(m.pool, t)
		}
	}
	runq.dense = runq.dense[:0]
}
262 | 
// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond.  pos gives the current position
// in the input.
//
// cap holds the capture positions to copy into any thread that gets
// queued. t is a spare thread that may be reused; the return value is t
// if it is still unused, or nil once it has been placed on the queue.
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread {
	if pc == 0 {
		return t
	}
	// Sparse-set membership test: pc is present only if its sparse slot
	// points at a live dense entry that names pc back.
	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
		return t
	}

	// Reserve the entry before recursing so that pc is not added twice.
	j := len(q.dense)
	q.dense = q.dense[:j+1]
	d := &q.dense[j]
	d.t = nil
	d.pc = pc
	q.sparse[pc] = uint32(j)

	i := &m.p.Inst[pc]
	switch i.Op {
	default:
		panic("unhandled")
	case syntax.InstFail:
		// nothing
	case syntax.InstAlt, syntax.InstAltMatch:
		t = m.add(q, i.Out, pos, cap, cond, t)
		t = m.add(q, i.Arg, pos, cap, cond, t)
	case syntax.InstEmptyWidth:
		// Follow only if every required assertion bit is set in cond.
		if syntax.EmptyOp(i.Arg)&^cond == 0 {
			t = m.add(q, i.Out, pos, cap, cond, t)
		}
	case syntax.InstNop:
		t = m.add(q, i.Out, pos, cap, cond, t)
	case syntax.InstCapture:
		if int(i.Arg) < len(cap) {
			// Record pos in the register for the duration of the
			// recursive add, then restore the previous value.
			opos := cap[i.Arg]
			cap[i.Arg] = pos
			m.add(q, i.Out, pos, cap, cond, nil)
			cap[i.Arg] = opos
		} else {
			t = m.add(q, i.Out, pos, cap, cond, t)
		}
	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		if t == nil {
			t = m.alloc(i)
		} else {
			t.inst = i
		}
		// Skip the copy when cap already is the thread's own capture slice.
		if len(cap) > 0 && &t.cap[0] != &cap[0] {
			copy(t.cap, cap)
		}
		d.t = t
		t = nil
	}
	return t
}
320 | 
// empty is a non-nil 0-element slice,
// so doExecute can avoid an allocation
// when 0 captures are requested from a successful match.
// Being non-nil distinguishes it from the nil that doExecute
// returns when there is no match.
var empty = make([]int, 0)
325 | 
326 | // doExecute finds the leftmost match in the input and returns
327 | // the position of its subexpressions.
328 | func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int {
329 | 	m := re.get()
330 | 	var i input
331 | 	var size int
332 | 	if r != nil {
333 | 		i = m.newInputReader(r)
334 | 	} else if b != nil {
335 | 		i = m.newInputBytes(b)
336 | 		size = len(b)
337 | 	} else {
338 | 		i = m.newInputString(s)
339 | 		size = len(s)
340 | 	}
341 | 	if size < m.maxBitStateLen && r == nil {
342 | 		if m.b == nil {
343 | 			m.b = newBitState(m.p)
344 | 		}
345 | 		if !m.backtrack(i, pos, size, ncap) {
346 | 			re.put(m)
347 | 			return nil
348 | 		}
349 | 	} else {
350 | 		m.init(ncap)
351 | 		if !m.match(i, pos) {
352 | 			re.put(m)
353 | 			return nil
354 | 		}
355 | 	}
356 | 	if ncap == 0 {
357 | 		re.put(m)
358 | 		return empty // empty but not nil
359 | 	}
360 | 	cap := make([]int, len(m.matchcap))
361 | 	copy(cap, m.matchcap)
362 | 	re.put(m)
363 | 	return cap
364 | }
365 | 


--------------------------------------------------------------------------------
/regex/onepass.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regex
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"regexp/syntax"
 10 | 	"sort"
 11 | 	"unicode"
 12 | )
 13 | 
 14 | // "One-pass" regexp execution.
 15 | // Some regexps can be analyzed to determine that they never need
 16 | // backtracking: they are guaranteed to run in one pass over the string
 17 | // without bothering to save all the usual NFA state.
 18 | // Detect those and execute them more quickly.
 19 | 
// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
type onePassProg struct {
	Inst   []onePassInst // instructions, indexed by pc
	Start  int           // index of start instruction
	NumCap int           // number of InstCapture insts in re
}
 27 | 
// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
	syntax.Inst
	Next []uint32 // successor pcs, indexed by the result of MatchRunePos (see onePassNext)
}
 34 | 
 35 | // OnePassPrefix returns a literal string that all matches for the
 36 | // regexp must start with.  Complete is true if the prefix
 37 | // is the entire match. Pc is the index of the last rune instruction
 38 | // in the string. The OnePassPrefix skips over the mandatory
 39 | // EmptyBeginText
 40 | func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
 41 | 	i := &p.Inst[p.Start]
 42 | 	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
 43 | 		return "", i.Op == syntax.InstMatch, uint32(p.Start)
 44 | 	}
 45 | 	pc = i.Out
 46 | 	i = &p.Inst[pc]
 47 | 	for i.Op == syntax.InstNop {
 48 | 		pc = i.Out
 49 | 		i = &p.Inst[pc]
 50 | 	}
 51 | 	// Avoid allocation of buffer if prefix is empty.
 52 | 	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
 53 | 		return "", i.Op == syntax.InstMatch, uint32(p.Start)
 54 | 	}
 55 | 
 56 | 	// Have prefix; gather characters.
 57 | 	var buf bytes.Buffer
 58 | 	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
 59 | 		buf.WriteRune(i.Rune[0])
 60 | 		pc, i = i.Out, &p.Inst[i.Out]
 61 | 	}
 62 | 	return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
 63 | }
 64 | 
 65 | // OnePassNext selects the next actionable state of the prog, based on the input character.
 66 | // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
 67 | // One of the alternates may ultimately lead without input to end of line. If the instruction
 68 | // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
 69 | func onePassNext(i *onePassInst, r rune) uint32 {
 70 | 	next := i.MatchRunePos(r)
 71 | 	if next >= 0 {
 72 | 		return i.Next[next]
 73 | 	}
 74 | 	if i.Op == syntax.InstAltMatch {
 75 | 		return i.Out
 76 | 	}
 77 | 	return 0
 78 | }
 79 | 
 80 | func iop(i *syntax.Inst) syntax.InstOp {
 81 | 	op := i.Op
 82 | 	switch op {
 83 | 	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
 84 | 		op = syntax.InstRune
 85 | 	}
 86 | 	return op
 87 | }
 88 | 
 89 | // Sparse Array implementation is used as a queueOnePass.
 90 | type queueOnePass struct {
 91 | 	sparse          []uint32
 92 | 	dense           []uint32
 93 | 	size, nextIndex uint32
 94 | }
 95 | 
 96 | func (q *queueOnePass) empty() bool {
 97 | 	return q.nextIndex >= q.size
 98 | }
 99 | 
100 | func (q *queueOnePass) next() (n uint32) {
101 | 	n = q.dense[q.nextIndex]
102 | 	q.nextIndex++
103 | 	return
104 | }
105 | 
106 | func (q *queueOnePass) clear() {
107 | 	q.size = 0
108 | 	q.nextIndex = 0
109 | }
110 | 
111 | func (q *queueOnePass) reset() {
112 | 	q.nextIndex = 0
113 | }
114 | 
115 | func (q *queueOnePass) contains(u uint32) bool {
116 | 	if u >= uint32(len(q.sparse)) {
117 | 		return false
118 | 	}
119 | 	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
120 | }
121 | 
122 | func (q *queueOnePass) insert(u uint32) {
123 | 	if !q.contains(u) {
124 | 		q.insertNew(u)
125 | 	}
126 | }
127 | 
128 | func (q *queueOnePass) insertNew(u uint32) {
129 | 	if u >= uint32(len(q.sparse)) {
130 | 		return
131 | 	}
132 | 	q.sparse[u] = q.size
133 | 	q.dense[q.size] = u
134 | 	q.size++
135 | }
136 | 
137 | func newQueue(size int) (q *queueOnePass) {
138 | 	return &queueOnePass{
139 | 		sparse: make([]uint32, size),
140 | 		dense:  make([]uint32, size),
141 | 	}
142 | }
143 | 
// mergeFailed is the single NextIp element that mergeRuneSets returns
// when its two inputs intersect.
const mergeFailed = uint32(0xffffffff)

// Shared results returned by mergeRuneSets on failure.
var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)

// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
// NextIp array with the single element mergeFailed is returned.
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
// It panics if either input has an odd number of runes.
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	left, right := *leftRunes, *rightRunes
	if len(left)%2 != 0 || len(right)%2 != 0 {
		panic("mergeRuneSets odd length []rune")
	}

	merged := make([]rune, 0)
	next := make([]uint32, 0)

	lx, rx := 0, 0
	for lx < len(left) || rx < len(right) {
		// Take the pair with the smaller low bound; ties and an
		// exhausted right side go to the left.
		src, cursor, pc := left, &lx, leftPC
		if rx < len(right) && (lx >= len(left) || right[rx] < left[lx]) {
			src, cursor, pc = right, &rx, rightPC
		}

		// The new pair must start strictly after the last merged pair ends.
		if n := len(merged); n > 0 && src[*cursor] <= merged[n-1] {
			return noRune, noNext
		}
		merged = append(merged, src[*cursor], src[*cursor+1])
		next = append(next, pc)
		*cursor += 2
	}
	return merged, next
}
204 | 
205 | // cleanupOnePass drops working memory, and restores certain shortcut instructions.
206 | func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
207 | 	for ix, instOriginal := range original.Inst {
208 | 		switch instOriginal.Op {
209 | 		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
210 | 		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
211 | 			prog.Inst[ix].Next = nil
212 | 		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
213 | 			prog.Inst[ix].Next = nil
214 | 			prog.Inst[ix] = onePassInst{Inst: instOriginal}
215 | 		}
216 | 	}
217 | }
218 | 
219 | // onePassCopy creates a copy of the original Prog, as we'll be modifying it
220 | func onePassCopy(prog *syntax.Prog) *onePassProg {
221 | 	p := &onePassProg{
222 | 		Start:  prog.Start,
223 | 		NumCap: prog.NumCap,
224 | 	}
225 | 	for _, inst := range prog.Inst {
226 | 		p.Inst = append(p.Inst, onePassInst{Inst: inst})
227 | 	}
228 | 
229 | 	// rewrites one or more common Prog constructs that enable some otherwise
230 | 	// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
231 | 	// ip A, that points to ips B & C.
232 | 	// A:BC + B:DA => A:BC + B:CD
233 | 	// A:BC + B:DC => A:DC + B:DC
234 | 	for pc := range p.Inst {
235 | 		switch p.Inst[pc].Op {
236 | 		default:
237 | 			continue
238 | 		case syntax.InstAlt, syntax.InstAltMatch:
239 | 			// A:Bx + B:Ay
240 | 			p_A_Other := &p.Inst[pc].Out
241 | 			p_A_Alt := &p.Inst[pc].Arg
242 | 			// make sure a target is another Alt
243 | 			instAlt := p.Inst[*p_A_Alt]
244 | 			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
245 | 				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
246 | 				instAlt = p.Inst[*p_A_Alt]
247 | 				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
248 | 					continue
249 | 				}
250 | 			}
251 | 			instOther := p.Inst[*p_A_Other]
252 | 			// Analyzing both legs pointing to Alts is for another day
253 | 			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
254 | 				// too complicated
255 | 				continue
256 | 			}
257 | 			// simple empty transition loop
258 | 			// A:BC + B:DA => A:BC + B:DC
259 | 			p_B_Alt := &p.Inst[*p_A_Alt].Out
260 | 			p_B_Other := &p.Inst[*p_A_Alt].Arg
261 | 			patch := false
262 | 			if instAlt.Out == uint32(pc) {
263 | 				patch = true
264 | 			} else if instAlt.Arg == uint32(pc) {
265 | 				patch = true
266 | 				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
267 | 			}
268 | 			if patch {
269 | 				*p_B_Alt = *p_A_Other
270 | 			}
271 | 
272 | 			// empty transition to common target
273 | 			// A:BC + B:DC => A:DC + B:DC
274 | 			if *p_A_Other == *p_B_Alt {
275 | 				*p_A_Alt = *p_B_Other
276 | 			}
277 | 		}
278 | 	}
279 | 	return p
280 | }
281 | 
// runeSlice adapts []rune to sort.Interface so that the case-folded rune
// sets can be sorted in ascending order.
type runeSlice []rune

// Len returns the number of runes.
func (p runeSlice) Len() int {
	return len(p)
}

// Less reports whether the rune at i sorts before the rune at j.
func (p runeSlice) Less(i, j int) bool {
	return p[i] < p[j]
}

// Swap exchanges the runes at i and j.
func (p runeSlice) Swap(i, j int) {
	p[j], p[i] = p[i], p[j]
}

// Sort sorts p in place in ascending order.
func (p runeSlice) Sort() {
	sort.Sort(p)
}
293 | 
// anyRuneNotNL and anyRune are rune tables laid out as consecutive
// [low, high] pairs. anyRuneNotNL spans 0..unicode.MaxRune with '\n' left
// out; anyRune spans 0..unicode.MaxRune. makeOnePass copies them rather than
// using them directly.
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []rune{0, unicode.MaxRune}
296 | 
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
// to the size of the Prog.
//
// Programs with 1000 or more instructions are rejected without analysis.
// On success the same p is returned with every p.Inst[i].Rune replaced by the
// rune table computed for instruction i. Note that check edits p.Inst in
// place (Op, Out/Arg order, Next) while it runs, so p may already have been
// modified when notOnePass is returned.
func makeOnePass(p *onePassProg) *onePassProg {
	// If the machine is very long, it's not worth the time to check if we can use one pass.
	if len(p.Inst) >= 1000 {
		return notOnePass
	}

	// instQueue is the work list of instructions still to be checked.
	// visitQueue marks the instructions seen during one top-level check call.
	// onePassRunes[pc] is the rune table (low/high pairs) computed for pc.
	var (
		instQueue    = newQueue(len(p.Inst))
		visitQueue   = newQueue(len(p.Inst))
		build        func(uint32, *queueOnePass)
		check        func(uint32, map[uint32]bool) bool
		onePassRunes = make([][]rune, len(p.Inst))
	)
	// NOTE(review): build is only referenced from inside its own body; nothing
	// else in makeOnePass calls it.
	build = func(pc uint32, q *queueOnePass) {
		if q.contains(pc) {
			return
		}
		inst := p.Inst[pc]
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			q.insert(inst.Out)
			build(inst.Out, q)
			q.insert(inst.Arg)
		case syntax.InstMatch, syntax.InstFail:
		default:
			q.insert(inst.Out)
		}
	}

	// check that paths from Alt instructions are unambiguous, and rebuild the new
	// program as a onepass program
	//
	// m[pc] is set to true when pc leads to InstMatch without going through a
	// rune instruction: it is true for InstMatch itself, false for the rune
	// instructions and InstFail, and copied from the successor for
	// InstCapture, InstNop and InstEmptyWidth.
	check = func(pc uint32, m map[uint32]bool) (ok bool) {
		ok = true
		inst := &p.Inst[pc]
		// Each instruction is processed at most once per top-level check call.
		if visitQueue.contains(pc) {
			return
		}
		visitQueue.insert(pc)
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			ok = check(inst.Out, m) && check(inst.Arg, m)
			// check no-input paths to InstMatch
			matchOut := m[inst.Out]
			matchArg := m[inst.Arg]
			if matchOut && matchArg {
				ok = false
				break
			}
			// Match on empty goes in inst.Out
			if matchArg {
				inst.Out, inst.Arg = inst.Arg, inst.Out
				matchOut, matchArg = matchArg, matchOut
			}
			if matchOut {
				m[pc] = true
				inst.Op = syntax.InstAltMatch
			}

			// build a dispatch operator from the two legs of the alt.
			onePassRunes[pc], inst.Next = mergeRuneSets(
				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
			// A Next table starting with mergeFailed means the legs' rune
			// sets overlap, so the alt is ambiguous.
			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
				ok = false
				break
			}
		case syntax.InstCapture, syntax.InstNop:
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			// pass matching runes back through these no-ops.
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = []uint32{}
			// The loop runs len/2+1 times: one entry per rune pair plus one more.
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstEmptyWidth:
			// Same handling as InstCapture/InstNop above.
			ok = check(inst.Out, m)
			m[pc] = m[inst.Out]
			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		case syntax.InstMatch, syntax.InstFail:
			m[pc] = inst.Op == syntax.InstMatch
			break
		case syntax.InstRune:
			ok = check(inst.Out, m)
			m[pc] = false
			// A non-empty Next means this instruction was already rebuilt.
			if len(inst.Next) > 0 {
				break
			}
			if len(inst.Rune) == 0 {
				onePassRunes[pc] = []rune{}
				inst.Next = []uint32{inst.Out}
				break
			}
			runes := make([]rune, 0)
			// A single case-folded rune is expanded into one [r, r] pair per
			// rune in its SimpleFold orbit, then sorted.
			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune...)
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			inst.Op = syntax.InstRune
		case syntax.InstRune1:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			runes := []rune{}
			// expand case-folded runes
			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
				r0 := inst.Rune[0]
				runes = append(runes, r0, r0)
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					runes = append(runes, r1, r1)
				}
				sort.Sort(runeSlice(runes))
			} else {
				runes = append(runes, inst.Rune[0], inst.Rune[0])
			}
			onePassRunes[pc] = runes
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
			// Rewritten to the generic rune opcode; cleanupOnePass is the
			// place where the original shortcut instruction is put back.
			inst.Op = syntax.InstRune
		case syntax.InstRuneAny:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			onePassRunes[pc] = append([]rune{}, anyRune...)
			inst.Next = []uint32{inst.Out}
		case syntax.InstRuneAnyNotNL:
			ok = check(inst.Out, m)
			m[pc] = false
			if len(inst.Next) > 0 {
				break
			}
			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
			inst.Next = []uint32{}
			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
				inst.Next = append(inst.Next, inst.Out)
			}
		}
		return
	}

	// Starting at p.Start, check every instruction reachable without passing
	// through a rune instruction; rune instructions do not enqueue successors.
	instQueue.clear()
	instQueue.insert(uint32(p.Start))
	m := make(map[uint32]bool, len(p.Inst))
	for !instQueue.empty() {
		pc := instQueue.next()
		// inst is a copy taken before check runs, so the switch below sees
		// the Op/Out/Arg values from before any edit check makes to p.Inst[pc].
		inst := p.Inst[pc]
		visitQueue.clear()
		if !check(uint32(pc), m) {
			p = notOnePass
			break
		}
		switch inst.Op {
		case syntax.InstAlt, syntax.InstAltMatch:
			instQueue.insert(inst.Out)
			instQueue.insert(inst.Arg)
		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
			instQueue.insert(inst.Out)
		case syntax.InstMatch:
		case syntax.InstFail:
		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		default:
		}
	}
	// On success, publish the computed rune tables on the instructions.
	if p != notOnePass {
		for i := range p.Inst {
			p.Inst[i].Rune = onePassRunes[i]
		}
	}
	return p
}
492 | 
493 | // walk visits each Inst in the prog once, and applies the argument
494 | // function(ip, next), in pre-order.
495 | func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
496 | 	var walk1 func(uint32)
497 | 	progQueue := newQueue(len(prog.Inst))
498 | 	walk1 = func(ip uint32) {
499 | 		if progQueue.contains(ip) {
500 | 			return
501 | 		}
502 | 		progQueue.insert(ip)
503 | 		inst := prog.Inst[ip]
504 | 		switch inst.Op {
505 | 		case syntax.InstAlt, syntax.InstAltMatch:
506 | 			for _, f := range funcs {
507 | 				f(ip, inst.Out)
508 | 				f(ip, inst.Arg)
509 | 			}
510 | 			walk1(inst.Out)
511 | 			walk1(inst.Arg)
512 | 		default:
513 | 			for _, f := range funcs {
514 | 				f(ip, inst.Out)
515 | 			}
516 | 			walk1(inst.Out)
517 | 		}
518 | 	}
519 | 	walk1(uint32(prog.Start))
520 | }
521 | 
522 | // find returns the Insts that match the argument predicate function
523 | func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
524 | 	matches = []uint32{}
525 | 
526 | 	for ip := range prog.Inst {
527 | 		if f(prog, ip) {
528 | 			matches = append(matches, uint32(ip))
529 | 		}
530 | 	}
531 | 	return
532 | }
533 | 
534 | var notOnePass *onePassProg = nil
535 | 
536 | // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
537 | // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
538 | // Prog cannot be converted. For a one pass prog, the fundamental condition that must
539 | // be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
540 | func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
541 | 	if prog.Start == 0 {
542 | 		return notOnePass
543 | 	}
544 | 	// onepass regexp is anchored
545 | 	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
546 | 		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
547 | 		return notOnePass
548 | 	}
549 | 	// every instruction leading to InstMatch must be EmptyEndText
550 | 	for _, inst := range prog.Inst {
551 | 		opOut := prog.Inst[inst.Out].Op
552 | 		switch inst.Op {
553 | 		default:
554 | 			if opOut == syntax.InstMatch {
555 | 				return notOnePass
556 | 			}
557 | 		case syntax.InstAlt, syntax.InstAltMatch:
558 | 			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
559 | 				return notOnePass
560 | 			}
561 | 		case syntax.InstEmptyWidth:
562 | 			if opOut == syntax.InstMatch {
563 | 				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
564 | 					continue
565 | 				}
566 | 				return notOnePass
567 | 			}
568 | 		}
569 | 	}
570 | 	// Creates a slightly optimized copy of the original Prog
571 | 	// that cleans up some Prog idioms that block valid onepass programs
572 | 	p = onePassCopy(prog)
573 | 
574 | 	// checkAmbiguity on InstAlts, build onepass Prog if possible
575 | 	p = makeOnePass(p)
576 | 
577 | 	if p != notOnePass {
578 | 		cleanupOnePass(p, prog)
579 | 	}
580 | 	return p
581 | }
582 | 


--------------------------------------------------------------------------------
/regex/regexp.go:
--------------------------------------------------------------------------------
   1 | // Copyright 2009 The Go Authors. All rights reserved.
   2 | // Use of this source code is governed by a BSD-style
   3 | // license that can be found in the LICENSE file.
   4 | 
   5 | // Package regex is a fork of the standard library regexp package.
   6 | // It contains a few small additions that make it possible to
   7 | // interact more directly with the underlying DFA machinery.
   8 | //
   9 | // Package regexp implements regular expression search.
  10 | //
  11 | // The syntax of the regular expressions accepted is the same
  12 | // general syntax used by Perl, Python, and other languages.
  13 | // More precisely, it is the syntax accepted by RE2 and described at
  14 | // https://golang.org/s/re2syntax, except for \C.
  15 | // For an overview of the syntax, run
  16 | //   go doc regexp/syntax
  17 | //
  18 | // The regexp implementation provided by this package is
  19 | // guaranteed to run in time linear in the size of the input.
  20 | // (This is a property not guaranteed by most open source
  21 | // implementations of regular expressions.) For more information
  22 | // about this property, see
  23 | //	http://swtch.com/~rsc/regexp/regexp1.html
  24 | // or any book about automata theory.
  25 | //
  26 | // All characters are UTF-8-encoded code points.
  27 | //
  28 | // There are 16 methods of Regexp that match a regular expression and identify
  29 | // the matched text.  Their names are matched by this regular expression:
  30 | //
  31 | //	Find(All)?(String)?(Submatch)?(Index)?
  32 | //
  33 | // If 'All' is present, the routine matches successive non-overlapping
  34 | // matches of the entire expression.  Empty matches abutting a preceding
  35 | // match are ignored.  The return value is a slice containing the successive
  36 | // return values of the corresponding non-'All' routine.  These routines take
  37 | // an extra integer argument, n; if n >= 0, the function returns at most n
  38 | // matches/submatches.
  39 | //
  40 | // If 'String' is present, the argument is a string; otherwise it is a slice
  41 | // of bytes; return values are adjusted as appropriate.
  42 | //
  43 | // If 'Submatch' is present, the return value is a slice identifying the
  44 | // successive submatches of the expression. Submatches are matches of
  45 | // parenthesized subexpressions (also known as capturing groups) within the
  46 | // regular expression, numbered from left to right in order of opening
  47 | // parenthesis. Submatch 0 is the match of the entire expression, submatch 1
  48 | // the match of the first parenthesized subexpression, and so on.
  49 | //
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
// the nth submatch.  The pair for n==0 identifies the match of the entire
  53 | // expression.  If 'Index' is not present, the match is identified by the
  54 | // text of the match/submatch.  If an index is negative, it means that
  55 | // subexpression did not match any string in the input.
  56 | //
  57 | // There is also a subset of the methods that can be applied to text read
  58 | // from a RuneReader:
  59 | //
  60 | //	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
  61 | //
  62 | // This set may grow.  Note that regular expression matches may need to
  63 | // examine text beyond the text returned by a match, so the methods that
  64 | // match text from a RuneReader may read arbitrarily far into the input
  65 | // before returning.
  66 | //
  67 | // (There are a few other methods that do not match this pattern.)
  68 | //
  69 | package regex
  70 | 
  71 | import (
  72 | 	"bytes"
  73 | 	"io"
  74 | 	"regexp/syntax"
  75 | 	"strconv"
  76 | 	"strings"
  77 | 	"sync"
  78 | 	"unicode"
  79 | 	"unicode/utf8"
  80 | )
  81 | 
// debug is a package-level flag, false by default.
// NOTE(review): nothing in this part of the file reads it; presumably it
// gates diagnostic output elsewhere in the package — confirm before removing.
var debug = false
  83 | 
// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// read-only after Compile
	expr           string         // source text, as returned by String
	prog           *syntax.Prog   // compiled program
	onepass        *onePassProg   // onepass program or nil
	prefix         string         // required prefix in unanchored matches
	prefixBytes    []byte         // prefix, as a []byte
	prefixComplete bool           // prefix is the entire regexp
	prefixRune     rune           // first rune in prefix
	prefixEnd      uint32         // pc for last rune in prefix
	cond           syntax.EmptyOp // empty-width conditions required at start of match
	numSubexp      int            // value returned by NumSubexp
	subexpNames    []string       // value returned by SubexpNames
	longest        bool           // leftmost-longest matching; also set by Longest

	// cache of machines for running regexp
	mu      sync.Mutex // held by get and put while they touch machine
	machine []*machine
}
 105 | 
// String returns the source text used to compile the regular expression.
// This is the expr value recorded on the Regexp when it was built.
func (re *Regexp) String() string {
	return re.expr
}
 110 | 
// Compile parses a regular expression and returns, if successful,
// a Regexp object that can be used to match against text.
//
// When matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses the one that a backtracking search would have found first.
// This so-called leftmost-first matching is the same semantics
// that Perl, Python, and other implementations use, although this
// package implements it without the expense of backtracking.
// For POSIX leftmost-longest matching, see CompilePOSIX.
//
// The expression is parsed with the syntax.Perl flags and the resulting
// Regexp has leftmost-longest matching turned off.
func Compile(expr string) (*Regexp, error) {
	return compile(expr, syntax.Perl, false)
}
 124 | 
// CompileSyntax is like Compile but takes a syntax tree as input.
// The expression text recorded on the result (see String) is ast.String().
//
// NOTE(review): unlike Compile, which passes longest=false, this passes
// longest=true to compileSyntax, so the returned Regexp uses
// leftmost-longest matching. Confirm this difference is intended.
func CompileSyntax(ast *syntax.Regexp) (*Regexp, error) {
	return compileSyntax(ast, ast.String(), true)
}
 129 | 
// CompilePOSIX is like Compile but restricts the regular expression
// to POSIX ERE (egrep) syntax and changes the match semantics to
// leftmost-longest.
//
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
// This so-called leftmost-longest matching is the same semantics
// that early regular expression implementations used and that POSIX
// specifies.
//
// However, there can be multiple leftmost-longest matches, with different
// submatch choices, and here this package diverges from POSIX.
// Among the possible leftmost-longest matches, this package chooses
// the one that a backtracking search would have found first, while POSIX
// specifies that the match be chosen to maximize the length of the first
// subexpression, then the second, and so on from left to right.
// The POSIX rule is computationally prohibitive and not even well-defined.
// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
//
// The expression is parsed with the syntax.POSIX flags and the resulting
// Regexp has leftmost-longest matching turned on.
func CompilePOSIX(expr string) (*Regexp, error) {
	return compile(expr, syntax.POSIX, true)
}
 152 | 
// Longest makes future searches prefer the leftmost-longest match.
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
//
// It sets re.longest without taking re.mu.
// NOTE(review): presumably it must not be called while other goroutines are
// using re — confirm against the matching code.
func (re *Regexp) Longest() {
	re.longest = true
}
 160 | 
 161 | func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
 162 | 	re, err := syntax.Parse(expr, mode)
 163 | 	if err != nil {
 164 | 		return nil, err
 165 | 	}
 166 | 	return compileSyntax(re, expr, longest)
 167 | }
 168 | 
 169 | func compileSyntax(re *syntax.Regexp, expr string, longest bool) (*Regexp, error) {
 170 | 	maxCap := re.MaxCap()
 171 | 	capNames := re.CapNames()
 172 | 
 173 | 	re = re.Simplify()
 174 | 	prog, err := syntax.Compile(re)
 175 | 	if err != nil {
 176 | 		return nil, err
 177 | 	}
 178 | 	regexp := &Regexp{
 179 | 		expr:        expr,
 180 | 		prog:        prog,
 181 | 		onepass:     compileOnePass(prog),
 182 | 		numSubexp:   maxCap,
 183 | 		subexpNames: capNames,
 184 | 		cond:        prog.StartCond(),
 185 | 		longest:     longest,
 186 | 	}
 187 | 	if regexp.onepass == notOnePass {
 188 | 		regexp.prefix, regexp.prefixComplete = prog.Prefix()
 189 | 	} else {
 190 | 		regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
 191 | 	}
 192 | 	if regexp.prefix != "" {
 193 | 		// TODO(rsc): Remove this allocation by adding
 194 | 		// IndexString to package bytes.
 195 | 		regexp.prefixBytes = []byte(regexp.prefix)
 196 | 		regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
 197 | 	}
 198 | 	return regexp, nil
 199 | }
 200 | 
 201 | // get returns a machine to use for matching re.
 202 | // It uses the re's machine cache if possible, to avoid
 203 | // unnecessary allocation.
 204 | func (re *Regexp) get() *machine {
 205 | 	re.mu.Lock()
 206 | 	if n := len(re.machine); n > 0 {
 207 | 		z := re.machine[n-1]
 208 | 		re.machine = re.machine[:n-1]
 209 | 		re.mu.Unlock()
 210 | 		return z
 211 | 	}
 212 | 	re.mu.Unlock()
 213 | 	z := progMachine(re.prog, re.onepass)
 214 | 	z.re = re
 215 | 	return z
 216 | }
 217 | 
 218 | // put returns a machine to the re's machine cache.
 219 | // There is no attempt to limit the size of the cache, so it will
 220 | // grow to the maximum number of simultaneous matches
 221 | // run using re.  (The cache empties when re gets garbage collected.)
 222 | func (re *Regexp) put(z *machine) {
 223 | 	re.mu.Lock()
 224 | 	re.machine = append(re.machine, z)
 225 | 	re.mu.Unlock()
 226 | }
 227 | 
 228 | // MustCompile is like Compile but panics if the expression cannot be parsed.
 229 | // It simplifies safe initialization of global variables holding compiled regular
 230 | // expressions.
 231 | func MustCompile(str string) *Regexp {
 232 | 	regexp, error := Compile(str)
 233 | 	if error != nil {
 234 | 		panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
 235 | 	}
 236 | 	return regexp
 237 | }
 238 | 
 239 | // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
 240 | // It simplifies safe initialization of global variables holding compiled regular
 241 | // expressions.
 242 | func MustCompilePOSIX(str string) *Regexp {
 243 | 	regexp, error := CompilePOSIX(str)
 244 | 	if error != nil {
 245 | 		panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
 246 | 	}
 247 | 	return regexp
 248 | }
 249 | 
 250 | func quote(s string) string {
 251 | 	if strconv.CanBackquote(s) {
 252 | 		return "`" + s + "`"
 253 | 	}
 254 | 	return strconv.Quote(s)
 255 | }
 256 | 
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
// The value is the numSubexp field recorded when the Regexp was compiled.
func (re *Regexp) NumSubexp() int {
	return re.numSubexp
}
 261 | 
// SubexpNames returns the names of the parenthesized subexpressions
// in this Regexp.  The name for the first sub-expression is names[1],
// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
// Since the Regexp as a whole cannot be named, names[0] is always
// the empty string.  The slice should not be modified.
//
// The returned slice is the Regexp's own subexpNames field, not a copy.
func (re *Regexp) SubexpNames() []string {
	return re.subexpNames
}
 270 | 
// endOfText is the rune the input implementations' step methods return,
// together with a width of 0, when no further rune is available. The context
// methods also use it for a missing neighbouring rune.
const endOfText rune = -1
 272 | 
// input abstracts different representations of the input text. It provides
// one-character lookahead.
//
// As implemented by inputString and inputBytes:
//   - hasPrefix reports whether the text starts with re's literal prefix;
//   - index returns the offset, relative to pos, of the first occurrence of
//     re's literal prefix at or after pos, or -1 if there is none;
//   - context returns the empty-width flags for position pos, computed from
//     the runes on either side of it.
//
// inputReader cannot look ahead: its hasPrefix returns false, its index
// returns -1 and its context returns 0.
type input interface {
	step(pos int) (r rune, width int) // advance one rune
	canCheckPrefix() bool             // can we look ahead without losing info?
	hasPrefix(re *Regexp) bool
	index(re *Regexp, pos int) int
	context(pos int) syntax.EmptyOp
}
 282 | 
 283 | // inputString scans a string.
 284 | type inputString struct {
 285 | 	str string
 286 | }
 287 | 
 288 | func (i *inputString) step(pos int) (rune, int) {
 289 | 	if pos < len(i.str) {
 290 | 		c := i.str[pos]
 291 | 		if c < utf8.RuneSelf {
 292 | 			return rune(c), 1
 293 | 		}
 294 | 		return utf8.DecodeRuneInString(i.str[pos:])
 295 | 	}
 296 | 	return endOfText, 0
 297 | }
 298 | 
 299 | func (i *inputString) canCheckPrefix() bool {
 300 | 	return true
 301 | }
 302 | 
 303 | func (i *inputString) hasPrefix(re *Regexp) bool {
 304 | 	return strings.HasPrefix(i.str, re.prefix)
 305 | }
 306 | 
 307 | func (i *inputString) index(re *Regexp, pos int) int {
 308 | 	return strings.Index(i.str[pos:], re.prefix)
 309 | }
 310 | 
 311 | func (i *inputString) context(pos int) syntax.EmptyOp {
 312 | 	r1, r2 := endOfText, endOfText
 313 | 	if pos > 0 && pos <= len(i.str) {
 314 | 		r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
 315 | 	}
 316 | 	if pos < len(i.str) {
 317 | 		r2, _ = utf8.DecodeRuneInString(i.str[pos:])
 318 | 	}
 319 | 	return syntax.EmptyOpContext(r1, r2)
 320 | }
 321 | 
 322 | // inputBytes scans a byte slice.
 323 | type inputBytes struct {
 324 | 	str []byte
 325 | }
 326 | 
 327 | func (i *inputBytes) step(pos int) (rune, int) {
 328 | 	if pos < len(i.str) {
 329 | 		c := i.str[pos]
 330 | 		if c < utf8.RuneSelf {
 331 | 			return rune(c), 1
 332 | 		}
 333 | 		return utf8.DecodeRune(i.str[pos:])
 334 | 	}
 335 | 	return endOfText, 0
 336 | }
 337 | 
 338 | func (i *inputBytes) canCheckPrefix() bool {
 339 | 	return true
 340 | }
 341 | 
 342 | func (i *inputBytes) hasPrefix(re *Regexp) bool {
 343 | 	return bytes.HasPrefix(i.str, re.prefixBytes)
 344 | }
 345 | 
 346 | func (i *inputBytes) index(re *Regexp, pos int) int {
 347 | 	return bytes.Index(i.str[pos:], re.prefixBytes)
 348 | }
 349 | 
 350 | func (i *inputBytes) context(pos int) syntax.EmptyOp {
 351 | 	r1, r2 := endOfText, endOfText
 352 | 	if pos > 0 && pos <= len(i.str) {
 353 | 		r1, _ = utf8.DecodeLastRune(i.str[:pos])
 354 | 	}
 355 | 	if pos < len(i.str) {
 356 | 		r2, _ = utf8.DecodeRune(i.str[pos:])
 357 | 	}
 358 | 	return syntax.EmptyOpContext(r1, r2)
 359 | }
 360 | 
 361 | // inputReader scans a RuneReader.
 362 | type inputReader struct {
 363 | 	r     io.RuneReader
 364 | 	atEOT bool
 365 | 	pos   int
 366 | }
 367 | 
 368 | func (i *inputReader) step(pos int) (rune, int) {
 369 | 	if !i.atEOT && pos != i.pos {
 370 | 		return endOfText, 0
 371 | 
 372 | 	}
 373 | 	r, w, err := i.r.ReadRune()
 374 | 	if err != nil {
 375 | 		i.atEOT = true
 376 | 		return endOfText, 0
 377 | 	}
 378 | 	i.pos += w
 379 | 	return r, w
 380 | }
 381 | 
 382 | func (i *inputReader) canCheckPrefix() bool {
 383 | 	return false
 384 | }
 385 | 
 386 | func (i *inputReader) hasPrefix(re *Regexp) bool {
 387 | 	return false
 388 | }
 389 | 
 390 | func (i *inputReader) index(re *Regexp, pos int) int {
 391 | 	return -1
 392 | }
 393 | 
 394 | func (i *inputReader) context(pos int) syntax.EmptyOp {
 395 | 	return 0
 396 | }
 397 | 
// LiteralPrefix returns a literal string that must begin any match
// of the regular expression re.  It returns the boolean true if the
// literal string comprises the entire regular expression.
//
// Both values are the prefix and prefixComplete fields computed when the
// Regexp was compiled.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
	return re.prefix, re.prefixComplete
}
 404 | 
// MatchReader reports whether the Regexp matches the text read by the
// RuneReader.
// It runs doExecute on the reader and reports whether the result is non-nil.
func (re *Regexp) MatchReader(r io.RuneReader) bool {
	return re.doExecute(r, nil, "", 0, 0) != nil
}
 410 | 
// MatchString reports whether the Regexp matches the string s.
// It runs doExecute on s and reports whether the result is non-nil.
func (re *Regexp) MatchString(s string) bool {
	return re.doExecute(nil, nil, s, 0, 0) != nil
}
 415 | 
// Match reports whether the Regexp matches the byte slice b.
// It runs doExecute on b and reports whether the result is non-nil.
func (re *Regexp) Match(b []byte) bool {
	return re.doExecute(nil, b, "", 0, 0) != nil
}
 420 | 
 421 | // MatchReader checks whether a textual regular expression matches the text
 422 | // read by the RuneReader.  More complicated queries need to use Compile and
 423 | // the full Regexp interface.
 424 | func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
 425 | 	re, err := Compile(pattern)
 426 | 	if err != nil {
 427 | 		return false, err
 428 | 	}
 429 | 	return re.MatchReader(r), nil
 430 | }
 431 | 
 432 | // MatchString checks whether a textual regular expression
 433 | // matches a string.  More complicated queries need
 434 | // to use Compile and the full Regexp interface.
 435 | func MatchString(pattern string, s string) (matched bool, err error) {
 436 | 	re, err := Compile(pattern)
 437 | 	if err != nil {
 438 | 		return false, err
 439 | 	}
 440 | 	return re.MatchString(s), nil
 441 | }
 442 | 
 443 | // Match checks whether a textual regular expression
 444 | // matches a byte slice.  More complicated queries need
 445 | // to use Compile and the full Regexp interface.
 446 | func Match(pattern string, b []byte) (matched bool, err error) {
 447 | 	re, err := Compile(pattern)
 448 | 	if err != nil {
 449 | 		return false, err
 450 | 	}
 451 | 	return re.Match(b), nil
 452 | }
 453 | 
 454 | // ReplaceAllString returns a copy of src, replacing matches of the Regexp
 455 | // with the replacement string repl.  Inside repl, $ signs are interpreted as
 456 | // in Expand, so for instance $1 represents the text of the first submatch.
 457 | func (re *Regexp) ReplaceAllString(src, repl string) string {
 458 | 	n := 2
 459 | 	if strings.Index(repl, "$") >= 0 {
 460 | 		n = 2 * (re.numSubexp + 1)
 461 | 	}
 462 | 	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
 463 | 		return re.expand(dst, repl, nil, src, match)
 464 | 	})
 465 | 	return string(b)
 466 | }
 467 | 
 468 | // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
 469 | // with the replacement string repl.  The replacement repl is substituted directly,
 470 | // without using Expand.
 471 | func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
 472 | 	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
 473 | 		return append(dst, repl...)
 474 | 	}))
 475 | }
 476 | 
 477 | // ReplaceAllStringFunc returns a copy of src in which all matches of the
 478 | // Regexp have been replaced by the return value of function repl applied
 479 | // to the matched substring.  The replacement returned by repl is substituted
 480 | // directly, without using Expand.
 481 | func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
 482 | 	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
 483 | 		return append(dst, repl(src[match[0]:match[1]])...)
 484 | 	})
 485 | 	return string(b)
 486 | }
 487 | 
// replaceAll is the shared implementation behind the ReplaceAll* methods.
// It scans the input for successive matches and builds a new byte slice in
// which each accepted match is replaced by whatever repl appends.
//
// The input is bsrc when bsrc is non-nil and src otherwise.  nmatch is
// passed to doExecute as the number of match positions to record.  repl
// receives the output built so far and the match positions, and returns the
// extended output.
func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
	lastMatchEnd := 0 // end position of the most recent match
	searchPos := 0    // position where we next look for a match
	var buf []byte
	var endPos int
	if bsrc != nil {
		endPos = len(bsrc)
	} else {
		endPos = len(src)
	}
	// searchPos == endPos is still searched so that an empty match at the
	// very end of the input can be found.
	for searchPos <= endPos {
		a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
		if len(a) == 0 {
			break // no more matches
		}

		// Copy the unmatched characters before this match.
		if bsrc != nil {
			buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
		} else {
			buf = append(buf, src[lastMatchEnd:a[0]]...)
		}

		// Now insert a copy of the replacement string, but not for a
		// match of the empty string immediately after another match.
		// (Otherwise, we get double replacement for patterns that
		// match both empty and nonempty strings.)
		if a[1] > lastMatchEnd || a[0] == 0 {
			buf = repl(buf, a)
		}
		lastMatchEnd = a[1]

		// Advance past this match; always advance at least one character.
		var width int
		if bsrc != nil {
			_, width = utf8.DecodeRune(bsrc[searchPos:])
		} else {
			_, width = utf8.DecodeRuneInString(src[searchPos:])
		}
		if searchPos+width > a[1] {
			searchPos += width
		} else if searchPos+1 > a[1] {
			// This clause is only needed at the end of the input
			// string.  In that case, DecodeRuneInString returns width=0.
			searchPos++
		} else {
			searchPos = a[1]
		}
	}

	// Copy the unmatched characters after the last match.
	if bsrc != nil {
		buf = append(buf, bsrc[lastMatchEnd:]...)
	} else {
		buf = append(buf, src[lastMatchEnd:]...)
	}

	return buf
}
 547 | 
 548 | // ReplaceAll returns a copy of src, replacing matches of the Regexp
 549 | // with the replacement text repl.  Inside repl, $ signs are interpreted as
 550 | // in Expand, so for instance $1 represents the text of the first submatch.
 551 | func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
 552 | 	n := 2
 553 | 	if bytes.IndexByte(repl, '$') >= 0 {
 554 | 		n = 2 * (re.numSubexp + 1)
 555 | 	}
 556 | 	srepl := ""
 557 | 	b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
 558 | 		if len(srepl) != len(repl) {
 559 | 			srepl = string(repl)
 560 | 		}
 561 | 		return re.expand(dst, srepl, src, "", match)
 562 | 	})
 563 | 	return b
 564 | }
 565 | 
 566 | // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
 567 | // with the replacement bytes repl.  The replacement repl is substituted directly,
 568 | // without using Expand.
 569 | func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
 570 | 	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
 571 | 		return append(dst, repl...)
 572 | 	})
 573 | }
 574 | 
 575 | // ReplaceAllFunc returns a copy of src in which all matches of the
 576 | // Regexp have been replaced by the return value of function repl applied
 577 | // to the matched byte slice.  The replacement returned by repl is substituted
 578 | // directly, without using Expand.
 579 | func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
 580 | 	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
 581 | 		return append(dst, repl(src[match[0]:match[1]])...)
 582 | 	})
 583 | }
 584 | 
 585 | var specialBytes = []byte(`\.+*?()|[]{}^$`)
 586 | 
 587 | func special(b byte) bool {
 588 | 	return bytes.IndexByte(specialBytes, b) >= 0
 589 | }
 590 | 
 591 | // QuoteMeta returns a string that quotes all regular expression metacharacters
 592 | // inside the argument text; the returned string is a regular expression matching
 593 | // the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
 594 | func QuoteMeta(s string) string {
 595 | 	b := make([]byte, 2*len(s))
 596 | 
 597 | 	// A byte loop is correct because all metacharacters are ASCII.
 598 | 	j := 0
 599 | 	for i := 0; i < len(s); i++ {
 600 | 		if special(s[i]) {
 601 | 			b[j] = '\\'
 602 | 			j++
 603 | 		}
 604 | 		b[j] = s[i]
 605 | 		j++
 606 | 	}
 607 | 	return string(b[0:j])
 608 | }
 609 | 
 610 | // The number of capture values in the program may correspond
 611 | // to fewer capturing expressions than are in the regexp.
 612 | // For example, "(a){0}" turns into an empty program, so the
 613 | // maximum capture in the program is 0 but we need to return
 614 | // an expression for \1.  Pad appends -1s to the slice a as needed.
 615 | func (re *Regexp) pad(a []int) []int {
 616 | 	if a == nil {
 617 | 		// No match.
 618 | 		return nil
 619 | 	}
 620 | 	n := (1 + re.numSubexp) * 2
 621 | 	for len(a) < n {
 622 | 		a = append(a, -1)
 623 | 	}
 624 | 	return a
 625 | }
 626 | 
// allMatches finds successive matches and hands each accepted one to deliver.
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
//
// At most n matches are delivered.  Each delivered slice has been passed
// through re.pad first.
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
	var end int
	if b == nil {
		end = len(s)
	} else {
		end = len(b)
	}

	// pos is where the next search starts, i counts delivered matches and
	// prevMatchEnd is the end of the previous match (-1 before the first).
	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
		matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
		if len(matches) == 0 {
			break
		}

		accept := true
		if matches[1] == pos {
			// We've found an empty match.
			if matches[0] == prevMatchEnd {
				// We don't allow an empty match right
				// after a previous match, so ignore it.
				accept = false
			}
			var width int
			// TODO: use step()
			if b == nil {
				_, width = utf8.DecodeRuneInString(s[pos:end])
			} else {
				_, width = utf8.DecodeRune(b[pos:end])
			}
			// Step over one rune so the search makes progress; a zero
			// width means the input is exhausted, so move past end to
			// terminate the loop.
			if width > 0 {
				pos += width
			} else {
				pos = end + 1
			}
		} else {
			pos = matches[1]
		}
		prevMatchEnd = matches[1]

		if accept {
			deliver(re.pad(matches))
			i++
		}
	}
}
 673 | 
 674 | // Find returns a slice holding the text of the leftmost match in b of the regular expression.
 675 | // A return value of nil indicates no match.
 676 | func (re *Regexp) Find(b []byte) []byte {
 677 | 	a := re.doExecute(nil, b, "", 0, 2)
 678 | 	if a == nil {
 679 | 		return nil
 680 | 	}
 681 | 	return b[a[0]:a[1]]
 682 | }
 683 | 
 684 | // FindIndex returns a two-element slice of integers defining the location of
 685 | // the leftmost match in b of the regular expression.  The match itself is at
 686 | // b[loc[0]:loc[1]].
 687 | // A return value of nil indicates no match.
 688 | func (re *Regexp) FindIndex(b []byte) (loc []int) {
 689 | 	a := re.doExecute(nil, b, "", 0, 2)
 690 | 	if a == nil {
 691 | 		return nil
 692 | 	}
 693 | 	return a[0:2]
 694 | }
 695 | 
 696 | // FindString returns a string holding the text of the leftmost match in s of the regular
 697 | // expression.  If there is no match, the return value is an empty string,
 698 | // but it will also be empty if the regular expression successfully matches
 699 | // an empty string.  Use FindStringIndex or FindStringSubmatch if it is
 700 | // necessary to distinguish these cases.
 701 | func (re *Regexp) FindString(s string) string {
 702 | 	a := re.doExecute(nil, nil, s, 0, 2)
 703 | 	if a == nil {
 704 | 		return ""
 705 | 	}
 706 | 	return s[a[0]:a[1]]
 707 | }
 708 | 
 709 | // FindStringIndex returns a two-element slice of integers defining the
 710 | // location of the leftmost match in s of the regular expression.  The match
 711 | // itself is at s[loc[0]:loc[1]].
 712 | // A return value of nil indicates no match.
 713 | func (re *Regexp) FindStringIndex(s string) (loc []int) {
 714 | 	a := re.doExecute(nil, nil, s, 0, 2)
 715 | 	if a == nil {
 716 | 		return nil
 717 | 	}
 718 | 	return a[0:2]
 719 | }
 720 | 
 721 | // FindReaderIndex returns a two-element slice of integers defining the
 722 | // location of the leftmost match of the regular expression in text read from
 723 | // the RuneReader.  The match text was found in the input stream at
 724 | // byte offset loc[0] through loc[1]-1.
 725 | // A return value of nil indicates no match.
 726 | func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
 727 | 	a := re.doExecute(r, nil, "", 0, 2)
 728 | 	if a == nil {
 729 | 		return nil
 730 | 	}
 731 | 	return a[0:2]
 732 | }
 733 | 
 734 | // FindSubmatch returns a slice of slices holding the text of the leftmost
 735 | // match of the regular expression in b and the matches, if any, of its
 736 | // subexpressions, as defined by the 'Submatch' descriptions in the package
 737 | // comment.
 738 | // A return value of nil indicates no match.
 739 | func (re *Regexp) FindSubmatch(b []byte) [][]byte {
 740 | 	a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
 741 | 	if a == nil {
 742 | 		return nil
 743 | 	}
 744 | 	ret := make([][]byte, 1+re.numSubexp)
 745 | 	for i := range ret {
 746 | 		if 2*i < len(a) && a[2*i] >= 0 {
 747 | 			ret[i] = b[a[2*i]:a[2*i+1]]
 748 | 		}
 749 | 	}
 750 | 	return ret
 751 | }
 752 | 
 753 | // Expand appends template to dst and returns the result; during the
 754 | // append, Expand replaces variables in the template with corresponding
 755 | // matches drawn from src.  The match slice should have been returned by
 756 | // FindSubmatchIndex.
 757 | //
 758 | // In the template, a variable is denoted by a substring of the form
 759 | // $name or ${name}, where name is a non-empty sequence of letters,
 760 | // digits, and underscores.  A purely numeric name like $1 refers to
 761 | // the submatch with the corresponding index; other names refer to
 762 | // capturing parentheses named with the (?P<name>...) syntax.  A
 763 | // reference to an out of range or unmatched index or a name that is not
 764 | // present in the regular expression is replaced with an empty slice.
 765 | //
 766 | // In the $name form, name is taken to be as long as possible: $1x is
 767 | // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
 768 | //
 769 | // To insert a literal $ in the output, use $$ in the template.
 770 | func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
 771 | 	return re.expand(dst, string(template), src, "", match)
 772 | }
 773 | 
 774 | // ExpandString is like Expand but the template and source are strings.
 775 | // It appends to and returns a byte slice in order to give the calling
 776 | // code control over allocation.
 777 | func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
 778 | 	return re.expand(dst, template, nil, src, match)
 779 | }
 780 | 
 781 | func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
 782 | 	for len(template) > 0 {
 783 | 		i := strings.Index(template, "$")
 784 | 		if i < 0 {
 785 | 			break
 786 | 		}
 787 | 		dst = append(dst, template[:i]...)
 788 | 		template = template[i:]
 789 | 		if len(template) > 1 && template[1] == '$' {
 790 | 			// Treat $$ as $.
 791 | 			dst = append(dst, '$')
 792 | 			template = template[2:]
 793 | 			continue
 794 | 		}
 795 | 		name, num, rest, ok := extract(template)
 796 | 		if !ok {
 797 | 			// Malformed; treat $ as raw text.
 798 | 			dst = append(dst, '$')
 799 | 			template = template[1:]
 800 | 			continue
 801 | 		}
 802 | 		template = rest
 803 | 		if num >= 0 {
 804 | 			if 2*num+1 < len(match) && match[2*num] >= 0 {
 805 | 				if bsrc != nil {
 806 | 					dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
 807 | 				} else {
 808 | 					dst = append(dst, src[match[2*num]:match[2*num+1]]...)
 809 | 				}
 810 | 			}
 811 | 		} else {
 812 | 			for i, namei := range re.subexpNames {
 813 | 				if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
 814 | 					if bsrc != nil {
 815 | 						dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
 816 | 					} else {
 817 | 						dst = append(dst, src[match[2*i]:match[2*i+1]]...)
 818 | 					}
 819 | 					break
 820 | 				}
 821 | 			}
 822 | 		}
 823 | 	}
 824 | 	dst = append(dst, template...)
 825 | 	return dst
 826 | }
 827 | 
 828 | // extract returns the name from a leading "$name" or "${name}" in str.
 829 | // If it is a number, extract returns num set to that number; otherwise num = -1.
 830 | func extract(str string) (name string, num int, rest string, ok bool) {
 831 | 	if len(str) < 2 || str[0] != '$' {
 832 | 		return
 833 | 	}
 834 | 	brace := false
 835 | 	if str[1] == '{' {
 836 | 		brace = true
 837 | 		str = str[2:]
 838 | 	} else {
 839 | 		str = str[1:]
 840 | 	}
 841 | 	i := 0
 842 | 	for i < len(str) {
 843 | 		rune, size := utf8.DecodeRuneInString(str[i:])
 844 | 		if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
 845 | 			break
 846 | 		}
 847 | 		i += size
 848 | 	}
 849 | 	if i == 0 {
 850 | 		// empty name is not okay
 851 | 		return
 852 | 	}
 853 | 	name = str[:i]
 854 | 	if brace {
 855 | 		if i >= len(str) || str[i] != '}' {
 856 | 			// missing closing brace
 857 | 			return
 858 | 		}
 859 | 		i++
 860 | 	}
 861 | 
 862 | 	// Parse number.
 863 | 	num = 0
 864 | 	for i := 0; i < len(name); i++ {
 865 | 		if name[i] < '0' || '9' < name[i] || num >= 1e8 {
 866 | 			num = -1
 867 | 			break
 868 | 		}
 869 | 		num = num*10 + int(name[i]) - '0'
 870 | 	}
 871 | 	// Disallow leading zeros.
 872 | 	if name[0] == '0' && len(name) > 1 {
 873 | 		num = -1
 874 | 	}
 875 | 
 876 | 	rest = str[i:]
 877 | 	ok = true
 878 | 	return
 879 | }
 880 | 
 881 | // FindSubmatchIndex returns a slice holding the index pairs identifying the
 882 | // leftmost match of the regular expression in b and the matches, if any, of
 883 | // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
 884 | // in the package comment.
 885 | // A return value of nil indicates no match.
 886 | func (re *Regexp) FindSubmatchIndex(b []byte) []int {
 887 | 	return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
 888 | }
 889 | 
 890 | // FindStringSubmatch returns a slice of strings holding the text of the
 891 | // leftmost match of the regular expression in s and the matches, if any, of
 892 | // its subexpressions, as defined by the 'Submatch' description in the
 893 | // package comment.
 894 | // A return value of nil indicates no match.
 895 | func (re *Regexp) FindStringSubmatch(s string) []string {
 896 | 	a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
 897 | 	if a == nil {
 898 | 		return nil
 899 | 	}
 900 | 	ret := make([]string, 1+re.numSubexp)
 901 | 	for i := range ret {
 902 | 		if 2*i < len(a) && a[2*i] >= 0 {
 903 | 			ret[i] = s[a[2*i]:a[2*i+1]]
 904 | 		}
 905 | 	}
 906 | 	return ret
 907 | }
 908 | 
 909 | // FindStringSubmatchIndex returns a slice holding the index pairs
 910 | // identifying the leftmost match of the regular expression in s and the
 911 | // matches, if any, of its subexpressions, as defined by the 'Submatch' and
 912 | // 'Index' descriptions in the package comment.
 913 | // A return value of nil indicates no match.
 914 | func (re *Regexp) FindStringSubmatchIndex(s string) []int {
 915 | 	return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
 916 | }
 917 | 
 918 | // FindReaderSubmatchIndex returns a slice holding the index pairs
 919 | // identifying the leftmost match of the regular expression of text read by
 920 | // the RuneReader, and the matches, if any, of its subexpressions, as defined
 921 | // by the 'Submatch' and 'Index' descriptions in the package comment.  A
 922 | // return value of nil indicates no match.
 923 | func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
 924 | 	return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
 925 | }
 926 | 
// startSize is the initial capacity passed to make for the result slice in
// each of the FindAll* methods.
const startSize = 10 // The size at which to start a slice in the 'All' routines.
 928 | 
 929 | // FindAll is the 'All' version of Find; it returns a slice of all successive
 930 | // matches of the expression, as defined by the 'All' description in the
 931 | // package comment.
 932 | // A return value of nil indicates no match.
 933 | func (re *Regexp) FindAll(b []byte, n int) [][]byte {
 934 | 	if n < 0 {
 935 | 		n = len(b) + 1
 936 | 	}
 937 | 	result := make([][]byte, 0, startSize)
 938 | 	re.allMatches("", b, n, func(match []int) {
 939 | 		result = append(result, b[match[0]:match[1]])
 940 | 	})
 941 | 	if len(result) == 0 {
 942 | 		return nil
 943 | 	}
 944 | 	return result
 945 | }
 946 | 
 947 | // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
 948 | // successive matches of the expression, as defined by the 'All' description
 949 | // in the package comment.
 950 | // A return value of nil indicates no match.
 951 | func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
 952 | 	if n < 0 {
 953 | 		n = len(b) + 1
 954 | 	}
 955 | 	result := make([][]int, 0, startSize)
 956 | 	re.allMatches("", b, n, func(match []int) {
 957 | 		result = append(result, match[0:2])
 958 | 	})
 959 | 	if len(result) == 0 {
 960 | 		return nil
 961 | 	}
 962 | 	return result
 963 | }
 964 | 
 965 | // FindAllString is the 'All' version of FindString; it returns a slice of all
 966 | // successive matches of the expression, as defined by the 'All' description
 967 | // in the package comment.
 968 | // A return value of nil indicates no match.
 969 | func (re *Regexp) FindAllString(s string, n int) []string {
 970 | 	if n < 0 {
 971 | 		n = len(s) + 1
 972 | 	}
 973 | 	result := make([]string, 0, startSize)
 974 | 	re.allMatches(s, nil, n, func(match []int) {
 975 | 		result = append(result, s[match[0]:match[1]])
 976 | 	})
 977 | 	if len(result) == 0 {
 978 | 		return nil
 979 | 	}
 980 | 	return result
 981 | }
 982 | 
 983 | // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
 984 | // slice of all successive matches of the expression, as defined by the 'All'
 985 | // description in the package comment.
 986 | // A return value of nil indicates no match.
 987 | func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
 988 | 	if n < 0 {
 989 | 		n = len(s) + 1
 990 | 	}
 991 | 	result := make([][]int, 0, startSize)
 992 | 	re.allMatches(s, nil, n, func(match []int) {
 993 | 		result = append(result, match[0:2])
 994 | 	})
 995 | 	if len(result) == 0 {
 996 | 		return nil
 997 | 	}
 998 | 	return result
 999 | }
1000 | 
1001 | // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
1002 | // of all successive matches of the expression, as defined by the 'All'
1003 | // description in the package comment.
1004 | // A return value of nil indicates no match.
1005 | func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
1006 | 	if n < 0 {
1007 | 		n = len(b) + 1
1008 | 	}
1009 | 	result := make([][][]byte, 0, startSize)
1010 | 	re.allMatches("", b, n, func(match []int) {
1011 | 		slice := make([][]byte, len(match)/2)
1012 | 		for j := range slice {
1013 | 			if match[2*j] >= 0 {
1014 | 				slice[j] = b[match[2*j]:match[2*j+1]]
1015 | 			}
1016 | 		}
1017 | 		result = append(result, slice)
1018 | 	})
1019 | 	if len(result) == 0 {
1020 | 		return nil
1021 | 	}
1022 | 	return result
1023 | }
1024 | 
1025 | // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
1026 | // a slice of all successive matches of the expression, as defined by the
1027 | // 'All' description in the package comment.
1028 | // A return value of nil indicates no match.
1029 | func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
1030 | 	if n < 0 {
1031 | 		n = len(b) + 1
1032 | 	}
1033 | 	result := make([][]int, 0, startSize)
1034 | 	re.allMatches("", b, n, func(match []int) {
1035 | 		result = append(result, match)
1036 | 	})
1037 | 	if len(result) == 0 {
1038 | 		return nil
1039 | 	}
1040 | 	return result
1041 | }
1042 | 
1043 | // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1044 | // returns a slice of all successive matches of the expression, as defined by
1045 | // the 'All' description in the package comment.
1046 | // A return value of nil indicates no match.
1047 | func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1048 | 	if n < 0 {
1049 | 		n = len(s) + 1
1050 | 	}
1051 | 	result := make([][]string, 0, startSize)
1052 | 	re.allMatches(s, nil, n, func(match []int) {
1053 | 		slice := make([]string, len(match)/2)
1054 | 		for j := range slice {
1055 | 			if match[2*j] >= 0 {
1056 | 				slice[j] = s[match[2*j]:match[2*j+1]]
1057 | 			}
1058 | 		}
1059 | 		result = append(result, slice)
1060 | 	})
1061 | 	if len(result) == 0 {
1062 | 		return nil
1063 | 	}
1064 | 	return result
1065 | }
1066 | 
1067 | // FindAllStringSubmatchIndex is the 'All' version of
1068 | // FindStringSubmatchIndex; it returns a slice of all successive matches of
1069 | // the expression, as defined by the 'All' description in the package
1070 | // comment.
1071 | // A return value of nil indicates no match.
1072 | func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1073 | 	if n < 0 {
1074 | 		n = len(s) + 1
1075 | 	}
1076 | 	result := make([][]int, 0, startSize)
1077 | 	re.allMatches(s, nil, n, func(match []int) {
1078 | 		result = append(result, match)
1079 | 	})
1080 | 	if len(result) == 0 {
1081 | 		return nil
1082 | 	}
1083 | 	return result
1084 | }
1085 | 


--------------------------------------------------------------------------------
/restructure.go:
--------------------------------------------------------------------------------
  1 | package restructure
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"reflect"
  6 | 	"regexp/syntax"
  7 | 
  8 | 	"github.com/alexflint/go-restructure/regex"
  9 | )
 10 | 
// Style determines whether we are in Perl or POSIX or custom mode.
// CompileType uses it to decide which syntax flags to compile with.
type Style int

const (
	// Perl is the zero value; CompileType sets Options.SyntaxFlags to
	// syntax.Perl for it.
	Perl Style = iota
	// POSIX makes CompileType set Options.SyntaxFlags to syntax.POSIX.
	POSIX
	// CustomStyle leaves Options.SyntaxFlags as supplied by the caller.
	CustomStyle
)
 19 | 
// Options represents optional parameters for compilation. The zero value
// selects the Perl style.
type Options struct {
	Style       Style // Style can be set to Perl, POSIX, or CustomStyle
	// SyntaxFlags is overwritten by CompileType when Style is Perl or POSIX,
	// so it only takes effect for other styles such as CustomStyle.
	SyntaxFlags syntax.Flags
}
 25 | 
 26 | type subcapture struct {
 27 | 	begin, end int
 28 | }
 29 | 
 30 | func (r subcapture) wasMatched() bool {
 31 | 	return r.begin != -1 && r.end != -1
 32 | }
 33 | 
 34 | type match struct {
 35 | 	input    []byte
 36 | 	captures []subcapture
 37 | }
 38 | 
 39 | func matchFromIndices(indices []int, input []byte) *match {
 40 | 	match := &match{
 41 | 		input: input,
 42 | 	}
 43 | 	for i := 0; i < len(indices); i += 2 {
 44 | 		match.captures = append(match.captures, subcapture{indices[i], indices[i+1]})
 45 | 	}
 46 | 	return match
 47 | }
 48 | 
// Pos represents a position within a matched region. If a matched struct contains
// a field of type Pos then this field will be assigned a value indicating a position
// in the input string, where the position corresponds to the index of the Pos field
// among the struct's fields.
type Pos int
 53 | 
// Submatch represents a matched region. It is used to determine the begin and end
// position of the match corresponding to a field. This library treats fields of type
// `Submatch` just like `string` or `[]byte` fields, except that the matched text
// is inserted into `Submatch.Bytes` and its begin and end position are inserted into
// `Submatch.Begin` and `Submatch.End`.
type Submatch struct {
	Begin Pos
	End   Pos
	Bytes []byte
}

// String gets the matched substring by converting Bytes to a string.
func (r *Submatch) String() string {
	return string(r.Bytes)
}
 69 | 
// Regexp is a regular expression that captures submatches into struct fields.
// Values are created by CompileType.
type Regexp struct {
	st   *Struct        // structure returned by the builder for t; passed to inflateStruct
	re   *regex.Regexp  // compiled regular expression
	t    reflect.Type   // type the expression was built from (pointer types are dereferenced once)
	opts Options        // options used for compilation, with SyntaxFlags resolved
}
 77 | 
 78 | // Find attempts to match the regular expression against the input string. It
 79 | // returns true if there was a match, and also populates the fields of the provided
 80 | // struct with the contents of each submatch.
 81 | func (r *Regexp) Find(dest interface{}, s string) bool {
 82 | 	v := reflect.ValueOf(dest)
 83 | 	input := []byte(s)
 84 | 
 85 | 	// Check the type
 86 | 	expected := reflect.PtrTo(r.t)
 87 | 	if v.Type() != expected {
 88 | 		panic(fmt.Errorf("expected destination to be *%s but got %T", r.t.String(), dest))
 89 | 	}
 90 | 
 91 | 	// Execute the regular expression
 92 | 	indices := r.re.FindSubmatchIndex(input)
 93 | 	if indices == nil {
 94 | 		return false
 95 | 	}
 96 | 
 97 | 	// Inflate matches into original struct
 98 | 	match := matchFromIndices(indices, input)
 99 | 
100 | 	err := inflateStruct(v, match, r.st)
101 | 	if err != nil {
102 | 		panic(err)
103 | 	}
104 | 	return true
105 | }
106 | 
107 | // FindAll attempts to match the regular expression against the input string. It returns true
108 | // if there was at least one match.
109 | func (r *Regexp) FindAll(dest interface{}, s string, limit int) {
110 | 	// Check the type
111 | 	v := reflect.ValueOf(dest)
112 | 	t := v.Type()
113 | 	if t.Kind() != reflect.Ptr {
114 | 		panic(fmt.Errorf("parameter to FindAll should be a pointer to a slice but got %T", dest))
115 | 	}
116 | 
117 | 	sliceType := t.Elem()
118 | 	if sliceType.Kind() != reflect.Slice {
119 | 		panic(fmt.Errorf("parameter to FindAll should be a pointer to a slice but got %T", dest))
120 | 	}
121 | 
122 | 	itemType := sliceType.Elem()
123 | 	if itemType != r.t && itemType != reflect.PtrTo(r.t) {
124 | 		panic(fmt.Errorf("expected the slice element to be %s or *%s but it was %s", r.t, r.t, t))
125 | 	}
126 | 
127 | 	// Execute the regular expression
128 | 	input := []byte(s)
129 | 	matches := r.re.FindAllSubmatchIndex(input, limit)
130 | 
131 | 	// Allocate a slice with the desired length
132 | 	v.Elem().Set(reflect.MakeSlice(sliceType, len(matches), len(matches)))
133 | 
134 | 	// Inflate the matches into the slice elements
135 | 	for i, indices := range matches {
136 | 		// Get the i-th element of the slice
137 | 		destItem := v.Elem().Index(i)
138 | 		if itemType.Kind() != reflect.Ptr {
139 | 			destItem = destItem.Addr()
140 | 		}
141 | 
142 | 		// Create the match object
143 | 		match := matchFromIndices(indices, input)
144 | 
145 | 		// Inflate the match into the dest item
146 | 		err := inflateStruct(destItem, match, r.st)
147 | 		if err != nil {
148 | 			panic(err)
149 | 		}
150 | 	}
151 | }
152 | 
153 | // String returns a string representation of the regular expression
154 | func (r *Regexp) String() string {
155 | 	return r.re.String()
156 | }
157 | 
158 | // Compile constructs a regular expression from the struct fields on the
159 | // provided struct.
160 | func Compile(proto interface{}, opts Options) (*Regexp, error) {
161 | 	return CompileType(reflect.TypeOf(proto), opts)
162 | }
163 | 
164 | // CompileType is like Compile but takes a reflect.Type instead.
165 | func CompileType(t reflect.Type, opts Options) (*Regexp, error) {
166 | 	// We do this so that the zero value for Options gives us Perl mode,
167 | 	// which is also the default used by the standard library regexp package
168 | 	switch opts.Style {
169 | 	case Perl:
170 | 		opts.SyntaxFlags = syntax.Perl
171 | 	case POSIX:
172 | 		opts.SyntaxFlags = syntax.POSIX
173 | 	}
174 | 
175 | 	if t.Kind() == reflect.Ptr {
176 | 		t = t.Elem()
177 | 	}
178 | 
179 | 	// Traverse the struct
180 | 	b := newBuilder(opts)
181 | 	st, expr, err := b.structure(t)
182 | 	if err != nil {
183 | 		return nil, err
184 | 	}
185 | 
186 | 	// Compile regular expression
187 | 	re, err := regex.CompileSyntax(expr)
188 | 	if err != nil {
189 | 		return nil, err
190 | 	}
191 | 
192 | 	// Return
193 | 	return &Regexp{
194 | 		st:   st,
195 | 		re:   re,
196 | 		t:    t,
197 | 		opts: opts,
198 | 	}, nil
199 | }
200 | 
201 | // MustCompile is like Compile but panics if there is a compilation error
202 | func MustCompile(proto interface{}, opts Options) *Regexp {
203 | 	re, err := Compile(proto, opts)
204 | 	if err != nil {
205 | 		panic(err)
206 | 	}
207 | 	return re
208 | }
209 | 
210 | // MustCompileType is like CompileType but panics if there is a compilation error
211 | func MustCompileType(t reflect.Type, opts Options) *Regexp {
212 | 	re, err := CompileType(t, opts)
213 | 	if err != nil {
214 | 		panic(err)
215 | 	}
216 | 	return re
217 | }
218 | 
219 | // Find constructs a regular expression from the given struct and executes it on the
220 | // given string, placing submatches into the fields of the struct. The first parameter
221 | // must be a non-nil struct pointer. It returns true if the match succeeded. The only
222 | // errors that are returned are compilation errors.
223 | func Find(dest interface{}, s string) (bool, error) {
224 | 	re, err := Compile(dest, Options{})
225 | 	if err != nil {
226 | 		return false, err
227 | 	}
228 | 	return re.Find(dest, s), nil
229 | }
230 | 


--------------------------------------------------------------------------------
/restructure_test.go:
--------------------------------------------------------------------------------
  1 | package restructure
  2 | 
  3 | import (
  4 | 	"encoding/json"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/stretchr/testify/assert"
  8 | 	"github.com/stretchr/testify/require"
  9 | )
 10 | 
 11 | func assertRegion(t *testing.T, s string, begin int, end int, r *Submatch) {
 12 | 	assert.NotNil(t, r)
 13 | 	assert.Equal(t, s, string(r.Bytes))
 14 | 	assert.EqualValues(t, begin, r.Begin)
 15 | 	assert.EqualValues(t, end, r.End)
 16 | }
 17 | 
// DotName is a test fixture describing a literal dot followed by a word.
type DotName struct {
	Dot  string `regexp:"\\."`  // the literal "."
	Name string `regexp:"\\w+"` // one or more word characters
}
 22 | 
// DotExpr is a test fixture describing a word optionally followed by a
// DotName, anchored at both ends of the input.
type DotExpr struct {
	_    struct{} `regexp:"^"`    // start-of-input anchor
	Head string   `regexp:"\\w+"` // one or more word characters
	Tail *DotName `regexp:"?"`    // optional; the tests expect nil when absent
	_    struct{} `regexp:"$"`    // end-of-input anchor
}
 29 | 
 30 | func TestMatchNameDotName(t *testing.T) {
 31 | 	pattern, err := Compile(DotExpr{}, Options{})
 32 | 	require.NoError(t, err)
 33 | 
 34 | 	var v DotExpr
 35 | 	assert.True(t, pattern.Find(&v, "foo.bar"))
 36 | 	assert.Equal(t, "foo", v.Head)
 37 | 	require.NotNil(t, v.Tail)
 38 | 	assert.Equal(t, ".", v.Tail.Dot)
 39 | 	assert.Equal(t, "bar", v.Tail.Name)
 40 | }
 41 | 
 42 | func TestMatchNameDotNameHeadOnly(t *testing.T) {
 43 | 	pattern, err := Compile(DotExpr{}, Options{})
 44 | 	require.NoError(t, err)
 45 | 
 46 | 	var v DotExpr
 47 | 	assert.True(t, pattern.Find(&v, "head"))
 48 | 	assert.Equal(t, "head", v.Head)
 49 | 	assert.Nil(t, v.Tail)
 50 | }
 51 | 
 52 | func TestMatchNameDotNameFails(t *testing.T) {
 53 | 	pattern, err := Compile(DotExpr{}, Options{})
 54 | 	require.NoError(t, err)
 55 | 
 56 | 	var v DotExpr
 57 | 	assert.False(t, pattern.Find(&v, ".oops"))
 58 | }
 59 | 
// URL is a test pattern anchored at both ends: an alphabetic scheme, the
// literal "://", then everything else as the host. The captured fields also
// carry json tags so the struct can be marshalled directly.
type URL struct {
	_      string `regexp:"^"`
	Scheme string `regexp:"[[:alpha:]]+" json:"scheme"`
	_      string `regexp:"://"`
	Host   string `regexp:".*" json:"host"`
	_      string `regexp:"$"`
}
 67 | 
 68 | func TestMatchURL(t *testing.T) {
 69 | 	pattern, err := Compile(URL{}, Options{})
 70 | 	require.NoError(t, err)
 71 | 
 72 | 	var v URL
 73 | 	require.True(t, pattern.Find(&v, "http://example.com"))
 74 | 	assert.Equal(t, "http", v.Scheme)
 75 | 	assert.Equal(t, "example.com", v.Host)
 76 | }
 77 | 
 78 | func TestCombinationWithJSONTags(t *testing.T) {
 79 | 	pattern, err := Compile(URL{}, Options{})
 80 | 	require.NoError(t, err)
 81 | 
 82 | 	var v URL
 83 | 	require.True(t, pattern.Find(&v, "http://example.com"))
 84 | 
 85 | 	js, err := json.Marshal(&v)
 86 | 	require.NoError(t, err)
 87 | 
 88 | 	assert.Equal(t, "{\"scheme\":\"http\",\"host\":\"example.com\"}", string(js))
 89 | }
 90 | 
// PtrURL is the same pattern as URL, but it captures the scheme and host
// into *string fields instead of plain strings.
type PtrURL struct {
	_      struct{} `regexp:"^"`
	Scheme *string  `regexp:"[[:alpha:]]+"`
	_      struct{} `regexp:"://"`
	Host   *string  `regexp:".*"`
	_      struct{} `regexp:"$"`
}
 98 | 
 99 | func TestMatchPtrURL(t *testing.T) {
100 | 	pattern, err := Compile(PtrURL{}, Options{})
101 | 	require.NoError(t, err)
102 | 
103 | 	var v PtrURL
104 | 	require.True(t, pattern.Find(&v, "http://example.com"))
105 | 	require.NotNil(t, v.Scheme)
106 | 	require.NotNil(t, v.Host)
107 | 	assert.Equal(t, "http", *v.Scheme)
108 | 	assert.Equal(t, "example.com", *v.Host)
109 | }
110 | 
111 | func TestMatchPtrURLFailed(t *testing.T) {
112 | 	pattern, err := Compile(PtrURL{}, Options{})
113 | 	require.NoError(t, err)
114 | 
115 | 	var v PtrURL
116 | 	require.False(t, pattern.Find(&v, "oops"))
117 | 	assert.Nil(t, v.Scheme)
118 | 	assert.Nil(t, v.Host)
119 | }
120 | 
// NakedURL is the same pattern as URL, written with "naked" tags: the whole
// struct tag is the regular expression, with no `regexp:"..."` key.
type NakedURL struct {
	_      string `^`
	Scheme string `[[:alpha:]]+`
	_      string `://`
	Host   string `.*`
	_      string `$`
}
128 | 
129 | func TestMatchNakedURL(t *testing.T) {
130 | 	pattern, err := Compile(NakedURL{}, Options{})
131 | 	require.NoError(t, err)
132 | 
133 | 	var v NakedURL
134 | 	require.True(t, pattern.Find(&v, "http://example.com"))
135 | 	assert.Equal(t, "http", v.Scheme)
136 | 	assert.Equal(t, "example.com", v.Host)
137 | }
138 | 
// Nothing is a test struct whose only field carries no struct tag at all.
type Nothing struct {
	X string
}
142 | 
143 | func TestEmptyPattern(t *testing.T) {
144 | 	pattern, err := Compile(Nothing{}, Options{})
145 | 	require.NoError(t, err)
146 | 
147 | 	var v Nothing
148 | 	require.True(t, pattern.Find(&v, "abc"))
149 | }
150 | 
// Malformed is a test struct with a deliberately invalid struct tag; it is
// used to check that compilation reports an error.
type Malformed struct {
	X string `regexp:"\w"` // this is malformed because \w is not a valid escape sequence
}
154 | 
155 | func TestErrorOnMalformedTag(t *testing.T) {
156 | 	_, err := Compile(Malformed{}, Options{})
157 | 	assert.Error(t, err)
158 | }
159 | 
// HasSubcaptures is a test struct whose field pattern itself contains a
// parenthesised group.
type HasSubcaptures struct {
	Name string `a(bc)?d`
}
163 | 
164 | func TestRemoveSubcaptures(t *testing.T) {
165 | 	pattern, err := Compile(HasSubcaptures{}, Options{})
166 | 	require.NoError(t, err)
167 | 
168 | 	var v HasSubcaptures
169 | 	require.True(t, pattern.Find(&v, "abcd"))
170 | 	assert.Equal(t, "abcd", v.Name)
171 | }
172 | 
// DotNameRegion is the same pattern as DotName, but it captures each part as
// a *Submatch rather than a string.
type DotNameRegion struct {
	Dot  *Submatch `regexp:"\\."`
	Name *Submatch `regexp:"\\w+"`
}
177 | 
// DotExprRegion is the same pattern as DotExpr, with Head captured as a
// Submatch value and the optional Tail as a *DotNameRegion.
type DotExprRegion struct {
	_    struct{}       `regexp:"^"`
	Head Submatch       `regexp:"\\w+"`
	Tail *DotNameRegion `regexp:"?"`
	_    struct{}       `regexp:"$"`
}
184 | 
185 | func TestMatchNameDotNameRegion(t *testing.T) {
186 | 	pattern, err := Compile(DotExprRegion{}, Options{})
187 | 	require.NoError(t, err)
188 | 
189 | 	var v DotExprRegion
190 | 	assert.True(t, pattern.Find(&v, "foo.bar"))
191 | 	assertRegion(t, "foo", 0, 3, &v.Head)
192 | 	assert.NotNil(t, v.Tail)
193 | 	assertRegion(t, ".", 3, 4, v.Tail.Dot)
194 | 	assertRegion(t, "bar", 4, 7, v.Tail.Name)
195 | }
196 | 
// DotNamePos is the same pattern as DotName, with untagged Pos fields placed
// before, between and after the two captures.
type DotNamePos struct {
	Begin  Pos
	Dot    string `regexp:"\\."`
	Middle Pos
	Name   string `regexp:"\\w+"`
	End    Pos
}
204 | 
// DotExprPos is the same pattern as DotExpr, with untagged Pos fields at the
// start, after the head and at the end, and an optional *DotNamePos tail.
type DotExprPos struct {
	Begin  Pos
	_      struct{} `regexp:"^"`
	Head   string   `regexp:"\\w+"`
	Middle Pos
	Tail   *DotNamePos `regexp:"?"`
	_      struct{}    `regexp:"$"`
	End    Pos
}
214 | 
215 | func TestMatchNameDotNamePos(t *testing.T) {
216 | 	pattern, err := Compile(DotExprPos{}, Options{})
217 | 	require.NoError(t, err)
218 | 
219 | 	var v DotExprPos
220 | 	assert.True(t, pattern.Find(&v, "foo.bar"))
221 | 	assert.EqualValues(t, 0, v.Begin)
222 | 	assert.EqualValues(t, 3, v.Middle)
223 | 	assert.EqualValues(t, 3, v.Tail.Begin)
224 | 	assert.EqualValues(t, 4, v.Tail.Middle)
225 | 	assert.EqualValues(t, 7, v.Tail.End)
226 | 	assert.EqualValues(t, 7, v.End)
227 | }
228 | 
// DegeneratePos is a test struct made only of Pos fields, with no pattern
// between them.
type DegeneratePos struct {
	X Pos
	Y Pos
}
233 | 
234 | func TestDegeneratePos(t *testing.T) {
235 | 	// This tests what happens if there are degenerate position captures
236 | 	pattern, err := Compile(DegeneratePos{}, Options{})
237 | 	require.NoError(t, err)
238 | 	var v DegeneratePos
239 | 	assert.True(t, pattern.Find(&v, "abc"))
240 | 	assert.EqualValues(t, 0, v.X)
241 | 	assert.EqualValues(t, 0, v.Y)
242 | }
243 | 
// UnexportedPos is a test struct with one exported and one unexported Pos
// field, followed by an end-of-input anchor.
type UnexportedPos struct {
	Exported   Pos
	unexported Pos
	_          struct{} `regexp:"$"`
}
249 | 
250 | func TestUnexportedPos(t *testing.T) {
251 | 	// This tests what happens if there are non-exported Pos fields
252 | 	pattern, err := Compile(UnexportedPos{}, Options{})
253 | 	require.NoError(t, err)
254 | 	var v UnexportedPos
255 | 	assert.True(t, pattern.Find(&v, "abc"))
256 | 	assert.EqualValues(t, 3, v.Exported)
257 | 	assert.EqualValues(t, 0, v.unexported) // should be ignored
258 | }
259 | 
// Word is a test pattern for a single run of word characters.
type Word struct {
	S string `\w+`
}
263 | 
264 | func TestFindAllWords_Simple(t *testing.T) {
265 | 	pattern := MustCompile(Word{}, Options{})
266 | 	var words []Word
267 | 	pattern.FindAll(&words, "ham is spam", -1)
268 | 	require.Len(t, words, 3)
269 | 	assert.EqualValues(t, "ham", words[0].S)
270 | 	assert.EqualValues(t, "is", words[1].S)
271 | 	assert.EqualValues(t, "spam", words[2].S)
272 | }
273 | 
274 | func TestFindAllWords_Ptr(t *testing.T) {
275 | 	pattern := MustCompile(Word{}, Options{})
276 | 	var words []*Word
277 | 	pattern.FindAll(&words, "ham is spam", -1)
278 | 	require.Len(t, words, 3)
279 | 	assert.EqualValues(t, "ham", words[0].S)
280 | 	assert.EqualValues(t, "is", words[1].S)
281 | 	assert.EqualValues(t, "spam", words[2].S)
282 | }
283 | 
284 | func TestFindAllWords_NoMatches(t *testing.T) {
285 | 	pattern := MustCompile(Word{}, Options{})
286 | 	var words []*Word
287 | 	pattern.FindAll(&words, "*&!", -1)
288 | 	require.Empty(t, words)
289 | }
290 | 
291 | func TestFindAllWords_ByValueSlicePanics(t *testing.T) {
292 | 	pattern := MustCompile(Word{}, Options{})
293 | 	var words []*Word
294 | 	// This should panic because words is passed by value not by pointer:
295 | 	assert.Panics(t, func() { pattern.FindAll(words, "*&!", -1) })
296 | }
297 | 
// WordSubmatch is the same pattern as Word, but it captures the word as a
// *Submatch instead of a string.
type WordSubmatch struct {
	S *Submatch `\w+`
}
301 | 
302 | func TestFindAllWords_Regions(t *testing.T) {
303 | 	pattern := MustCompile(WordSubmatch{}, Options{})
304 | 	var words []*WordSubmatch
305 | 	pattern.FindAll(&words, "ham is spam", -1)
306 | 	require.Len(t, words, 3)
307 | 	assertRegion(t, "ham", 0, 3, words[0].S)
308 | 	assertRegion(t, "is", 4, 6, words[1].S)
309 | 	assertRegion(t, "spam", 7, 11, words[2].S)
310 | }
311 | 
// ExprWithInt is a test pattern for a leading run of digits captured into an
// int field, some whitespace, and a trailing word.
type ExprWithInt struct {
	Number int    `regexp:"^\\d+"`
	_      string `regexp:"\\s+"`
	Animal string `regexp:"\\w+$"`
}
317 | 
318 | func TestMatchWithInt(t *testing.T) {
319 | 	pattern, err := Compile(ExprWithInt{}, Options{})
320 | 	require.NoError(t, err)
321 | 
322 | 	var v ExprWithInt
323 | 	assert.True(t, pattern.Find(&v, "4 wombats"))
324 | 	assert.Equal(t, 4, v.Number)
325 | 	assert.Equal(t, "wombats", v.Animal)
326 | }
327 | 


--------------------------------------------------------------------------------
/samples/email-address/email-address.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/alexflint/go-restructure"
 7 | )
 8 | 
// Hostname matches two runs of word characters separated by a literal dot,
// e.g. "example.com".
type Hostname struct {
	Domain string   `\w+`
	_      struct{} `\.`
	TLD    string   `\w+`
}
14 | 
// EmailAddress matches a whole string of the form user@domain.tld, capturing
// the user part directly and the host part through a nested Hostname.
type EmailAddress struct {
	_    struct{} `^`
	User string   `[a-zA-Z0-9._%+-]+`
	_    struct{} `@`
	Host *Hostname
	_    struct{} `$`
}
22 | 
23 | func main() {
24 | 	var addr EmailAddress
25 | 	success, _ := restructure.Find(&addr, "joe@example.com")
26 | 	if success {
27 | 		fmt.Println(addr.User)        // prints "joe"
28 | 		fmt.Println(addr.Host.Domain) // prints "example"
29 | 		fmt.Println(addr.Host.TLD)    // prints "com"
30 | 	}
31 | }
32 | 


--------------------------------------------------------------------------------
/samples/find-all-floats/find-all-floats.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/alexflint/go-restructure"
 7 | )
 8 | 
// src is the sample text that main scans for numbers.
var src = `
The US economy went through an economic downturn following the financial 
crisis of 2007–08, with output as late as 2013 still below potential
according to the Congressional Budget Office.[57] The economy, however,
began to recover in the second half of 2009, and as of November 2015,
unemployment had declined from a high of 10% to 5%; the government's
broader U-6 unemployment rate, which includes the part-time underemployed,
was 9.8% (it had reached 16% in 2009).[13] At 11.3%, the U.S. has one of
the lowest labor union participation rates in the OECD.[58] Households
living on less than $2 per day before government benefits, doubled from
1996 levels to 1.5 million households in 2011, including 2.8 million
children.[59] The gap in income between rich and poor is greater in the
United States than in any other developed country.[60] Total public and
private debt was $50 trillion at the end of the first quarter of 2010,
or 3.5 times GDP.[61] In December 2014, public debt was slightly more
than 100% of GDP.[62] Domestic financial assets totaled $131 trillion
and domestic financial liabilities totaled $106 trillion.[63]
`
27 | 
// floatRegexp is the Float pattern, compiled once at package initialisation;
// MustCompile panics if the struct cannot be compiled.
var floatRegexp = restructure.MustCompile(Float{}, restructure.Options{})
29 | 
// Float matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123".
// Begin and End are position captures; main uses them to slice the matched
// text back out of the input.
type Float struct {
	Begin    restructure.Pos
	Sign     *Sign     `?`
	Whole    string    `[0-9]*`
	Period   struct{}  `\.?`
	Frac     string    `[0-9]+`
	Exponent *Exponent `?`
	End      restructure.Pos
}
40 | 
// Sign matches a single "+" or "-" character.
type Sign struct {
	Ch string `[+-]`
}
45 | 
// Exponent matches "e+4", "E6", "e-03": the letter e in either case, an
// optional Sign, then one or more digits.
type Exponent struct {
	_    struct{} `[eE]`
	Sign *Sign    `?`
	Num  string   `[0-9]+`
}
52 | 
53 | func main() {
54 | 	var floats []Float
55 | 	floatRegexp.FindAll(&floats, src, -1)
56 | 	for _, f := range floats {
57 | 		fmt.Println(src[f.Begin:f.End])
58 | 	}
59 | }
60 | 


--------------------------------------------------------------------------------
/samples/floating-point/floating-point.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"fmt"
 6 | 
 7 | 	"github.com/alexflint/go-restructure"
 8 | )
 9 | 
// floatRegexp is the Float pattern, compiled once at package initialisation;
// MustCompile panics if the struct cannot be compiled.
var floatRegexp = restructure.MustCompile(Float{}, restructure.Options{})
11 | 
// Float matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123".
// The sign and exponent are optional nested patterns.
type Float struct {
	Sign     *Sign     `?`
	Whole    string    `[0-9]*`
	Period   struct{}  `\.?`
	Frac     string    `[0-9]+`
	Exponent *Exponent `?`
}
20 | 
// Sign matches a single "+" or "-" character.
type Sign struct {
	Ch string `[+-]`
}
25 | 
// Exponent matches "e+4", "E6", "e-03": the letter e in either case, an
// optional Sign, then one or more digits.
type Exponent struct {
	_    struct{} `[eE]`
	Sign *Sign    `?`
	Num  string   `[0-9]+`
}
32 | 
// prettyPrint renders x as JSON indented by two spaces. If x cannot be
// marshalled, the text of the marshalling error is returned instead.
func prettyPrint(x interface{}) string {
	encoded, marshalErr := json.MarshalIndent(x, "", "  ")
	if marshalErr == nil {
		return string(encoded)
	}
	return marshalErr.Error()
}
40 | 
41 | func main() {
42 | 	var f Float
43 | 	for _, str := range []string{"1.23", "1.23e+45", ".123", "12e3"} {
44 | 		floatRegexp.Find(&f, str)
45 | 		fmt.Printf("\"%s\" -> %s\n\n", str, prettyPrint(f))
46 | 	}
47 | }
48 | 


--------------------------------------------------------------------------------
/samples/name-dot-name/name-dot-name.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"fmt"
 6 | 	"log"
 7 | 
 8 | 	"github.com/alexflint/go-arg"
 9 | 	"github.com/alexflint/go-restructure"
10 | )
11 | 
// DotName matches a literal dot followed by one or more word characters.
type DotName struct {
	Dot  string `\.`
	Name string `\w+`
}
16 | 
// DotExpr matches a whole string consisting of the literal "foo", optionally
// followed by a DotName.
type DotExpr struct {
	_    struct{} `^`
	Head string   `foo`
	Tail *DotName `?`
	_    struct{} `$`
}
23 | 
// prettyPrint renders x as JSON indented by two spaces. If x cannot be
// marshalled, the text of the marshalling error is returned instead.
func prettyPrint(x interface{}) string {
	encoded, marshalErr := json.MarshalIndent(x, "", "  ")
	if marshalErr == nil {
		return string(encoded)
	}
	return marshalErr.Error()
}
31 | 
32 | func main() {
33 | 	var args struct {
34 | 		Str string `arg:"positional"`
35 | 	}
36 | 	arg.MustParse(&args)
37 | 
38 | 	// Construct the regular expression
39 | 	pattern, err := restructure.Compile(&DotExpr{}, restructure.Options{})
40 | 	if err != nil {
41 | 		log.Fatal(err)
42 | 	}
43 | 
44 | 	// Match
45 | 	var v DotExpr
46 | 	fmt.Println(pattern.Find(&v, args.Str))
47 | }
48 | 


--------------------------------------------------------------------------------
/samples/python-import/python-import.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/alexflint/go-restructure"
 7 | )
 8 | 
// importRegexp is the Import pattern, compiled once at package
// initialisation; MustCompile panics if the struct cannot be compiled.
var importRegexp = restructure.MustCompile(Import{}, restructure.Options{})
10 | 
// Import matches "import foo" and "import foo as bar". The package name is
// captured as a Submatch, and the "as" clause is an optional nested pattern.
type Import struct {
	_       struct{}             `^import\s+`
	Package restructure.Submatch `\w+`
	Alias   *AsName              `?`
	_       struct{}             `$`
}
18 | 
// AsName matches "as xyz" with surrounding whitespace, capturing the alias
// name as a Submatch.
type AsName struct {
	_    struct{}             `\s+as\s+`
	Name restructure.Submatch `\w+`
}
24 | 
25 | func main() {
26 | 	var imp Import
27 | 	importRegexp.Find(&imp, "import foo as bar")
28 | 	fmt.Printf("IMPORT %s (bytes %d...%d)\n", imp.Package.String(), imp.Package.Begin, imp.Package.End)
29 | 	fmt.Printf("    AS %s (bytes %d...%d)\n", imp.Alias.Name.String(), imp.Alias.Name.Begin, imp.Alias.Name.End)
30 | }
31 | 


--------------------------------------------------------------------------------
/samples/quaternion-in-json/quaternion-in-json.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"fmt"
 6 | 
 7 | 	"github.com/alexflint/go-restructure"
 8 | )
 9 | 
// quaternionRegexp is the QuotedQuaternion pattern, compiled once at package
// initialisation; MustCompile panics if the struct cannot be compiled.
var quaternionRegexp = restructure.MustCompile(QuotedQuaternion{}, restructure.Options{})
11 | 
// RealPart matches an integer with an optional leading sign.
type RealPart struct {
	Sign string `regexp:"[+-]?"`
	Real string `regexp:"[0-9]+"`
}
16 | 
// SignedInt matches an integer with a mandatory leading "+" or "-".
type SignedInt struct {
	Sign string `regexp:"[+-]"`
	Real string `regexp:"[0-9]+"`
}
21 | 
// IPart matches a signed integer followed by the letter "i".
type IPart struct {
	Magnitude SignedInt
	_         struct{} `regexp:"i"`
}
26 | 
// JPart matches a signed integer followed by the letter "j".
type JPart struct {
	Magnitude SignedInt
	_         struct{} `regexp:"j"`
}
31 | 
// KPart matches a signed integer followed by the letter "k".
type KPart struct {
	Magnitude SignedInt
	_         struct{} `regexp:"k"`
}
36 | 
// Quaternion matches "1+2i+3j+4k", "-1+2k", "-1", etc: a real part followed
// by optional i, j and k parts, in that order.
type Quaternion struct {
	Real *RealPart
	I    *IPart `regexp:"?"`
	J    *JPart `regexp:"?"`
	K    *KPart `regexp:"?"`
}
44 | 
// QuotedQuaternion matches the quoted strings `"-1+2i+3j+4k"`, `"3-4k"`,
// `"12+34i"`, etc: a Quaternion wrapped in double quotes, anchored at both
// ends of the input.
type QuotedQuaternion struct {
	_          struct{} `regexp:"^"`
	_          struct{} `regexp:"\""`
	Quaternion *Quaternion
	_          struct{} `regexp:"\""`
	_          struct{} `regexp:"$"`
}
53 | 
54 | func (c *QuotedQuaternion) UnmarshalJSON(b []byte) error {
55 | 	if !quaternionRegexp.Find(c, string(b)) {
56 | 		return fmt.Errorf("%s is not a quaternion number", string(b))
57 | 	}
58 | 	return nil
59 | }
60 | 
// Var is a named quaternion value. This struct is decoded by encoding/json;
// its Value field is parsed by QuotedQuaternion.UnmarshalJSON.
type Var struct {
	Name  string
	Value *QuotedQuaternion
}
66 | 
// prettyPrint renders x as JSON indented by two spaces. If x cannot be
// marshalled, the text of the marshalling error is returned instead.
func prettyPrint(x interface{}) string {
	encoded, marshalErr := json.MarshalIndent(x, "", "  ")
	if marshalErr == nil {
		return string(encoded)
	}
	return marshalErr.Error()
}
74 | 
75 | func main() {
76 | 	src := `{"name": "foo", "value": "1+2i+3j+4k"}`
77 | 	var v Var
78 | 	err := json.Unmarshal([]byte(src), &v)
79 | 	if err != nil {
80 | 		fmt.Println(err)
81 | 	}
82 | 	fmt.Println(prettyPrint(v))
83 | }
84 | 


--------------------------------------------------------------------------------
/samples/simple-email/simple-email.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/alexflint/go-restructure"
 7 | )
 8 | 
// EmailAddress matches a whole string of the form user@host, where user is a
// run of word characters and host is any run of characters other than "@".
type EmailAddress struct {
	_    struct{} `^`
	User string   `\w+`
	_    struct{} `@`
	Host string   `[^@]+`
	_    struct{} `$`
}
16 | 
17 | func main() {
18 | 	var addr EmailAddress
19 | 	success, err := restructure.Find(&addr, "joe@example.com")
20 | 	if err != nil {
21 | 		fmt.Println(err)
22 | 	}
23 | 	if success {
24 | 		fmt.Println(addr.User) // prints "joe"
25 | 		fmt.Println(addr.Host) // prints "example.com"
26 | 	}
27 | }
28 | 


--------------------------------------------------------------------------------
/transform.go:
--------------------------------------------------------------------------------
 1 | package restructure
 2 | 
 3 | import "regexp/syntax"
 4 | 
 5 | type transformer func(expr *syntax.Regexp) ([]*syntax.Regexp, error)
 6 | 
 7 | // transform replaces each node in a regex AST with the return value of the given function
 8 | // it processes the children of a node before the node itself
 9 | func transform(expr *syntax.Regexp, f transformer) (*syntax.Regexp, error) {
10 | 	var newchildren []*syntax.Regexp
11 | 	for _, child := range expr.Sub {
12 | 		newchild, err := transform(child, f)
13 | 		if err != nil {
14 | 			return nil, err
15 | 		}
16 | 		replacements, err := f(newchild)
17 | 		if err != nil {
18 | 			return nil, err
19 | 		}
20 | 		newchildren = append(newchildren, replacements...)
21 | 	}
22 | 	expr.Sub = newchildren
23 | 	return expr, nil
24 | }
25 | 


--------------------------------------------------------------------------------