├── .github └── workflows │ └── go.yml ├── BENCHMARKS.md ├── LICENSE.md ├── README.md ├── TODO.md ├── benchmark_test.go ├── builder.go ├── go.mod ├── go.sum ├── inflate.go ├── regex ├── README.md ├── backtrack.go ├── machine.go ├── onepass.go └── regexp.go ├── restructure.go ├── restructure_test.go ├── samples ├── email-address │ └── email-address.go ├── find-all-floats │ └── find-all-floats.go ├── floating-point │ └── floating-point.go ├── name-dot-name │ └── name-dot-name.go ├── python-import │ └── python-import.go ├── quaternion-in-json │ └── quaternion-in-json.go └── simple-email │ └── simple-email.go └── transform.go /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: [push] 4 | 5 | jobs: 6 | 7 | test: 8 | name: Test 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | go: ['1.22', '1.23', 'stable'] 15 | 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | 20 | - id: go 21 | name: Setup 22 | uses: actions/setup-go@v5 23 | with: 24 | go-version: ${{ matrix.go }} 25 | 26 | - name: Build 27 | run: go build -v . 28 | 29 | - name: Test 30 | run: go test -v -coverprofile=profile.cov . 31 | 32 | - name: Coverage 33 | run: bash <(curl -s https://codecov.io/bash) -f profile.cov 34 | -------------------------------------------------------------------------------- /BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | ## Performance Benchmarks 2 | 3 | There are three benchmarks in `benchmark_test.go` that compare the performance of `go-restructure` to that of the standard library `regexp` package. `go-restructure` uses a very slightly modified version of the `regexp` package so the performance of the core regular expression evaluator is very similar; most of the difference is therefore associated with the overhead of reflection. 
4 | 5 | These benchmarks were computed using `go test -bench=.` on a 2.8 GHz Intel Core i7 processor running OSX 10.10.5. 6 | 7 | The first benchmark involves finding the first floating point number in a string of a few thousand characters. `go-restructure` takes around 8% longer than the standard library: 8 | 9 | ``` 10 | go-restructure 32428 ns/op 11 | stdlib/regexp 30060 ns/op 12 | ``` 13 | 14 | The second benchmark involves parsing a short email address. `go-restructure` takes around 15 | 40% longer than the standard library: 16 | 17 | ``` 18 | go-restructure 1188 ns/op 19 | stdlib/regexp 844 ns/op 20 | ``` 21 | 22 | The third benchmark involves finding all python import statements in a file of around one hundred lines of python source. `go-restructure` takes around 2x longer than the standard library: 23 | 24 | ``` 25 | go-restructure 695 ns/op 26 | stdlib/regexp 337 ns/op 27 | ``` 28 | 29 | The high overhead for `go-restructure` on the last benchmark is probably due to `go-restructure` allocating a struct to hold the results of each match found by `FindAll`. In most cases this performance overhead will be a small price to pay for composable, inspectable regular expressions, particularly when it amounts to the difference between one third of a microsecond and two thirds of a microsecond. However, applications that execute a very large number of regular expressions for which performance is critical may be well advised to use the standard library `regexp` package directly. 
30 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Alex Flint 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Struct-based regular expressions for Go

2 |

3 | Documentation 4 | Build Status 5 | Coverage Status 6 | Go Report Card 7 |

8 |
9 | 10 | ## Match regular expressions into struct fields 11 | 12 | ```shell 13 | go get github.com/alexflint/go-restructure 14 | ``` 15 | 16 | This package allows you to express regular expressions by defining a struct, and then capture matched sub-expressions into struct fields. Here is a very simple email address parser: 17 | 18 | ```go 19 | import "github.com/alexflint/go-restructure" 20 | 21 | type EmailAddress struct { 22 | _ struct{} `^` 23 | User string `\w+` 24 | _ struct{} `@` 25 | Host string `[^@]+` 26 | _ struct{} `$` 27 | } 28 | 29 | func main() { 30 | var addr EmailAddress 31 | restructure.Find(&addr, "joe@example.com") 32 | fmt.Println(addr.User) // prints "joe" 33 | fmt.Println(addr.Host) // prints "example.com" 34 | } 35 | ``` 36 | (Note that the above is far too simplistic to be used as a serious email address validator.) 37 | 38 | The regular expression that was executed was the concatenation of the struct tags: 39 | 40 | ``` 41 | ^(\w+)@([^@]+)$ 42 | ``` 43 | 44 | The first submatch was inserted into the `User` field and the second into the `Host` field. 
45 | 46 | You may also use the `regexp:` tag key, but keep in mind that you must escape quotes and backslashes: 47 | 48 | ```go 49 | type EmailAddress struct { 50 | _ string `regexp:"^"` 51 | User string `regexp:"\\w+"` 52 | _ string `regexp:"@"` 53 | Host string `regexp:"[^@]+"` 54 | _ string `regexp:"$"` 55 | } 56 | ``` 57 | 58 | ### Nested Structs 59 | 60 | Here is a slightly more sophisticated email address parser that uses nested structs: 61 | 62 | ```go 63 | type Hostname struct { 64 | Domain string `\w+` 65 | _ struct{} `\.` 66 | TLD string `\w+` 67 | } 68 | 69 | type EmailAddress struct { 70 | _ struct{} `^` 71 | User string `[a-zA-Z0-9._%+-]+` 72 | _ struct{} `@` 73 | Host *Hostname 74 | _ struct{} `$` 75 | } 76 | 77 | func main() { 78 | var addr EmailAddress 79 | success, _ := restructure.Find(&addr, "joe@example.com") 80 | if success { 81 | fmt.Println(addr.User) // prints "joe" 82 | fmt.Println(addr.Host.Domain) // prints "example" 83 | fmt.Println(addr.Host.TLD) // prints "com" 84 | } 85 | } 86 | ``` 87 | 88 | Compare this to using the standard library `regexp.FindStringSubmatchIndex` directly: 89 | 90 | ```go 91 | func main() { 92 | content := "joe@example.com" 93 | expr := regexp.MustCompile(`^([a-zA-Z0-9._%+-]+)@((\w+)\.(\w+))$`) 94 | indices := expr.FindStringSubmatchIndex(content) 95 | if len(indices) > 0 { 96 | userBegin, userEnd := indices[2], indices[3] 97 | var user string 98 | if userBegin != -1 && userEnd != -1 { 99 | user = content[userBegin:userEnd] 100 | } 101 | 102 | domainBegin, domainEnd := indices[6], indices[7] 103 | var domain string 104 | if domainBegin != -1 && domainEnd != -1 { 105 | domain = content[domainBegin:domainEnd] 106 | } 107 | 108 | tldBegin, tldEnd := indices[8], indices[9] 109 | var tld string 110 | if tldBegin != -1 && tldEnd != -1 { 111 | tld = content[tldBegin:tldEnd] 112 | } 113 | 114 | fmt.Println(user) // prints "joe" 115 | fmt.Println(domain) // prints "example" 116 | fmt.Println(tld) // prints "com" 117 | } 
118 | } 119 | ``` 120 | 121 | ### Ints 122 | 123 | It is also possible to set struct fields as `int` to get the string automatically converted. 124 | 125 | ```go 126 | // Matches "12 wombats", "1 wombat" and store the number as int 127 | type Wisdom struct { 128 | Number int `^\d+` 129 | _ string `\s+` 130 | Animal string `\w+` 131 | } 132 | ``` 133 | 134 | ### Optional fields 135 | 136 | When nesting one struct within another, you can make the nested struct optional by marking it with `?`. The following example parses floating point numbers with optional sign and exponent: 137 | 138 | ```go 139 | // Matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123" 140 | type Float struct { 141 | Sign *Sign `?` // sign is optional 142 | Whole string `[0-9]*` 143 | Period struct{} `\.?` 144 | Frac string `[0-9]+` 145 | Exponent *Exponent `?` // exponent is optional 146 | } 147 | 148 | // Matches "e+4", "E6", "e-03" 149 | type Exponent struct { 150 | _ struct{} `[eE]` 151 | Sign *Sign `?` // sign is optional 152 | Num string `[0-9]+` 153 | } 154 | 155 | // Matches "+" or "-" 156 | type Sign struct { 157 | Ch string `[+-]` 158 | } 159 | ``` 160 | 161 | When an optional sub-struct is not matched, it will be set to nil: 162 | 163 | ```javascript 164 | "1.23" -> { 165 | "Sign": nil, 166 | "Whole": "1", 167 | "Frac": "23", 168 | "Exponent": nil 169 | } 170 | 171 | "1.23e+45" -> { 172 | "Sign": nil, 173 | "Whole": "1", 174 | "Frac": "23", 175 | "Exponent": { 176 | "Sign": { 177 | "Ch": "+" 178 | }, 179 | "Num": "45" 180 | } 181 | } 182 | ``` 183 | 184 | ### Finding multiple matches 185 | 186 | The following example uses `Regexp.FindAll` to extract all floating point numbers from 187 | a string, using the same `Float` struct as in the example above. 188 | 189 | ```go 190 | src := "There are 10.4 cats for every 100 dogs in the United States." 
191 | floatRegexp := restructure.MustCompile(Float{}, restructure.Options{}) 192 | var floats []Float 193 | floatRegexp.FindAll(&floats, src, -1) 194 | ``` 195 | 196 | To limit the number of matches, set the third parameter to a positive number. 197 | 198 | ### Getting begin and end positions for submatches 199 | 200 | To get the begin and end position of submatches, use the `restructure.Submatch` struct in place of `string`: 201 | 202 | Here is an example of matching python imports such as `import foo as bar`: 203 | 204 | ```go 205 | type Import struct { 206 | _ struct{} `^import\s+` 207 | Package restructure.Submatch `\w+` 208 | _ struct{} `\s+as\s+` 209 | Alias restructure.Submatch `\w+` 210 | } 211 | 212 | var importRegexp = restructure.MustCompile(Import{}, restructure.Options{}) 213 | 214 | func main() { 215 | var imp Import 216 | importRegexp.Find(&imp, "import foo as bar") 217 | fmt.Printf("IMPORT %s (bytes %d...%d)\n", imp.Package.String(), imp.Package.Begin, imp.Package.End) 218 | fmt.Printf(" AS %s (bytes %d...%d)\n", imp.Alias.String(), imp.Alias.Begin, imp.Alias.End) 219 | } 220 | ``` 221 | Output: 222 | ``` 223 | IMPORT foo (bytes 7...10) 224 | AS bar (bytes 14...17) 225 | ``` 226 | 227 | ### Regular expressions inside JSON 228 | 229 | To run a regular expression as part of a JSON unmarshal, just implement the `json.Unmarshaler` interface. 
Here is an example that parses the following JSON string containing a quaternion: 230 | 231 | ```javascript 232 | { 233 | "Var": "foo", 234 | "Val": "1+2i+3j+4k" 235 | } 236 | ``` 237 | 238 | First we define the expressions for matching quaternions in the form `1+2i+3j+4k`: 239 | 240 | ```go 241 | // Matches "1", "-12", "+12" 242 | type RealPart struct { 243 | Sign string `regexp:"[+-]?"` 244 | Real string `regexp:"[0-9]+"` 245 | } 246 | 247 | // Matches "+123", "-1" 248 | type SignedInt struct { 249 | Sign string `regexp:"[+-]"` 250 | Real string `regexp:"[0-9]+"` 251 | } 252 | 253 | // Matches "+12i", "-123i" 254 | type IPart struct { 255 | Magnitude SignedInt 256 | _ struct{} `regexp:"i"` 257 | } 258 | 259 | // Matches "+12j", "-123j" 260 | type JPart struct { 261 | Magnitude SignedInt 262 | _ struct{} `regexp:"j"` 263 | } 264 | 265 | // Matches "+12k", "-123k" 266 | type KPart struct { 267 | Magnitude SignedInt 268 | _ struct{} `regexp:"k"` 269 | } 270 | 271 | // matches "1+2i+3j+4k", "-1+2k", "-1", etc 272 | type Quaternion struct { 273 | Real *RealPart 274 | I *IPart `regexp:"?"` 275 | J *JPart `regexp:"?"` 276 | K *KPart `regexp:"?"` 277 | } 278 | 279 | // matches the quoted strings `"-1+2i"`, `"3-4i"`, `"12+34i"`, etc 280 | type QuotedQuaternion struct { 281 | _ struct{} `regexp:"^"` 282 | _ struct{} `regexp:"\""` 283 | Quaternion *Quaternion 284 | _ struct{} `regexp:"\""` 285 | _ struct{} `regexp:"$"` 286 | } 287 | ``` 288 | 289 | Next we implement `UnmarshalJSON` for the `QuotedQuaternion` type: 290 | ```go 291 | var quaternionRegexp = restructure.MustCompile(QuotedQuaternion{}, restructure.Options{}) 292 | 293 | func (c *QuotedQuaternion) UnmarshalJSON(b []byte) error { 294 | if !quaternionRegexp.Find(c, string(b)) { 295 | return fmt.Errorf("%s is not a quaternion", string(b)) 296 | } 297 | return nil 298 | } 299 | 300 | ``` 301 | 302 | Now we can define a struct and unmarshal JSON into it: 303 | ```go 304 | type Var struct { 305 | Name string 306 | 
Value *QuotedQuaternion 307 | } 308 | 309 | func main() { 310 | src := `{"name": "foo", "value": "1+2i+3j+4k"}` 311 | var v Var 312 | json.Unmarshal([]byte(src), &v) 313 | } 314 | ``` 315 | The result is: 316 | ```javascript 317 | { 318 | "Name": "foo", 319 | "Value": { 320 | "Quaternion": { 321 | "Real": { 322 | "Sign": "", 323 | "Real": "1" 324 | }, 325 | "I": { 326 | "Magnitude": { 327 | "Sign": "+", 328 | "Real": "2" 329 | } 330 | }, 331 | "J": { 332 | "Magnitude": { 333 | "Sign": "+", 334 | "Real": "3" 335 | } 336 | }, 337 | "K": { 338 | "Magnitude": { 339 | "Sign": "+", 340 | "Real": "4" 341 | } 342 | } 343 | } 344 | } 345 | } 346 | ``` 347 | 348 | ### Index of examples 349 | 350 | - [Parse an email address](samples/simple-email/simple-email.go) 351 | - [Parse an email address using nested structs](samples/email-address/email-address.go) 352 | - [Parse a floating point number](samples/floating-point/floating-point.go) 353 | - [Find all floats in a string](samples/find-all-floats/find-all-floats.go) 354 | - [Parse a dotted name](samples/name-dot-name/name-dot-name.go) 355 | - [Parse a python import statement](samples/python-import/python-import.go) 356 | - [Regular expression inside a JSON struct](samples/quaternion-in-json/quaternion-in-json.go) 357 | 358 | ### Benchmarks 359 | 360 | See [benchmarks document](BENCHMARKS.md) 361 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | ## TODO 2 | - repeated subexpressions 3 | - optional terminal matches (look at top node in AST) 4 | - remove OpCaptures from terminals 5 | -------------------------------------------------------------------------------- /benchmark_test.go: -------------------------------------------------------------------------------- 1 | package restructure 2 | 3 | import ( 4 | "io/ioutil" 5 | "os" 6 | "regexp" 7 | "testing" 8 | ) 9 | 10 | var src = ` 11 | The US economy went 
through an economic downturn following the financial 12 | crisis of 2007–08, with output as late as 2013 still below potential 13 | according to the Congressional Budget Office.[57] The economy, however, 14 | began to recover in the second half of 2009, and as of November 2015, 15 | unemployment had declined from a high of 10% to 5%; the government's 16 | broader U-6 unemployment rate, which includes the part-time underemployed, 17 | was 9.8% (it had reached 16% in 2009).[13] At 11.3%, the U.S. has one of 18 | the lowest labor union participation rates in the OECD.[58] Households 19 | living on less than $2 per day before government benefits, doubled from 20 | 1996 levels to 1.5 million households in 2011, including 2.8 million 21 | children.[59] The gap in income between rich and poor is greater in the 22 | United States than in any other developed country.[60] Total public and 23 | private debt was $50 trillion at the end of the first quarter of 2010, 24 | or 3.5 times GDP.[61] In December 2014, public debt was slightly more 25 | than 100% of GDP.[62] Domestic financial assets totaled $131 trillion 26 | and domestic financial liabilities totaled $106 trillion.[63] 27 | ` 28 | 29 | // Matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123" 30 | type Float struct { 31 | Sign *Sign `?` 32 | Whole string `[0-9]*` 33 | Period struct{} `\.?` 34 | Frac string `[0-9]+` 35 | Exponent *Exponent `?` 36 | } 37 | 38 | // Matches "+" or "-" 39 | type Sign struct { 40 | Ch string `[+-]` 41 | } 42 | 43 | // Matches "e+4", "E6", "e-03" 44 | type Exponent struct { 45 | _ struct{} `[eE]` 46 | Sign *Sign `?` 47 | Num string `[0-9]+` 48 | } 49 | 50 | func BenchmarkFindFloat(b *testing.B) { 51 | pattern := MustCompile(Float{}, Options{}) 52 | var f Float 53 | b.ResetTimer() 54 | for i := 0; i < b.N; i++ { 55 | pattern.Find(&f, src) 56 | } 57 | } 58 | 59 | func BenchmarkFindFloatStdlib(b *testing.B) { 60 | pattern := 
regexp.MustCompile(`((?P((?P[\+\-]))?)(?P[0-9]*)(?P\.?)(?P[0-9]+)(?P((?i:E)(?P((?P[\+\-]))?)(?P[0-9]+))?))`) 61 | b.ResetTimer() 62 | for i := 0; i < b.N; i++ { 63 | pattern.FindSubmatch([]byte(src)) 64 | } 65 | } 66 | 67 | type EmailAddress struct { 68 | _ struct{} `^` 69 | User string `[a-zA-Z0-9._%+-]+` 70 | _ struct{} `@` 71 | Host string `.+` 72 | _ struct{} `$` 73 | } 74 | 75 | func BenchmarkParseEmail(b *testing.B) { 76 | var addr EmailAddress 77 | pattern := MustCompile(EmailAddress{}, Options{}) 78 | b.ResetTimer() 79 | for i := 0; i < b.N; i++ { 80 | pattern.Find(&addr, "joe@example.com") 81 | } 82 | } 83 | 84 | func BenchmarkParseEmailStdlib(b *testing.B) { 85 | //pattern := regexp.MustCompile(`(\A(?P[%\+\--\.0-9A-Z_a-z]+)@(?P((?P[0-9A-Z_a-z]+)\.(?P[0-9A-Z_a-z]+)))(?-m:$))`) 86 | pattern := regexp.MustCompile(`(\A(?P[%\+\--\.0-9A-Z_a-z]+)@(?P.+)(?-m:$))`) 87 | b.ResetTimer() 88 | for i := 0; i < b.N; i++ { 89 | pattern.FindStringSubmatch("joe@example.com") 90 | } 91 | } 92 | 93 | // Import matches "import foo" and "import foo as bar" 94 | type Import struct { 95 | _ struct{} `^import\s+` 96 | Package Submatch `\w+` 97 | Alias *AsName `?` 98 | _ struct{} `$` 99 | } 100 | 101 | // AsName matches "as xyz" 102 | type AsName struct { 103 | _ struct{} `\s+as\s+` 104 | Name Submatch `\w+` 105 | } 106 | 107 | func BenchmarkFindAllImports(b *testing.B) { 108 | path := os.Getenv("TESTDATA") 109 | if path == "" { 110 | b.Skip("skipping because TESTDATA environment var was not set") 111 | } 112 | buf, err := ioutil.ReadFile(path) 113 | if err != nil { 114 | b.Error(err) 115 | } 116 | pattern := MustCompile(Import{}, Options{}) 117 | var imports []Import 118 | b.ResetTimer() 119 | for i := 0; i < b.N; i++ { 120 | pattern.FindAll(&imports, string(buf), -1) 121 | } 122 | } 123 | 124 | func BenchmarkFindAllImportsStdlib(b *testing.B) { 125 | path := os.Getenv("TESTDATA") 126 | if path == "" { 127 | b.Skip("skipping because TESTDATA environment var was not set") 128 | } 
129 | buf, err := ioutil.ReadFile(path) 130 | if err != nil { 131 | b.Error(err) 132 | } 133 | pattern := regexp.MustCompile(`(\Aimport[\t-\n\f-\r ]+(?P[0-9A-Z_a-z]+)(?P([\t-\n\f-\r ]+as[\t-\n\f-\r ]+(?P[0-9A-Z_a-z]+))?)(?-m:$))`) 134 | b.ResetTimer() 135 | for i := 0; i < b.N; i++ { 136 | pattern.FindAllSubmatchIndex(buf, -1) 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /builder.go: -------------------------------------------------------------------------------- 1 | package restructure 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "reflect" 7 | "regexp/syntax" 8 | "strings" 9 | ) 10 | 11 | // A Role determines how a struct field is inflated 12 | type Role int 13 | 14 | const ( 15 | EmptyRole Role = iota 16 | PosRole 17 | SubstructRole 18 | StringScalarRole 19 | IntScalarRole 20 | ByteSliceScalarRole 21 | SubmatchScalarRole 22 | ) 23 | 24 | // A Struct describes how to inflate a match into a struct 25 | type Struct struct { 26 | capture int 27 | fields []*Field 28 | } 29 | 30 | // A Field describes how to inflate a match into a field 31 | type Field struct { 32 | capture int // index of the capture for this field 33 | index []int // index of this field within its parent struct 34 | child *Struct // descendant struct; nil for terminals 35 | role Role 36 | } 37 | 38 | func isExported(f reflect.StructField) bool { 39 | return f.PkgPath == "" 40 | } 41 | 42 | // A builder builds stencils from structs using reflection 43 | type builder struct { 44 | numCaptures int 45 | opts Options 46 | } 47 | 48 | func newBuilder(opts Options) *builder { 49 | return &builder{ 50 | opts: opts, 51 | } 52 | } 53 | 54 | func (b *builder) nextCaptureIndex() int { 55 | k := b.numCaptures 56 | b.numCaptures++ 57 | return k 58 | } 59 | 60 | func (b *builder) extractTag(tag reflect.StructTag) (string, error) { 61 | // Allow tags that look like either `regexp:"\\w+"` or just `\w+` 62 | if s := tag.Get("regexp"); s != "" { 63 | return s, 
nil 64 | } else if strings.Contains(string(tag), `regexp:"`) { 65 | return "", errors.New("incorrectly escaped struct tag") 66 | } else { 67 | return string(tag), nil 68 | } 69 | } 70 | 71 | func removeCaptures(expr *syntax.Regexp) ([]*syntax.Regexp, error) { 72 | if expr.Op == syntax.OpCapture { 73 | return expr.Sub, nil 74 | } 75 | return []*syntax.Regexp{expr}, nil 76 | } 77 | 78 | func (b *builder) terminal(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) { 79 | pattern, err := b.extractTag(f.Tag) 80 | if err != nil { 81 | return nil, nil, fmt.Errorf("%s: %v", fullName, err) 82 | } 83 | if pattern == "" { 84 | return nil, nil, nil 85 | } 86 | 87 | // Parse the pattern 88 | expr, err := syntax.Parse(pattern, b.opts.SyntaxFlags) 89 | if err != nil { 90 | return nil, nil, fmt.Errorf(`%s: %v (pattern was "%s")`, fullName, err, f.Tag) 91 | } 92 | 93 | // Remove capture nodes within the AST 94 | expr, err = transform(expr, removeCaptures) 95 | if err != nil { 96 | return nil, nil, fmt.Errorf(`failed to remove captures from "%s": %v`, pattern, err) 97 | } 98 | 99 | // Determine the kind 100 | t := f.Type 101 | if t.Kind() == reflect.Ptr { 102 | t = t.Elem() 103 | } 104 | var role Role 105 | switch t { 106 | case emptyType: 107 | role = EmptyRole 108 | case stringType: 109 | role = StringScalarRole 110 | case intType: 111 | role = IntScalarRole 112 | case byteSliceType: 113 | role = ByteSliceScalarRole 114 | case submatchType: 115 | role = SubmatchScalarRole 116 | } 117 | 118 | captureIndex := -1 119 | if isExported(f) { 120 | captureIndex = b.nextCaptureIndex() 121 | expr = &syntax.Regexp{ 122 | Op: syntax.OpCapture, 123 | Sub: []*syntax.Regexp{expr}, 124 | Name: f.Name, 125 | Cap: captureIndex, 126 | } 127 | } 128 | field := &Field{ 129 | index: f.Index, 130 | capture: captureIndex, 131 | role: role, 132 | } 133 | 134 | return field, expr, nil 135 | } 136 | 137 | func (b *builder) pos(f reflect.StructField, fullName string) (*Field, 
*syntax.Regexp, error) { 138 | if !isExported(f) { 139 | return nil, nil, nil 140 | } 141 | captureIndex := b.nextCaptureIndex() 142 | empty := &syntax.Regexp{ 143 | Op: syntax.OpEmptyMatch, 144 | } 145 | expr := &syntax.Regexp{ 146 | Op: syntax.OpCapture, 147 | Sub: []*syntax.Regexp{empty}, 148 | Name: f.Name, 149 | Cap: captureIndex, 150 | } 151 | field := &Field{ 152 | index: f.Index, 153 | capture: captureIndex, 154 | role: PosRole, 155 | } 156 | 157 | return field, expr, nil 158 | } 159 | 160 | func (b *builder) nonterminal(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) { 161 | opstr, err := b.extractTag(f.Tag) 162 | if err != nil { 163 | return nil, nil, err 164 | } 165 | child, expr, err := b.structure(f.Type) 166 | if err != nil { 167 | return nil, nil, err 168 | } 169 | 170 | switch opstr { 171 | case "?": 172 | if f.Type.Kind() != reflect.Ptr { 173 | return nil, nil, fmt.Errorf(`%s is marked with "?" but is not a pointer`, fullName) 174 | } 175 | expr = &syntax.Regexp{ 176 | Sub: []*syntax.Regexp{expr}, 177 | Op: syntax.OpQuest, 178 | } 179 | case "": 180 | // nothing to do 181 | default: 182 | return nil, nil, fmt.Errorf("invalid op \"%s\" for non-slice field on %s", opstr, fullName) 183 | } 184 | 185 | captureIndex := b.nextCaptureIndex() 186 | expr = &syntax.Regexp{ 187 | Op: syntax.OpCapture, 188 | Sub: []*syntax.Regexp{expr}, 189 | Name: f.Name, 190 | Cap: captureIndex, 191 | } 192 | field := &Field{ 193 | index: f.Index, 194 | capture: captureIndex, 195 | child: child, 196 | role: SubstructRole, 197 | } 198 | 199 | return field, expr, nil 200 | } 201 | 202 | func (b *builder) field(f reflect.StructField, fullName string) (*Field, *syntax.Regexp, error) { 203 | if isScalar(f.Type) { 204 | return b.terminal(f, fullName) 205 | } else if isStruct(f.Type) { 206 | return b.nonterminal(f, fullName) 207 | } else if f.Type == posType { 208 | return b.pos(f, fullName) 209 | } 210 | return nil, nil, nil 211 | } 212 | 213 | func (b 
*builder) structure(t reflect.Type) (*Struct, *syntax.Regexp, error) { 214 | if t.Kind() == reflect.Ptr { 215 | t = t.Elem() 216 | } 217 | 218 | // Select a capture index first so that the struct comes before its fields 219 | captureIndex := b.nextCaptureIndex() 220 | 221 | var exprs []*syntax.Regexp 222 | var fields []*Field 223 | for i := 0; i < t.NumField(); i++ { 224 | f := t.Field(i) 225 | field, expr, err := b.field(f, t.Name()+"."+f.Name) 226 | if err != nil { 227 | return nil, nil, err 228 | } 229 | if field != nil { 230 | exprs = append(exprs, expr) 231 | fields = append(fields, field) 232 | } 233 | } 234 | 235 | // Wrap in a concat 236 | expr := &syntax.Regexp{ 237 | Sub: exprs, 238 | Op: syntax.OpConcat, 239 | } 240 | 241 | // Wrap in a capture 242 | expr = &syntax.Regexp{ 243 | Sub: []*syntax.Regexp{expr}, 244 | Op: syntax.OpCapture, 245 | Cap: captureIndex, 246 | } 247 | 248 | st := &Struct{ 249 | fields: fields, 250 | capture: captureIndex, 251 | } 252 | 253 | return st, expr, nil 254 | } 255 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/alexflint/go-restructure 2 | 3 | go 1.15 4 | 5 | require ( 6 | github.com/stretchr/testify v1.7.0 7 | ) 8 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/alexflint/go-restructure v0.0.0-20160131054339-a509d071de28 h1:p32gFVhF4WnI/qpSpZ0//GGb6BAAFLVnkd4Vowg7im8= 2 | github.com/alexflint/go-restructure v0.0.0-20160131054339-a509d071de28/go.mod h1:8Mq15S+jJn5TWrSU0Ua7L8rFWmY06lu0UCbhJrrcGBY= 3 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/pmezard/go-difflib v1.0.0 
h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 7 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 8 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 9 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 10 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 11 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= 12 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 13 | -------------------------------------------------------------------------------- /inflate.go: -------------------------------------------------------------------------------- 1 | package restructure 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "strconv" 7 | ) 8 | 9 | var ( 10 | posType = reflect.TypeOf(Pos(0)) 11 | 12 | emptyType = reflect.TypeOf(struct{}{}) 13 | stringType = reflect.TypeOf("") 14 | intType = reflect.TypeOf(1) 15 | byteSliceType = reflect.TypeOf([]byte{}) 16 | submatchType = reflect.TypeOf(Submatch{}) 17 | scalarTypes = []reflect.Type{ 18 | emptyType, 19 | stringType, 20 | intType, 21 | byteSliceType, 22 | submatchType, 23 | } 24 | ) 25 | 26 | // determines whether t is a scalar type or a pointer to a scalar type 27 | func isScalar(t reflect.Type) bool { 28 | if t.Kind() == reflect.Ptr { 29 | t = t.Elem() 30 | } 31 | for _, u := range scalarTypes { 32 | if t == u { 33 | return true 34 | } 35 | } 36 | return false 37 | } 38 | 39 | // determines whether t is a struct type or a pointer to a struct type 40 | func isStruct(t reflect.Type) bool { 41 | if t.Kind() == reflect.Ptr { 42 | t = t.Elem() 43 | } 44 | return t.Kind() == reflect.Struct 45 | } 46 | 47 | // ensureAlloc replaces nil pointers with newly allocated 
objects 48 | func ensureAlloc(dest reflect.Value) reflect.Value { 49 | if dest.Kind() == reflect.Ptr { 50 | if dest.IsNil() { 51 | dest.Set(reflect.New(dest.Type().Elem())) 52 | } 53 | return dest.Elem() 54 | } 55 | return dest 56 | } 57 | 58 | // inflate the results of a match into a string 59 | func inflateScalar(dest reflect.Value, match *match, captureIndex int, role Role) error { 60 | if captureIndex == -1 { 61 | // This means the field generated a regex but we did not want the results 62 | return nil 63 | } 64 | 65 | // Get the subcapture for this field 66 | subcapture := match.captures[captureIndex] 67 | if !subcapture.wasMatched() { 68 | // This means the subcapture was optional and was not matched 69 | return nil 70 | } 71 | 72 | // Get the matched bytes 73 | buf := match.input[subcapture.begin:subcapture.end] 74 | 75 | // If dest is a nil pointer then allocate a new instance and assign the pointer to dest 76 | dest = ensureAlloc(dest) 77 | 78 | // Deal with each recognized type 79 | switch role { 80 | case StringScalarRole: 81 | dest.SetString(string(buf)) 82 | return nil 83 | case IntScalarRole: 84 | if intVal, err := strconv.Atoi(string(buf)); err != nil { 85 | return fmt.Errorf("unable to capture into %s", dest.Type().String()) 86 | } else { 87 | dest.SetInt(int64(intVal)) 88 | return nil 89 | } 90 | case ByteSliceScalarRole: 91 | dest.SetBytes(buf) 92 | return nil 93 | case SubmatchScalarRole: 94 | submatch := dest.Addr().Interface().(*Submatch) 95 | submatch.Begin = Pos(subcapture.begin) 96 | submatch.End = Pos(subcapture.end) 97 | submatch.Bytes = buf 98 | return nil 99 | } 100 | return fmt.Errorf("unable to capture into %s", dest.Type().String()) 101 | } 102 | 103 | // inflate the position of a match into a Pos 104 | func inflatePos(dest reflect.Value, match *match, captureIndex int) error { 105 | if captureIndex == -1 { 106 | // This means the field generated a regex but we did not want the results 107 | return nil 108 | } 109 | 110 | // Get the 
subcapture for this field 111 | subcapture := match.captures[captureIndex] 112 | if !subcapture.wasMatched() { 113 | // This means the subcapture was optional and was not matched 114 | return nil 115 | } 116 | 117 | // If dest is a nil pointer then allocate a new instance and assign the pointer to dest 118 | dest.SetInt(int64(subcapture.begin)) 119 | return nil 120 | } 121 | 122 | // inflate the results of a match into a struct 123 | func inflateStruct(dest reflect.Value, match *match, structure *Struct) error { 124 | // Get the subcapture for this field 125 | subcapture := match.captures[structure.capture] 126 | if !subcapture.wasMatched() { 127 | return nil 128 | } 129 | 130 | // If the field is a nil pointer then allocate an instance and assign pointer to dest 131 | dest = ensureAlloc(dest) 132 | 133 | // Inflate values into the struct fields 134 | for _, field := range structure.fields { 135 | switch field.role { 136 | case PosRole: 137 | val := dest.FieldByIndex(field.index) 138 | if err := inflatePos(val, match, field.capture); err != nil { 139 | return err 140 | } 141 | case StringScalarRole, ByteSliceScalarRole, SubmatchScalarRole, IntScalarRole: 142 | val := dest.FieldByIndex(field.index) 143 | if err := inflateScalar(val, match, field.capture, field.role); err != nil { 144 | return err 145 | } 146 | case SubstructRole: 147 | val := dest.FieldByIndex(field.index) 148 | if err := inflateStruct(val, match, field.child); err != nil { 149 | return err 150 | } 151 | } 152 | } 153 | return nil 154 | } 155 | -------------------------------------------------------------------------------- /regex/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a slightly modified version of the Go 1.5.2 standard library `regexp` package. 
--------------------------------------------------------------------------------
/regex/backtrack.go:
--------------------------------------------------------------------------------
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// backtrack is a regular expression search with submatch
// tracking for small regular expressions and texts. It allocates
// a bit vector with (length of input) * (length of prog) bits,
// to make sure it never explores the same (character position, instruction)
// state multiple times. This limits the search to run in time linear in
// the length of the text.
//
// backtrack is a fast replacement for the NFA code on small
// regexps when onepass cannot be used.

package regex

import "regexp/syntax"

// A job is an entry on the backtracker's job stack. It holds
// the instruction pc and the position in the input.
type job struct {
	pc  uint32 // instruction to execute
	arg int    // 0 for a fresh visit; > 0 when continuing a previous visit
	pos int    // input position (or a saved capture value for InstCapture continuations)
}

const (
	visitedBits        = 32         // bits per word of the visited vector
	maxBacktrackProg   = 500        // len(prog.Inst) <= max
	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
)

// bitState holds state for the backtracker.
type bitState struct {
	prog *syntax.Prog

	end     int      // end position in the input
	cap     []int    // capture registers; -1 means unset
	input   input
	jobs    []job    // job stack
	visited []uint32 // bit vector of (pc, pos) states already explored
}

// notBacktrack is the nil *bitState returned when the backtracker is not used.
var notBacktrack *bitState = nil

// maxBitStateLen returns the maximum length of a string to search with
// the backtracker using prog.
48 | func maxBitStateLen(prog *syntax.Prog) int { 49 | if !shouldBacktrack(prog) { 50 | return 0 51 | } 52 | return maxBacktrackVector / len(prog.Inst) 53 | } 54 | 55 | // newBitState returns a new bitState for the given prog, 56 | // or notBacktrack if the size of the prog exceeds the maximum size that 57 | // the backtracker will be run for. 58 | func newBitState(prog *syntax.Prog) *bitState { 59 | if !shouldBacktrack(prog) { 60 | return notBacktrack 61 | } 62 | return &bitState{ 63 | prog: prog, 64 | } 65 | } 66 | 67 | // shouldBacktrack reports whether the program is too 68 | // long for the backtracker to run. 69 | func shouldBacktrack(prog *syntax.Prog) bool { 70 | return len(prog.Inst) <= maxBacktrackProg 71 | } 72 | 73 | // reset resets the state of the backtracker. 74 | // end is the end position in the input. 75 | // ncap is the number of captures. 76 | func (b *bitState) reset(end int, ncap int) { 77 | b.end = end 78 | 79 | if cap(b.jobs) == 0 { 80 | b.jobs = make([]job, 0, 256) 81 | } else { 82 | b.jobs = b.jobs[:0] 83 | } 84 | 85 | visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits 86 | if cap(b.visited) < visitedSize { 87 | b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) 88 | } else { 89 | b.visited = b.visited[:visitedSize] 90 | for i := range b.visited { 91 | b.visited[i] = 0 92 | } 93 | } 94 | 95 | if cap(b.cap) < ncap { 96 | b.cap = make([]int, ncap) 97 | } else { 98 | b.cap = b.cap[:ncap] 99 | } 100 | for i := range b.cap { 101 | b.cap[i] = -1 102 | } 103 | } 104 | 105 | // shouldVisit reports whether the combination of (pc, pos) has not 106 | // been visited yet. 
func (b *bitState) shouldVisit(pc uint32, pos int) bool {
	// n is the index of the (pc, pos) bit in the visited vector.
	n := uint(int(pc)*(b.end+1) + pos)
	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
		return false
	}
	// Mark the state as visited so that later calls return false.
	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
	return true
}

// push pushes (pc, pos, arg) onto the job stack if it should be
// visited.
func (b *bitState) push(pc uint32, pos int, arg int) {
	if b.prog.Inst[pc].Op == syntax.InstFail {
		return
	}

	// Only check shouldVisit when arg == 0.
	// When arg > 0, we are continuing a previous visit.
	if arg == 0 && !b.shouldVisit(pc, pos) {
		return
	}

	b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
}

// tryBacktrack runs a backtracking search starting at pos.
// It reports whether a match was found; the captures of the best match are
// copied into m.matchcap.
func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
	longest := m.re.longest
	m.matched = false

	b.push(pc, pos, 0)
	for len(b.jobs) > 0 {
		l := len(b.jobs) - 1
		// Pop job off the stack.
		pc := b.jobs[l].pc
		pos := b.jobs[l].pos
		arg := b.jobs[l].arg
		b.jobs = b.jobs[:l]

		// Optimization: rather than push and pop,
		// code that is going to push and continue
		// the loop simply updates pc, pos, and arg
		// and jumps to CheckAndLoop. We have to
		// do the shouldVisit check that push
		// would have, but we avoid the stack
		// manipulation.
		goto Skip
	CheckAndLoop:
		if !b.shouldVisit(pc, pos) {
			continue
		}
	Skip:

		inst := b.prog.Inst[pc]

		switch inst.Op {
		default:
			panic("bad inst")
		case syntax.InstFail:
			panic("unexpected InstFail")
		case syntax.InstAlt:
			// Cannot just
			//   b.push(inst.Out, pos, 0)
			//   b.push(inst.Arg, pos, 0)
			// If during the processing of inst.Out, we encounter
			// inst.Arg via another path, we want to process it then.
			// Pushing it here will inhibit that. Instead, re-push
			// inst with arg==1 as a reminder to push inst.Arg out
			// later.
			switch arg {
			case 0:
				b.push(pc, pos, 1)
				pc = inst.Out
				goto CheckAndLoop
			case 1:
				// Finished inst.Out; try inst.Arg.
				arg = 0
				pc = inst.Arg
				goto CheckAndLoop
			}
			panic("bad arg in InstAlt")

		case syntax.InstAltMatch:
			// One opcode consumes runes; the other leads to match.
			switch b.prog.Inst[inst.Out].Op {
			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
				// inst.Arg is the match.
				b.push(inst.Arg, pos, 0)
				pc = inst.Arg
				pos = b.end
				goto CheckAndLoop
			}
			// inst.Out is the match - non-greedy
			b.push(inst.Out, b.end, 0)
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRune:
			r, width := i.step(pos)
			if !inst.MatchRune(r) {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRune1:
			r, width := i.step(pos)
			if r != inst.Rune[0] {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRuneAnyNotNL:
			r, width := i.step(pos)
			if r == '\n' || r == endOfText {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstRuneAny:
			r, width := i.step(pos)
			if r == endOfText {
				continue
			}
			pos += width
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstCapture:
			switch arg {
			case 0:
				if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
					// Capture pos to register, but save old value.
					// The old value travels in the job's pos field.
					b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
					b.cap[inst.Arg] = pos
				}
				pc = inst.Out
				goto CheckAndLoop
			case 1:
				// Finished inst.Out; restore the old value.
				// Here pos is the saved register value pushed above.
				b.cap[inst.Arg] = pos
				continue

			}
			panic("bad arg in InstCapture")
			continue

		case syntax.InstEmptyWidth:
			// Skip this path unless every required empty-width condition holds at pos.
			if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
				continue
			}
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstNop:
			pc = inst.Out
			goto CheckAndLoop

		case syntax.InstMatch:
			// We found a match. If the caller doesn't care
			// where the match is, no point going further.
			if len(b.cap) == 0 {
				m.matched = true
				return m.matched
			}

			// Record best match so far.
			// Only need to check end point, because this entire
			// call is only considering one start position.
			if len(b.cap) > 1 {
				b.cap[1] = pos
			}
			if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
				copy(m.matchcap, b.cap)
			}
			m.matched = true

			// If going for first match, we're done.
			if !longest {
				return m.matched
			}

			// If we used the entire text, no longer match is possible.
			if pos == b.end {
				return m.matched
			}

			// Otherwise, continue on in hope of a longer match.
			continue
		}
		panic("unreachable")
	}

	return m.matched
}

// backtrack runs a backtracking search of prog on the input starting at pos.
// end is the end position in the input and ncap the number of capture
// registers to track. It panics if the input cannot check prefixes
// (a RuneReader).
func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
	if !i.canCheckPrefix() {
		panic("backtrack called for a RuneReader")
	}

	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
		// Anchored match, past beginning of text.
		return false
	}

	b := m.b
	b.reset(end, ncap)

	m.matchcap = m.matchcap[:ncap]
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}

	// Anchored search must start at the beginning of the input
	if startCond&syntax.EmptyBeginText != 0 {
		if len(b.cap) > 0 {
			b.cap[0] = pos
		}
		return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
	}

	// Unanchored search, starting from each possible text position.
	// Notice that we have to try the empty string at the end of
	// the text, so the loop condition is pos <= end, not pos < end.
	// This looks like it's quadratic in the size of the text,
	// but we are not clearing visited between calls to tryBacktrack,
	// so no work is duplicated and it ends up still being linear.
	width := -1
	for ; pos <= end && width != 0; pos += width {
		if len(m.re.prefix) > 0 {
			// Match requires literal prefix; fast search for it.
			advance := i.index(m.re, pos)
			if advance < 0 {
				return false
			}
			pos += advance
		}

		if len(b.cap) > 0 {
			b.cap[0] = pos
		}
		if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
			// Match must be leftmost; done.
			return true
		}
		_, width = i.step(pos)
	}
	return false
}

--------------------------------------------------------------------------------
/regex/machine.go:
--------------------------------------------------------------------------------
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package regex

import (
	"io"
	"regexp/syntax"
)

// A queue is a 'sparse array' holding pending threads of execution.
// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
type queue struct {
	sparse []uint32
	dense  []entry
}

// An entry is an entry on a queue.
// It holds both the instruction pc and the actual thread.
// Some queue entries are just place holders so that the machine
// knows it has considered that pc. Such entries have t == nil.
type entry struct {
	pc uint32
	t  *thread
}

// A thread is the state of a single path through the machine:
// an instruction and a corresponding capture array.
30 | // See http://swtch.com/~rsc/regexp/regexp2.html 31 | type thread struct { 32 | inst *syntax.Inst 33 | cap []int 34 | } 35 | 36 | // A machine holds all the state during an NFA simulation for p. 37 | type machine struct { 38 | re *Regexp // corresponding Regexp 39 | p *syntax.Prog // compiled program 40 | op *onePassProg // compiled onepass program, or notOnePass 41 | maxBitStateLen int // max length of string to search with bitstate 42 | b *bitState // state for backtracker, allocated lazily 43 | q0, q1 queue // two queues for runq, nextq 44 | pool []*thread // pool of available threads 45 | matched bool // whether a match was found 46 | matchcap []int // capture information for the match 47 | 48 | // cached inputs, to avoid allocation 49 | inputBytes inputBytes 50 | inputString inputString 51 | inputReader inputReader 52 | } 53 | 54 | func (m *machine) newInputBytes(b []byte) input { 55 | m.inputBytes.str = b 56 | return &m.inputBytes 57 | } 58 | 59 | func (m *machine) newInputString(s string) input { 60 | m.inputString.str = s 61 | return &m.inputString 62 | } 63 | 64 | func (m *machine) newInputReader(r io.RuneReader) input { 65 | m.inputReader.r = r 66 | m.inputReader.atEOT = false 67 | m.inputReader.pos = 0 68 | return &m.inputReader 69 | } 70 | 71 | // progMachine returns a new machine running the prog p. 72 | func progMachine(p *syntax.Prog, op *onePassProg) *machine { 73 | m := &machine{p: p, op: op} 74 | n := len(m.p.Inst) 75 | m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 76 | m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 77 | ncap := p.NumCap 78 | if ncap < 2 { 79 | ncap = 2 80 | } 81 | if op == notOnePass { 82 | m.maxBitStateLen = maxBitStateLen(p) 83 | } 84 | m.matchcap = make([]int, ncap) 85 | return m 86 | } 87 | 88 | func (m *machine) init(ncap int) { 89 | for _, t := range m.pool { 90 | t.cap = t.cap[:ncap] 91 | } 92 | m.matchcap = m.matchcap[:ncap] 93 | } 94 | 95 | // alloc allocates a new thread with the given instruction. 
// It uses the free pool if possible.
func (m *machine) alloc(i *syntax.Inst) *thread {
	var t *thread
	if n := len(m.pool); n > 0 {
		// Reuse the most recently freed thread.
		t = m.pool[n-1]
		m.pool = m.pool[:n-1]
	} else {
		t = new(thread)
		// Size the capture array like the machine's match captures.
		t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
	}
	t.inst = i
	return t
}

// free returns t to the free pool.
// It also drops the references held by the machine's cached inputs.
func (m *machine) free(t *thread) {
	m.inputBytes.str = nil
	m.inputString.str = ""
	m.inputReader.r = nil
	m.pool = append(m.pool, t)
}

// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
func (m *machine) match(i input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for i := range m.matchcap {
		m.matchcap[i] = -1
	}
	runq, nextq := &m.q0, &m.q1
	// r is the rune at pos and r1 the rune after it; width and width1 are
	// their encoded widths.
	r, r1 := endOfText, endOfText
	width, width1 := 0, 0
	r, width = i.step(pos)
	if r != endOfText {
		r1, width1 = i.step(pos + width)
	}
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyOpContext(-1, r)
	} else {
		flag = i.context(pos)
	}
	for {
		if len(runq.dense) == 0 {
			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
				// Anchored match, past beginning of text.
				break
			}
			if m.matched {
				// Have match; finished exploring alternatives.
				break
			}
			if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
				// Match requires literal prefix; fast search for it.
				advance := i.index(m.re, pos)
				if advance < 0 {
					break
				}
				pos += advance
				r, width = i.step(pos)
				r1, width1 = i.step(pos + width)
			}
		}
		if !m.matched {
			// No match yet: start a new thread at the current position.
			if len(m.matchcap) > 0 {
				m.matchcap[0] = pos
			}
			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
		}
		flag = syntax.EmptyOpContext(r, r1)
		m.step(runq, nextq, pos, pos+width, r, flag)
		if width == 0 {
			// Zero width means the end of the input was reached.
			break
		}
		if len(m.matchcap) == 0 && m.matched {
			// Found a match and not paying attention
			// to where it is, so any match will do.
			break
		}
		pos += width
		r, width = r1, width1
		if r != endOfText {
			r1, width1 = i.step(pos + width)
		}
		runq, nextq = nextq, runq
	}
	m.clear(nextq)
	return m.matched
}

// clear frees all threads on the thread queue.
// Threads are appended to the pool directly rather than through free.
func (m *machine) clear(q *queue) {
	for _, d := range q.dense {
		if d.t != nil {
			// m.free(d.t)
			m.pool = append(m.pool, d.t)
		}
	}
	q.dense = q.dense[:0]
}

// step executes one step of the machine, running each of the threads
// on runq and appending new threads to nextq.
// The step processes the rune c (which may be endOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) {
	longest := m.re.longest
	for j := 0; j < len(runq.dense); j++ {
		d := &runq.dense[j]
		t := d.t
		if t == nil {
			// Placeholder entry: nothing to run.
			continue
		}
		if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
			// In longest-match mode, drop threads that start after the
			// match already recorded.
			// m.free(t)
			m.pool = append(m.pool, t)
			continue
		}
		i := t.inst
		add := false
		switch i.Op {
		default:
			panic("bad inst")

		case syntax.InstMatch:
			if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
				t.cap[1] = pos
				copy(m.matchcap, t.cap)
			}
			if !longest {
				// First-match mode: cut off all lower-priority threads.
				for _, d := range runq.dense[j+1:] {
					if d.t != nil {
						// m.free(d.t)
						m.pool = append(m.pool, d.t)
					}
				}
				runq.dense = runq.dense[:0]
			}
			m.matched = true

		case syntax.InstRune:
			add = i.MatchRune(c)
		case syntax.InstRune1:
			add = c == i.Rune[0]
		case syntax.InstRuneAny:
			add = true
		case syntax.InstRuneAnyNotNL:
			add = c != '\n'
		}
		if add {
			// add returns the thread back if it did not consume it.
			t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
		}
		if t != nil {
			// m.free(t)
			m.pool = append(m.pool, t)
		}
	}
	runq.dense = runq.dense[:0]
}

// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond. pos gives the current position
// in the input.
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread {
	if pc == 0 {
		return t
	}
	// Sparse-set membership test: pc is already queued.
	if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
		return t
	}

	// Reserve a dense slot for pc; it stays a placeholder (t == nil)
	// unless the instruction below attaches a thread.
	j := len(q.dense)
	q.dense = q.dense[:j+1]
	d := &q.dense[j]
	d.t = nil
	d.pc = pc
	q.sparse[pc] = uint32(j)

	i := &m.p.Inst[pc]
	switch i.Op {
	default:
		panic("unhandled")
	case syntax.InstFail:
		// nothing
	case syntax.InstAlt, syntax.InstAltMatch:
		t = m.add(q, i.Out, pos, cap, cond, t)
		t = m.add(q, i.Arg, pos, cap, cond, t)
	case syntax.InstEmptyWidth:
		// Follow the edge only when all required conditions are in cond.
		if syntax.EmptyOp(i.Arg)&^cond == 0 {
			t = m.add(q, i.Out, pos, cap, cond, t)
		}
	case syntax.InstNop:
		t = m.add(q, i.Out, pos, cap, cond, t)
	case syntax.InstCapture:
		if int(i.Arg) < len(cap) {
			// Record pos in the capture slot for the recursive call,
			// then restore the previous value.
			opos := cap[i.Arg]
			cap[i.Arg] = pos
			m.add(q, i.Out, pos, cap, cond, nil)
			cap[i.Arg] = opos
		} else {
			t = m.add(q, i.Out, pos, cap, cond, t)
		}
	case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		if t == nil {
			t = m.alloc(i)
		} else {
			t.inst = i
		}
		// Copy the captures unless t.cap and cap already share storage.
		if len(cap) > 0 && &t.cap[0] != &cap[0] {
			copy(t.cap, cap)
		}
		d.t = t
		// The thread now belongs to the queue; hand nothing back.
		t = nil
	}
	return t
}

// empty is a non-nil 0-element slice,
// so doExecute can avoid an allocation
// when 0 captures are requested from a successful match.
var empty = make([]int, 0)

// doExecute finds the leftmost match in the input and returns
// the position of its subexpressions.
328 | func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int { 329 | m := re.get() 330 | var i input 331 | var size int 332 | if r != nil { 333 | i = m.newInputReader(r) 334 | } else if b != nil { 335 | i = m.newInputBytes(b) 336 | size = len(b) 337 | } else { 338 | i = m.newInputString(s) 339 | size = len(s) 340 | } 341 | if size < m.maxBitStateLen && r == nil { 342 | if m.b == nil { 343 | m.b = newBitState(m.p) 344 | } 345 | if !m.backtrack(i, pos, size, ncap) { 346 | re.put(m) 347 | return nil 348 | } 349 | } else { 350 | m.init(ncap) 351 | if !m.match(i, pos) { 352 | re.put(m) 353 | return nil 354 | } 355 | } 356 | if ncap == 0 { 357 | re.put(m) 358 | return empty // empty but not nil 359 | } 360 | cap := make([]int, len(m.matchcap)) 361 | copy(cap, m.matchcap) 362 | re.put(m) 363 | return cap 364 | } 365 | -------------------------------------------------------------------------------- /regex/onepass.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package regex 6 | 7 | import ( 8 | "bytes" 9 | "regexp/syntax" 10 | "sort" 11 | "unicode" 12 | ) 13 | 14 | // "One-pass" regexp execution. 15 | // Some regexps can be analyzed to determine that they never need 16 | // backtracking: they are guaranteed to run in one pass over the string 17 | // without bothering to save all the usual NFA state. 18 | // Detect those and execute them more quickly. 19 | 20 | // A onePassProg is a compiled one-pass regular expression program. 21 | // It is the same as syntax.Prog except for the use of onePassInst. 
type onePassProg struct {
	Inst   []onePassInst
	Start  int // index of start instruction
	NumCap int // number of InstCapture insts in re
}

// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
	syntax.Inst
	Next []uint32
}

// onePassPrefix returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match, i.e. the literal is followed directly by an
// end-of-text assertion and then the match instruction. Pc is the index
// of the instruction that follows the last rune instruction of the prefix.
// onePassPrefix skips over the mandatory EmptyBeginText; if the program
// does not start with one, or no literal follows it, the prefix is empty
// and pc is p.Start.
func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
	i := &p.Inst[p.Start]
	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}
	pc = i.Out
	i = &p.Inst[pc]
	for i.Op == syntax.InstNop {
		pc = i.Out
		i = &p.Inst[pc]
	}
	// Avoid allocation of buffer if prefix is empty.
	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
		return "", i.Op == syntax.InstMatch, uint32(p.Start)
	}

	// Have prefix; gather characters.
	var buf bytes.Buffer
	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
		buf.WriteRune(i.Rune[0])
		pc, i = i.Out, &p.Inst[i.Out]
	}

	// The prefix is the whole match only when the text must end right after
	// it. Testing for a begin-text assertion here (as the code used to do)
	// can never describe a complete literal.
	complete = i.Op == syntax.InstEmptyWidth &&
		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
		p.Inst[i.Out].Op == syntax.InstMatch
	return buf.String(), complete, pc
}

// onePassNext selects the next actionable state of the prog, based on the input character.
// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
// One of the alternates may ultimately lead without input to end of line. If the instruction
// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
// It returns 0 when r selects no alternative.
func onePassNext(i *onePassInst, r rune) uint32 {
	next := i.MatchRunePos(r)
	if next >= 0 {
		return i.Next[next]
	}
	if i.Op == syntax.InstAltMatch {
		return i.Out
	}
	return 0
}

// iop returns i.Op with the specialised rune opcodes (InstRune1,
// InstRuneAny, InstRuneAnyNotNL) folded into InstRune.
func iop(i *syntax.Inst) syntax.InstOp {
	op := i.Op
	switch op {
	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		op = syntax.InstRune
	}
	return op
}

// Sparse Array implementation is used as a queueOnePass.
type queueOnePass struct {
	sparse          []uint32
	dense           []uint32
	size, nextIndex uint32
}

// empty reports whether every inserted element has been consumed by next.
func (q *queueOnePass) empty() bool {
	return q.nextIndex >= q.size
}

// next returns the next unconsumed element and advances the read position.
func (q *queueOnePass) next() (n uint32) {
	n = q.dense[q.nextIndex]
	q.nextIndex++
	return
}

// clear removes all elements and rewinds the read position.
func (q *queueOnePass) clear() {
	q.size = 0
	q.nextIndex = 0
}

// reset rewinds the read position without removing elements.
func (q *queueOnePass) reset() {
	q.nextIndex = 0
}

// contains reports whether u has been inserted. Values outside the
// queue's capacity are never contained.
func (q *queueOnePass) contains(u uint32) bool {
	if u >= uint32(len(q.sparse)) {
		return false
	}
	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
}

// insert adds u unless it is already present.
func (q *queueOnePass) insert(u uint32) {
	if !q.contains(u) {
		q.insertNew(u)
	}
}

// insertNew adds u without checking for duplicates. Values outside the
// queue's capacity are ignored.
func (q *queueOnePass) insertNew(u uint32) {
	if u >= uint32(len(q.sparse)) {
		return
	}
	q.sparse[u] = q.size
	q.dense[q.size] = u
	q.size++
}

// newQueue returns an empty queueOnePass able to hold the values [0, size).
func newQueue(size int) (q *queueOnePass) {
	return &queueOnePass{
		sparse: make([]uint32, size),
		dense:  make([]uint32, size),
	}
}

// mergeFailed is the single NextIp element returned by mergeRuneSets when
// the two rune sets cannot be merged.
const mergeFailed = uint32(0xffffffff)

var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)

// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
// NextIp array with the single element mergeFailed is returned.
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
// It panics if either input has odd length.
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	leftLen := len(*leftRunes)
	rightLen := len(*rightRunes)
	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
		panic("mergeRuneSets odd length []rune")
	}
	var (
		lx, rx int
	)
	merged := make([]rune, 0)
	next := make([]uint32, 0)

	// ix is the index of the upper bound of the last range in merged.
	ix := -1
	// extend appends the range starting at (*newArray)[*newLow] together
	// with its target pc. It fails when that range starts at or below the
	// end of the last merged range.
	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
			return false
		}
		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
		*newLow += 2
		ix += 2
		next = append(next, pc)
		return true
	}

	for lx < leftLen || rx < rightLen {
		var ok bool
		switch {
		case rx >= rightLen:
			ok = extend(&lx, leftRunes, leftPC)
		case lx >= leftLen:
			ok = extend(&rx, rightRunes, rightPC)
		case (*rightRunes)[rx] < (*leftRunes)[lx]:
			ok = extend(&rx, rightRunes, rightPC)
		default:
			ok = extend(&lx, leftRunes, leftPC)
		}
		if !ok {
			return noRune, noNext
		}
	}
	return merged, next
}

// cleanupOnePass drops working memory, and restores certain shortcut instructions.
func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
	for ix, instOriginal := range original.Inst {
		switch instOriginal.Op {
		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
			// These keep their Next dispatch table.
		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
			prog.Inst[ix].Next = nil
		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
			// Restore the original shortcut instruction.
			prog.Inst[ix].Next = nil
			prog.Inst[ix] = onePassInst{Inst: instOriginal}
		}
	}
}

// onePassCopy creates a copy of the original Prog, as we'll be modifying it
func onePassCopy(prog *syntax.Prog) *onePassProg {
	p := &onePassProg{
		Start:  prog.Start,
		NumCap: prog.NumCap,
	}
	for _, inst := range prog.Inst {
		p.Inst = append(p.Inst, onePassInst{Inst: inst})
	}

	// rewrites one or more common Prog constructs that enable some otherwise
	// non-onepass Progs to be onepass. A:BC (for example) means an InstAlt at
	// ip A, that points to ips B & C.
	// A:BC + B:DA => A:BC + B:CD
	// A:BC + B:DC => A:DC + B:DC
	for pc := range p.Inst {
		switch p.Inst[pc].Op {
		default:
			continue
		case syntax.InstAlt, syntax.InstAltMatch:
			// A:Bx + B:Ay
			// p_A_Alt will point at the leg of A that targets another Alt,
			// p_A_Other at the remaining leg.
			p_A_Other := &p.Inst[pc].Out
			p_A_Alt := &p.Inst[pc].Arg
			// make sure a target is another Alt
			instAlt := p.Inst[*p_A_Alt]
			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
				instAlt = p.Inst[*p_A_Alt]
				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
					continue
				}
			}
			instOther := p.Inst[*p_A_Other]
			// Analyzing both legs pointing to Alts is for another day
			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
				// too complicated
				continue
			}
			// simple empty transition loop
			// A:BC + B:DA => A:BC + B:DC
			p_B_Alt := &p.Inst[*p_A_Alt].Out
			p_B_Other := &p.Inst[*p_A_Alt].Arg
			patch := false
			if instAlt.Out == uint32(pc) {
				patch = true
			} else if instAlt.Arg == uint32(pc) {
				patch = true
				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
			}
			if patch {
				// B's leg that looped back to A now goes to A's other target.
				*p_B_Alt = *p_A_Other
			}

			// empty transition to common target
			// A:BC + B:DC => A:DC + B:DC
			if *p_A_Other == *p_B_Alt {
				*p_A_Alt = *p_B_Other
			}
		}
	}
	return p
}

// runeSlice exists to permit sorting the case-folded rune sets.
type runeSlice []rune

func (p runeSlice) Len() int           { return len(p) }
func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

// Sort is a convenience method.
290 | func (p runeSlice) Sort() { 291 | sort.Sort(p) 292 | } 293 | 294 | var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 295 | var anyRune = []rune{0, unicode.MaxRune} 296 | 297 | // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, 298 | // the match engine can always tell which branch to take. The routine may modify 299 | // p if it is turned into a onepass Prog. If it isn't possible for this to be a 300 | // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive 301 | // to the size of the Prog. 302 | func makeOnePass(p *onePassProg) *onePassProg { 303 | // If the machine is very long, it's not worth the time to check if we can use one pass. 304 | if len(p.Inst) >= 1000 { 305 | return notOnePass 306 | } 307 | 308 | var ( 309 | instQueue = newQueue(len(p.Inst)) 310 | visitQueue = newQueue(len(p.Inst)) 311 | build func(uint32, *queueOnePass) 312 | check func(uint32, map[uint32]bool) bool 313 | onePassRunes = make([][]rune, len(p.Inst)) 314 | ) 315 | build = func(pc uint32, q *queueOnePass) { 316 | if q.contains(pc) { 317 | return 318 | } 319 | inst := p.Inst[pc] 320 | switch inst.Op { 321 | case syntax.InstAlt, syntax.InstAltMatch: 322 | q.insert(inst.Out) 323 | build(inst.Out, q) 324 | q.insert(inst.Arg) 325 | case syntax.InstMatch, syntax.InstFail: 326 | default: 327 | q.insert(inst.Out) 328 | } 329 | } 330 | 331 | // check that paths from Alt instructions are unambiguous, and rebuild the new 332 | // program as a onepass program 333 | check = func(pc uint32, m map[uint32]bool) (ok bool) { 334 | ok = true 335 | inst := &p.Inst[pc] 336 | if visitQueue.contains(pc) { 337 | return 338 | } 339 | visitQueue.insert(pc) 340 | switch inst.Op { 341 | case syntax.InstAlt, syntax.InstAltMatch: 342 | ok = check(inst.Out, m) && check(inst.Arg, m) 343 | // check no-input paths to InstMatch 344 | matchOut := m[inst.Out] 345 | matchArg := m[inst.Arg] 346 | if matchOut && matchArg { 347 | ok = false 348 | break 349 
| } 350 | // Match on empty goes in inst.Out 351 | if matchArg { 352 | inst.Out, inst.Arg = inst.Arg, inst.Out 353 | matchOut, matchArg = matchArg, matchOut 354 | } 355 | if matchOut { 356 | m[pc] = true 357 | inst.Op = syntax.InstAltMatch 358 | } 359 | 360 | // build a dispatch operator from the two legs of the alt. 361 | onePassRunes[pc], inst.Next = mergeRuneSets( 362 | &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) 363 | if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { 364 | ok = false 365 | break 366 | } 367 | case syntax.InstCapture, syntax.InstNop: 368 | ok = check(inst.Out, m) 369 | m[pc] = m[inst.Out] 370 | // pass matching runes back through these no-ops. 371 | onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 372 | inst.Next = []uint32{} 373 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 374 | inst.Next = append(inst.Next, inst.Out) 375 | } 376 | case syntax.InstEmptyWidth: 377 | ok = check(inst.Out, m) 378 | m[pc] = m[inst.Out] 379 | onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 380 | inst.Next = []uint32{} 381 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 382 | inst.Next = append(inst.Next, inst.Out) 383 | } 384 | case syntax.InstMatch, syntax.InstFail: 385 | m[pc] = inst.Op == syntax.InstMatch 386 | break 387 | case syntax.InstRune: 388 | ok = check(inst.Out, m) 389 | m[pc] = false 390 | if len(inst.Next) > 0 { 391 | break 392 | } 393 | if len(inst.Rune) == 0 { 394 | onePassRunes[pc] = []rune{} 395 | inst.Next = []uint32{inst.Out} 396 | break 397 | } 398 | runes := make([]rune, 0) 399 | if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 400 | r0 := inst.Rune[0] 401 | runes = append(runes, r0, r0) 402 | for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 403 | runes = append(runes, r1, r1) 404 | } 405 | sort.Sort(runeSlice(runes)) 406 | } else { 407 | runes = append(runes, inst.Rune...) 
408 | } 409 | onePassRunes[pc] = runes 410 | inst.Next = []uint32{} 411 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 412 | inst.Next = append(inst.Next, inst.Out) 413 | } 414 | inst.Op = syntax.InstRune 415 | case syntax.InstRune1: 416 | ok = check(inst.Out, m) 417 | m[pc] = false 418 | if len(inst.Next) > 0 { 419 | break 420 | } 421 | runes := []rune{} 422 | // expand case-folded runes 423 | if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 424 | r0 := inst.Rune[0] 425 | runes = append(runes, r0, r0) 426 | for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 427 | runes = append(runes, r1, r1) 428 | } 429 | sort.Sort(runeSlice(runes)) 430 | } else { 431 | runes = append(runes, inst.Rune[0], inst.Rune[0]) 432 | } 433 | onePassRunes[pc] = runes 434 | inst.Next = []uint32{} 435 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 436 | inst.Next = append(inst.Next, inst.Out) 437 | } 438 | inst.Op = syntax.InstRune 439 | case syntax.InstRuneAny: 440 | ok = check(inst.Out, m) 441 | m[pc] = false 442 | if len(inst.Next) > 0 { 443 | break 444 | } 445 | onePassRunes[pc] = append([]rune{}, anyRune...) 446 | inst.Next = []uint32{inst.Out} 447 | case syntax.InstRuneAnyNotNL: 448 | ok = check(inst.Out, m) 449 | m[pc] = false 450 | if len(inst.Next) > 0 { 451 | break 452 | } 453 | onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) 
454 | inst.Next = []uint32{} 455 | for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { 456 | inst.Next = append(inst.Next, inst.Out) 457 | } 458 | } 459 | return 460 | } 461 | 462 | instQueue.clear() 463 | instQueue.insert(uint32(p.Start)) 464 | m := make(map[uint32]bool, len(p.Inst)) 465 | for !instQueue.empty() { 466 | pc := instQueue.next() 467 | inst := p.Inst[pc] 468 | visitQueue.clear() 469 | if !check(uint32(pc), m) { 470 | p = notOnePass 471 | break 472 | } 473 | switch inst.Op { 474 | case syntax.InstAlt, syntax.InstAltMatch: 475 | instQueue.insert(inst.Out) 476 | instQueue.insert(inst.Arg) 477 | case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop: 478 | instQueue.insert(inst.Out) 479 | case syntax.InstMatch: 480 | case syntax.InstFail: 481 | case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 482 | default: 483 | } 484 | } 485 | if p != notOnePass { 486 | for i := range p.Inst { 487 | p.Inst[i].Rune = onePassRunes[i] 488 | } 489 | } 490 | return p 491 | } 492 | 493 | // walk visits each Inst in the prog once, and applies the argument 494 | // function(ip, next), in pre-order. 
495 | func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) { 496 | var walk1 func(uint32) 497 | progQueue := newQueue(len(prog.Inst)) 498 | walk1 = func(ip uint32) { 499 | if progQueue.contains(ip) { 500 | return 501 | } 502 | progQueue.insert(ip) 503 | inst := prog.Inst[ip] 504 | switch inst.Op { 505 | case syntax.InstAlt, syntax.InstAltMatch: 506 | for _, f := range funcs { 507 | f(ip, inst.Out) 508 | f(ip, inst.Arg) 509 | } 510 | walk1(inst.Out) 511 | walk1(inst.Arg) 512 | default: 513 | for _, f := range funcs { 514 | f(ip, inst.Out) 515 | } 516 | walk1(inst.Out) 517 | } 518 | } 519 | walk1(uint32(prog.Start)) 520 | } 521 | 522 | // find returns the Insts that match the argument predicate function 523 | func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) { 524 | matches = []uint32{} 525 | 526 | for ip := range prog.Inst { 527 | if f(prog, ip) { 528 | matches = append(matches, uint32(ip)) 529 | } 530 | } 531 | return 532 | } 533 | 534 | var notOnePass *onePassProg = nil 535 | 536 | // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog 537 | // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the 538 | // Prog cannot be converted. For a one pass prog, the fundamental condition that must 539 | // be true is: at any InstAlt, there must be no ambiguity about what branch to take. 
540 | func compileOnePass(prog *syntax.Prog) (p *onePassProg) { 541 | if prog.Start == 0 { 542 | return notOnePass 543 | } 544 | // onepass regexp is anchored 545 | if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || 546 | syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { 547 | return notOnePass 548 | } 549 | // every instruction leading to InstMatch must be EmptyEndText 550 | for _, inst := range prog.Inst { 551 | opOut := prog.Inst[inst.Out].Op 552 | switch inst.Op { 553 | default: 554 | if opOut == syntax.InstMatch { 555 | return notOnePass 556 | } 557 | case syntax.InstAlt, syntax.InstAltMatch: 558 | if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { 559 | return notOnePass 560 | } 561 | case syntax.InstEmptyWidth: 562 | if opOut == syntax.InstMatch { 563 | if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { 564 | continue 565 | } 566 | return notOnePass 567 | } 568 | } 569 | } 570 | // Creates a slightly optimized copy of the original Prog 571 | // that cleans up some Prog idioms that block valid onepass programs 572 | p = onePassCopy(prog) 573 | 574 | // checkAmbiguity on InstAlts, build onepass Prog if possible 575 | p = makeOnePass(p) 576 | 577 | if p != notOnePass { 578 | cleanupOnePass(p, prog) 579 | } 580 | return p 581 | } 582 | -------------------------------------------------------------------------------- /regex/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package regex is a fork of the standard library regexp package. 6 | // It contains a few small additions that make it possible to 7 | // interact more directly with the underlying DFA machinery. 8 | // 9 | // Package regexp implements regular expression search. 
10 | // 11 | // The syntax of the regular expressions accepted is the same 12 | // general syntax used by Perl, Python, and other languages. 13 | // More precisely, it is the syntax accepted by RE2 and described at 14 | // https://golang.org/s/re2syntax, except for \C. 15 | // For an overview of the syntax, run 16 | // go doc regexp/syntax 17 | // 18 | // The regexp implementation provided by this package is 19 | // guaranteed to run in time linear in the size of the input. 20 | // (This is a property not guaranteed by most open source 21 | // implementations of regular expressions.) For more information 22 | // about this property, see 23 | // http://swtch.com/~rsc/regexp/regexp1.html 24 | // or any book about automata theory. 25 | // 26 | // All characters are UTF-8-encoded code points. 27 | // 28 | // There are 16 methods of Regexp that match a regular expression and identify 29 | // the matched text. Their names are matched by this regular expression: 30 | // 31 | // Find(All)?(String)?(Submatch)?(Index)? 32 | // 33 | // If 'All' is present, the routine matches successive non-overlapping 34 | // matches of the entire expression. Empty matches abutting a preceding 35 | // match are ignored. The return value is a slice containing the successive 36 | // return values of the corresponding non-'All' routine. These routines take 37 | // an extra integer argument, n; if n >= 0, the function returns at most n 38 | // matches/submatches. 39 | // 40 | // If 'String' is present, the argument is a string; otherwise it is a slice 41 | // of bytes; return values are adjusted as appropriate. 42 | // 43 | // If 'Submatch' is present, the return value is a slice identifying the 44 | // successive submatches of the expression. Submatches are matches of 45 | // parenthesized subexpressions (also known as capturing groups) within the 46 | // regular expression, numbered from left to right in order of opening 47 | // parenthesis. 
Submatch 0 is the match of the entire expression, submatch 1 48 | // the match of the first parenthesized subexpression, and so on. 49 | // 50 | // If 'Index' is present, matches and submatches are identified by byte index 51 | // pairs within the input string: result[2*n:2*n+2] identifies the indexes of 52 | // the nth submatch. The pair for n==0 identifies the match of the entire 53 | // expression. If 'Index' is not present, the match is identified by the 54 | // text of the match/submatch. If an index is negative, it means that 55 | // subexpression did not match any string in the input. 56 | // 57 | // There is also a subset of the methods that can be applied to text read 58 | // from a RuneReader: 59 | // 60 | // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 61 | // 62 | // This set may grow. Note that regular expression matches may need to 63 | // examine text beyond the text returned by a match, so the methods that 64 | // match text from a RuneReader may read arbitrarily far into the input 65 | // before returning. 66 | // 67 | // (There are a few other methods that do not match this pattern.) 68 | // 69 | package regex 70 | 71 | import ( 72 | "bytes" 73 | "io" 74 | "regexp/syntax" 75 | "strconv" 76 | "strings" 77 | "sync" 78 | "unicode" 79 | "unicode/utf8" 80 | ) 81 | 82 | var debug = false 83 | 84 | // Regexp is the representation of a compiled regular expression. 85 | // A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// read-only after Compile
	expr           string         // source text; returned by String
	prog           *syntax.Prog   // compiled program
	onepass        *onePassProg   // onepass program or nil
	prefix         string         // required prefix in unanchored matches
	prefixBytes    []byte         // prefix, as a []byte
	prefixComplete bool           // prefix is the entire regexp
	prefixRune     rune           // first rune in prefix
	prefixEnd      uint32         // pc for last rune in prefix
	cond           syntax.EmptyOp // empty-width conditions required at start of match
	numSubexp      int            // value returned by NumSubexp
	subexpNames    []string       // value returned by SubexpNames
	longest        bool           // leftmost-longest flag; set at compile time or by Longest

	// cache of machines for running regexp
	mu      sync.Mutex // held by get and put while they touch machine
	machine []*machine
}

// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
	return re.expr
}

// Compile parses a regular expression and returns, if successful,
// a Regexp object that can be used to match against text.
//
// When matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses the one that a backtracking search would have found first.
// This so-called leftmost-first matching is the same semantics
// that Perl, Python, and other implementations use, although this
// package implements it without the expense of backtracking.
// For POSIX leftmost-longest matching, see CompilePOSIX.
func Compile(expr string) (*Regexp, error) {
	// Perl syntax flags, longest=false (leftmost-first).
	return compile(expr, syntax.Perl, false)
}

// CompileSyntax is like Compile but takes a syntax tree as input.
// The text reported by String is ast.String().
//
// NOTE: unlike Compile, CompileSyntax passes longest=true to compileSyntax,
// so the resulting Regexp is created with the leftmost-longest flag set.
func CompileSyntax(ast *syntax.Regexp) (*Regexp, error) {
	return compileSyntax(ast, ast.String(), true)
}

// CompilePOSIX is like Compile but restricts the regular expression
// to POSIX ERE (egrep) syntax and changes the match semantics to
// leftmost-longest.
//
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
// This so-called leftmost-longest matching is the same semantics
// that early regular expression implementations used and that POSIX
// specifies.
//
// However, there can be multiple leftmost-longest matches, with different
// submatch choices, and here this package diverges from POSIX.
// Among the possible leftmost-longest matches, this package chooses
// the one that a backtracking search would have found first, while POSIX
// specifies that the match be chosen to maximize the length of the first
// subexpression, then the second, and so on from left to right.
// The POSIX rule is computationally prohibitive and not even well-defined.
// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
func CompilePOSIX(expr string) (*Regexp, error) {
	// POSIX syntax flags, longest=true (leftmost-longest).
	return compile(expr, syntax.POSIX, true)
}

// Longest makes future searches prefer the leftmost-longest match.
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
157 | func (re *Regexp) Longest() { 158 | re.longest = true 159 | } 160 | 161 | func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 162 | re, err := syntax.Parse(expr, mode) 163 | if err != nil { 164 | return nil, err 165 | } 166 | return compileSyntax(re, expr, longest) 167 | } 168 | 169 | func compileSyntax(re *syntax.Regexp, expr string, longest bool) (*Regexp, error) { 170 | maxCap := re.MaxCap() 171 | capNames := re.CapNames() 172 | 173 | re = re.Simplify() 174 | prog, err := syntax.Compile(re) 175 | if err != nil { 176 | return nil, err 177 | } 178 | regexp := &Regexp{ 179 | expr: expr, 180 | prog: prog, 181 | onepass: compileOnePass(prog), 182 | numSubexp: maxCap, 183 | subexpNames: capNames, 184 | cond: prog.StartCond(), 185 | longest: longest, 186 | } 187 | if regexp.onepass == notOnePass { 188 | regexp.prefix, regexp.prefixComplete = prog.Prefix() 189 | } else { 190 | regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 191 | } 192 | if regexp.prefix != "" { 193 | // TODO(rsc): Remove this allocation by adding 194 | // IndexString to package bytes. 195 | regexp.prefixBytes = []byte(regexp.prefix) 196 | regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 197 | } 198 | return regexp, nil 199 | } 200 | 201 | // get returns a machine to use for matching re. 202 | // It uses the re's machine cache if possible, to avoid 203 | // unnecessary allocation. 204 | func (re *Regexp) get() *machine { 205 | re.mu.Lock() 206 | if n := len(re.machine); n > 0 { 207 | z := re.machine[n-1] 208 | re.machine = re.machine[:n-1] 209 | re.mu.Unlock() 210 | return z 211 | } 212 | re.mu.Unlock() 213 | z := progMachine(re.prog, re.onepass) 214 | z.re = re 215 | return z 216 | } 217 | 218 | // put returns a machine to the re's machine cache. 219 | // There is no attempt to limit the size of the cache, so it will 220 | // grow to the maximum number of simultaneous matches 221 | // run using re. 
(The cache empties when re gets garbage collected.) 222 | func (re *Regexp) put(z *machine) { 223 | re.mu.Lock() 224 | re.machine = append(re.machine, z) 225 | re.mu.Unlock() 226 | } 227 | 228 | // MustCompile is like Compile but panics if the expression cannot be parsed. 229 | // It simplifies safe initialization of global variables holding compiled regular 230 | // expressions. 231 | func MustCompile(str string) *Regexp { 232 | regexp, error := Compile(str) 233 | if error != nil { 234 | panic(`regexp: Compile(` + quote(str) + `): ` + error.Error()) 235 | } 236 | return regexp 237 | } 238 | 239 | // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 240 | // It simplifies safe initialization of global variables holding compiled regular 241 | // expressions. 242 | func MustCompilePOSIX(str string) *Regexp { 243 | regexp, error := CompilePOSIX(str) 244 | if error != nil { 245 | panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error()) 246 | } 247 | return regexp 248 | } 249 | 250 | func quote(s string) string { 251 | if strconv.CanBackquote(s) { 252 | return "`" + s + "`" 253 | } 254 | return strconv.Quote(s) 255 | } 256 | 257 | // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 258 | func (re *Regexp) NumSubexp() int { 259 | return re.numSubexp 260 | } 261 | 262 | // SubexpNames returns the names of the parenthesized subexpressions 263 | // in this Regexp. The name for the first sub-expression is names[1], 264 | // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 265 | // Since the Regexp as a whole cannot be named, names[0] is always 266 | // the empty string. The slice should not be modified. 267 | func (re *Regexp) SubexpNames() []string { 268 | return re.subexpNames 269 | } 270 | 271 | const endOfText rune = -1 272 | 273 | // input abstracts different representations of the input text. It provides 274 | // one-character lookahead. 
275 | type input interface { 276 | step(pos int) (r rune, width int) // advance one rune 277 | canCheckPrefix() bool // can we look ahead without losing info? 278 | hasPrefix(re *Regexp) bool 279 | index(re *Regexp, pos int) int 280 | context(pos int) syntax.EmptyOp 281 | } 282 | 283 | // inputString scans a string. 284 | type inputString struct { 285 | str string 286 | } 287 | 288 | func (i *inputString) step(pos int) (rune, int) { 289 | if pos < len(i.str) { 290 | c := i.str[pos] 291 | if c < utf8.RuneSelf { 292 | return rune(c), 1 293 | } 294 | return utf8.DecodeRuneInString(i.str[pos:]) 295 | } 296 | return endOfText, 0 297 | } 298 | 299 | func (i *inputString) canCheckPrefix() bool { 300 | return true 301 | } 302 | 303 | func (i *inputString) hasPrefix(re *Regexp) bool { 304 | return strings.HasPrefix(i.str, re.prefix) 305 | } 306 | 307 | func (i *inputString) index(re *Regexp, pos int) int { 308 | return strings.Index(i.str[pos:], re.prefix) 309 | } 310 | 311 | func (i *inputString) context(pos int) syntax.EmptyOp { 312 | r1, r2 := endOfText, endOfText 313 | if pos > 0 && pos <= len(i.str) { 314 | r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 315 | } 316 | if pos < len(i.str) { 317 | r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 318 | } 319 | return syntax.EmptyOpContext(r1, r2) 320 | } 321 | 322 | // inputBytes scans a byte slice. 
323 | type inputBytes struct { 324 | str []byte 325 | } 326 | 327 | func (i *inputBytes) step(pos int) (rune, int) { 328 | if pos < len(i.str) { 329 | c := i.str[pos] 330 | if c < utf8.RuneSelf { 331 | return rune(c), 1 332 | } 333 | return utf8.DecodeRune(i.str[pos:]) 334 | } 335 | return endOfText, 0 336 | } 337 | 338 | func (i *inputBytes) canCheckPrefix() bool { 339 | return true 340 | } 341 | 342 | func (i *inputBytes) hasPrefix(re *Regexp) bool { 343 | return bytes.HasPrefix(i.str, re.prefixBytes) 344 | } 345 | 346 | func (i *inputBytes) index(re *Regexp, pos int) int { 347 | return bytes.Index(i.str[pos:], re.prefixBytes) 348 | } 349 | 350 | func (i *inputBytes) context(pos int) syntax.EmptyOp { 351 | r1, r2 := endOfText, endOfText 352 | if pos > 0 && pos <= len(i.str) { 353 | r1, _ = utf8.DecodeLastRune(i.str[:pos]) 354 | } 355 | if pos < len(i.str) { 356 | r2, _ = utf8.DecodeRune(i.str[pos:]) 357 | } 358 | return syntax.EmptyOpContext(r1, r2) 359 | } 360 | 361 | // inputReader scans a RuneReader. 362 | type inputReader struct { 363 | r io.RuneReader 364 | atEOT bool 365 | pos int 366 | } 367 | 368 | func (i *inputReader) step(pos int) (rune, int) { 369 | if !i.atEOT && pos != i.pos { 370 | return endOfText, 0 371 | 372 | } 373 | r, w, err := i.r.ReadRune() 374 | if err != nil { 375 | i.atEOT = true 376 | return endOfText, 0 377 | } 378 | i.pos += w 379 | return r, w 380 | } 381 | 382 | func (i *inputReader) canCheckPrefix() bool { 383 | return false 384 | } 385 | 386 | func (i *inputReader) hasPrefix(re *Regexp) bool { 387 | return false 388 | } 389 | 390 | func (i *inputReader) index(re *Regexp, pos int) int { 391 | return -1 392 | } 393 | 394 | func (i *inputReader) context(pos int) syntax.EmptyOp { 395 | return 0 396 | } 397 | 398 | // LiteralPrefix returns a literal string that must begin any match 399 | // of the regular expression re. It returns the boolean true if the 400 | // literal string comprises the entire regular expression. 
401 | func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 402 | return re.prefix, re.prefixComplete 403 | } 404 | 405 | // MatchReader reports whether the Regexp matches the text read by the 406 | // RuneReader. 407 | func (re *Regexp) MatchReader(r io.RuneReader) bool { 408 | return re.doExecute(r, nil, "", 0, 0) != nil 409 | } 410 | 411 | // MatchString reports whether the Regexp matches the string s. 412 | func (re *Regexp) MatchString(s string) bool { 413 | return re.doExecute(nil, nil, s, 0, 0) != nil 414 | } 415 | 416 | // Match reports whether the Regexp matches the byte slice b. 417 | func (re *Regexp) Match(b []byte) bool { 418 | return re.doExecute(nil, b, "", 0, 0) != nil 419 | } 420 | 421 | // MatchReader checks whether a textual regular expression matches the text 422 | // read by the RuneReader. More complicated queries need to use Compile and 423 | // the full Regexp interface. 424 | func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 425 | re, err := Compile(pattern) 426 | if err != nil { 427 | return false, err 428 | } 429 | return re.MatchReader(r), nil 430 | } 431 | 432 | // MatchString checks whether a textual regular expression 433 | // matches a string. More complicated queries need 434 | // to use Compile and the full Regexp interface. 435 | func MatchString(pattern string, s string) (matched bool, err error) { 436 | re, err := Compile(pattern) 437 | if err != nil { 438 | return false, err 439 | } 440 | return re.MatchString(s), nil 441 | } 442 | 443 | // Match checks whether a textual regular expression 444 | // matches a byte slice. More complicated queries need 445 | // to use Compile and the full Regexp interface. 
446 | func Match(pattern string, b []byte) (matched bool, err error) { 447 | re, err := Compile(pattern) 448 | if err != nil { 449 | return false, err 450 | } 451 | return re.Match(b), nil 452 | } 453 | 454 | // ReplaceAllString returns a copy of src, replacing matches of the Regexp 455 | // with the replacement string repl. Inside repl, $ signs are interpreted as 456 | // in Expand, so for instance $1 represents the text of the first submatch. 457 | func (re *Regexp) ReplaceAllString(src, repl string) string { 458 | n := 2 459 | if strings.Index(repl, "$") >= 0 { 460 | n = 2 * (re.numSubexp + 1) 461 | } 462 | b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 463 | return re.expand(dst, repl, nil, src, match) 464 | }) 465 | return string(b) 466 | } 467 | 468 | // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 469 | // with the replacement string repl. The replacement repl is substituted directly, 470 | // without using Expand. 471 | func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 472 | return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 473 | return append(dst, repl...) 474 | })) 475 | } 476 | 477 | // ReplaceAllStringFunc returns a copy of src in which all matches of the 478 | // Regexp have been replaced by the return value of function repl applied 479 | // to the matched substring. The replacement returned by repl is substituted 480 | // directly, without using Expand. 481 | func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 482 | b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 483 | return append(dst, repl(src[match[0]:match[1]])...) 
484 | }) 485 | return string(b) 486 | } 487 | 488 | func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 489 | lastMatchEnd := 0 // end position of the most recent match 490 | searchPos := 0 // position where we next look for a match 491 | var buf []byte 492 | var endPos int 493 | if bsrc != nil { 494 | endPos = len(bsrc) 495 | } else { 496 | endPos = len(src) 497 | } 498 | for searchPos <= endPos { 499 | a := re.doExecute(nil, bsrc, src, searchPos, nmatch) 500 | if len(a) == 0 { 501 | break // no more matches 502 | } 503 | 504 | // Copy the unmatched characters before this match. 505 | if bsrc != nil { 506 | buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 507 | } else { 508 | buf = append(buf, src[lastMatchEnd:a[0]]...) 509 | } 510 | 511 | // Now insert a copy of the replacement string, but not for a 512 | // match of the empty string immediately after another match. 513 | // (Otherwise, we get double replacement for patterns that 514 | // match both empty and nonempty strings.) 515 | if a[1] > lastMatchEnd || a[0] == 0 { 516 | buf = repl(buf, a) 517 | } 518 | lastMatchEnd = a[1] 519 | 520 | // Advance past this match; always advance at least one character. 521 | var width int 522 | if bsrc != nil { 523 | _, width = utf8.DecodeRune(bsrc[searchPos:]) 524 | } else { 525 | _, width = utf8.DecodeRuneInString(src[searchPos:]) 526 | } 527 | if searchPos+width > a[1] { 528 | searchPos += width 529 | } else if searchPos+1 > a[1] { 530 | // This clause is only needed at the end of the input 531 | // string. In that case, DecodeRuneInString returns width=0. 532 | searchPos++ 533 | } else { 534 | searchPos = a[1] 535 | } 536 | } 537 | 538 | // Copy the unmatched characters after the last match. 539 | if bsrc != nil { 540 | buf = append(buf, bsrc[lastMatchEnd:]...) 541 | } else { 542 | buf = append(buf, src[lastMatchEnd:]...) 
543 | } 544 | 545 | return buf 546 | } 547 | 548 | // ReplaceAll returns a copy of src, replacing matches of the Regexp 549 | // with the replacement text repl. Inside repl, $ signs are interpreted as 550 | // in Expand, so for instance $1 represents the text of the first submatch. 551 | func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 552 | n := 2 553 | if bytes.IndexByte(repl, '$') >= 0 { 554 | n = 2 * (re.numSubexp + 1) 555 | } 556 | srepl := "" 557 | b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 558 | if len(srepl) != len(repl) { 559 | srepl = string(repl) 560 | } 561 | return re.expand(dst, srepl, src, "", match) 562 | }) 563 | return b 564 | } 565 | 566 | // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 567 | // with the replacement bytes repl. The replacement repl is substituted directly, 568 | // without using Expand. 569 | func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 570 | return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 571 | return append(dst, repl...) 572 | }) 573 | } 574 | 575 | // ReplaceAllFunc returns a copy of src in which all matches of the 576 | // Regexp have been replaced by the return value of function repl applied 577 | // to the matched byte slice. The replacement returned by repl is substituted 578 | // directly, without using Expand. 579 | func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 580 | return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 581 | return append(dst, repl(src[match[0]:match[1]])...) 582 | }) 583 | } 584 | 585 | var specialBytes = []byte(`\.+*?()|[]{}^$`) 586 | 587 | func special(b byte) bool { 588 | return bytes.IndexByte(specialBytes, b) >= 0 589 | } 590 | 591 | // QuoteMeta returns a string that quotes all regular expression metacharacters 592 | // inside the argument text; the returned string is a regular expression matching 593 | // the literal text. 
For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 594 | func QuoteMeta(s string) string { 595 | b := make([]byte, 2*len(s)) 596 | 597 | // A byte loop is correct because all metacharacters are ASCII. 598 | j := 0 599 | for i := 0; i < len(s); i++ { 600 | if special(s[i]) { 601 | b[j] = '\\' 602 | j++ 603 | } 604 | b[j] = s[i] 605 | j++ 606 | } 607 | return string(b[0:j]) 608 | } 609 | 610 | // The number of capture values in the program may correspond 611 | // to fewer capturing expressions than are in the regexp. 612 | // For example, "(a){0}" turns into an empty program, so the 613 | // maximum capture in the program is 0 but we need to return 614 | // an expression for \1. Pad appends -1s to the slice a as needed. 615 | func (re *Regexp) pad(a []int) []int { 616 | if a == nil { 617 | // No match. 618 | return nil 619 | } 620 | n := (1 + re.numSubexp) * 2 621 | for len(a) < n { 622 | a = append(a, -1) 623 | } 624 | return a 625 | } 626 | 627 | // Find matches in slice b if b is non-nil, otherwise find matches in string s. 628 | func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 629 | var end int 630 | if b == nil { 631 | end = len(s) 632 | } else { 633 | end = len(b) 634 | } 635 | 636 | for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 637 | matches := re.doExecute(nil, b, s, pos, re.prog.NumCap) 638 | if len(matches) == 0 { 639 | break 640 | } 641 | 642 | accept := true 643 | if matches[1] == pos { 644 | // We've found an empty match. 645 | if matches[0] == prevMatchEnd { 646 | // We don't allow an empty match right 647 | // after a previous match, so ignore it. 
648 | accept = false 649 | } 650 | var width int 651 | // TODO: use step() 652 | if b == nil { 653 | _, width = utf8.DecodeRuneInString(s[pos:end]) 654 | } else { 655 | _, width = utf8.DecodeRune(b[pos:end]) 656 | } 657 | if width > 0 { 658 | pos += width 659 | } else { 660 | pos = end + 1 661 | } 662 | } else { 663 | pos = matches[1] 664 | } 665 | prevMatchEnd = matches[1] 666 | 667 | if accept { 668 | deliver(re.pad(matches)) 669 | i++ 670 | } 671 | } 672 | } 673 | 674 | // Find returns a slice holding the text of the leftmost match in b of the regular expression. 675 | // A return value of nil indicates no match. 676 | func (re *Regexp) Find(b []byte) []byte { 677 | a := re.doExecute(nil, b, "", 0, 2) 678 | if a == nil { 679 | return nil 680 | } 681 | return b[a[0]:a[1]] 682 | } 683 | 684 | // FindIndex returns a two-element slice of integers defining the location of 685 | // the leftmost match in b of the regular expression. The match itself is at 686 | // b[loc[0]:loc[1]]. 687 | // A return value of nil indicates no match. 688 | func (re *Regexp) FindIndex(b []byte) (loc []int) { 689 | a := re.doExecute(nil, b, "", 0, 2) 690 | if a == nil { 691 | return nil 692 | } 693 | return a[0:2] 694 | } 695 | 696 | // FindString returns a string holding the text of the leftmost match in s of the regular 697 | // expression. If there is no match, the return value is an empty string, 698 | // but it will also be empty if the regular expression successfully matches 699 | // an empty string. Use FindStringIndex or FindStringSubmatch if it is 700 | // necessary to distinguish these cases. 701 | func (re *Regexp) FindString(s string) string { 702 | a := re.doExecute(nil, nil, s, 0, 2) 703 | if a == nil { 704 | return "" 705 | } 706 | return s[a[0]:a[1]] 707 | } 708 | 709 | // FindStringIndex returns a two-element slice of integers defining the 710 | // location of the leftmost match in s of the regular expression. The match 711 | // itself is at s[loc[0]:loc[1]]. 
712 | // A return value of nil indicates no match. 713 | func (re *Regexp) FindStringIndex(s string) (loc []int) { 714 | a := re.doExecute(nil, nil, s, 0, 2) 715 | if a == nil { 716 | return nil 717 | } 718 | return a[0:2] 719 | } 720 | 721 | // FindReaderIndex returns a two-element slice of integers defining the 722 | // location of the leftmost match of the regular expression in text read from 723 | // the RuneReader. The match text was found in the input stream at 724 | // byte offset loc[0] through loc[1]-1. 725 | // A return value of nil indicates no match. 726 | func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 727 | a := re.doExecute(r, nil, "", 0, 2) 728 | if a == nil { 729 | return nil 730 | } 731 | return a[0:2] 732 | } 733 | 734 | // FindSubmatch returns a slice of slices holding the text of the leftmost 735 | // match of the regular expression in b and the matches, if any, of its 736 | // subexpressions, as defined by the 'Submatch' descriptions in the package 737 | // comment. 738 | // A return value of nil indicates no match. 739 | func (re *Regexp) FindSubmatch(b []byte) [][]byte { 740 | a := re.doExecute(nil, b, "", 0, re.prog.NumCap) 741 | if a == nil { 742 | return nil 743 | } 744 | ret := make([][]byte, 1+re.numSubexp) 745 | for i := range ret { 746 | if 2*i < len(a) && a[2*i] >= 0 { 747 | ret[i] = b[a[2*i]:a[2*i+1]] 748 | } 749 | } 750 | return ret 751 | } 752 | 753 | // Expand appends template to dst and returns the result; during the 754 | // append, Expand replaces variables in the template with corresponding 755 | // matches drawn from src. The match slice should have been returned by 756 | // FindSubmatchIndex. 757 | // 758 | // In the template, a variable is denoted by a substring of the form 759 | // $name or ${name}, where name is a non-empty sequence of letters, 760 | // digits, and underscores. 
A purely numeric name like $1 refers to 761 | // the submatch with the corresponding index; other names refer to 762 | // capturing parentheses named with the (?P...) syntax. A 763 | // reference to an out of range or unmatched index or a name that is not 764 | // present in the regular expression is replaced with an empty slice. 765 | // 766 | // In the $name form, name is taken to be as long as possible: $1x is 767 | // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 768 | // 769 | // To insert a literal $ in the output, use $$ in the template. 770 | func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 771 | return re.expand(dst, string(template), src, "", match) 772 | } 773 | 774 | // ExpandString is like Expand but the template and source are strings. 775 | // It appends to and returns a byte slice in order to give the calling 776 | // code control over allocation. 777 | func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 778 | return re.expand(dst, template, nil, src, match) 779 | } 780 | 781 | func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 782 | for len(template) > 0 { 783 | i := strings.Index(template, "$") 784 | if i < 0 { 785 | break 786 | } 787 | dst = append(dst, template[:i]...) 788 | template = template[i:] 789 | if len(template) > 1 && template[1] == '$' { 790 | // Treat $$ as $. 791 | dst = append(dst, '$') 792 | template = template[2:] 793 | continue 794 | } 795 | name, num, rest, ok := extract(template) 796 | if !ok { 797 | // Malformed; treat $ as raw text. 798 | dst = append(dst, '$') 799 | template = template[1:] 800 | continue 801 | } 802 | template = rest 803 | if num >= 0 { 804 | if 2*num+1 < len(match) && match[2*num] >= 0 { 805 | if bsrc != nil { 806 | dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 807 | } else { 808 | dst = append(dst, src[match[2*num]:match[2*num+1]]...) 
809 | } 810 | } 811 | } else { 812 | for i, namei := range re.subexpNames { 813 | if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 814 | if bsrc != nil { 815 | dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 816 | } else { 817 | dst = append(dst, src[match[2*i]:match[2*i+1]]...) 818 | } 819 | break 820 | } 821 | } 822 | } 823 | } 824 | dst = append(dst, template...) 825 | return dst 826 | } 827 | 828 | // extract returns the name from a leading "$name" or "${name}" in str. 829 | // If it is a number, extract returns num set to that number; otherwise num = -1. 830 | func extract(str string) (name string, num int, rest string, ok bool) { 831 | if len(str) < 2 || str[0] != '$' { 832 | return 833 | } 834 | brace := false 835 | if str[1] == '{' { 836 | brace = true 837 | str = str[2:] 838 | } else { 839 | str = str[1:] 840 | } 841 | i := 0 842 | for i < len(str) { 843 | rune, size := utf8.DecodeRuneInString(str[i:]) 844 | if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 845 | break 846 | } 847 | i += size 848 | } 849 | if i == 0 { 850 | // empty name is not okay 851 | return 852 | } 853 | name = str[:i] 854 | if brace { 855 | if i >= len(str) || str[i] != '}' { 856 | // missing closing brace 857 | return 858 | } 859 | i++ 860 | } 861 | 862 | // Parse number. 863 | num = 0 864 | for i := 0; i < len(name); i++ { 865 | if name[i] < '0' || '9' < name[i] || num >= 1e8 { 866 | num = -1 867 | break 868 | } 869 | num = num*10 + int(name[i]) - '0' 870 | } 871 | // Disallow leading zeros. 872 | if name[0] == '0' && len(name) > 1 { 873 | num = -1 874 | } 875 | 876 | rest = str[i:] 877 | ok = true 878 | return 879 | } 880 | 881 | // FindSubmatchIndex returns a slice holding the index pairs identifying the 882 | // leftmost match of the regular expression in b and the matches, if any, of 883 | // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 884 | // in the package comment. 
885 | // A return value of nil indicates no match. 886 | func (re *Regexp) FindSubmatchIndex(b []byte) []int { 887 | return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap)) 888 | } 889 | 890 | // FindStringSubmatch returns a slice of strings holding the text of the 891 | // leftmost match of the regular expression in s and the matches, if any, of 892 | // its subexpressions, as defined by the 'Submatch' description in the 893 | // package comment. 894 | // A return value of nil indicates no match. 895 | func (re *Regexp) FindStringSubmatch(s string) []string { 896 | a := re.doExecute(nil, nil, s, 0, re.prog.NumCap) 897 | if a == nil { 898 | return nil 899 | } 900 | ret := make([]string, 1+re.numSubexp) 901 | for i := range ret { 902 | if 2*i < len(a) && a[2*i] >= 0 { 903 | ret[i] = s[a[2*i]:a[2*i+1]] 904 | } 905 | } 906 | return ret 907 | } 908 | 909 | // FindStringSubmatchIndex returns a slice holding the index pairs 910 | // identifying the leftmost match of the regular expression in s and the 911 | // matches, if any, of its subexpressions, as defined by the 'Submatch' and 912 | // 'Index' descriptions in the package comment. 913 | // A return value of nil indicates no match. 914 | func (re *Regexp) FindStringSubmatchIndex(s string) []int { 915 | return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap)) 916 | } 917 | 918 | // FindReaderSubmatchIndex returns a slice holding the index pairs 919 | // identifying the leftmost match of the regular expression of text read by 920 | // the RuneReader, and the matches, if any, of its subexpressions, as defined 921 | // by the 'Submatch' and 'Index' descriptions in the package comment. A 922 | // return value of nil indicates no match. 923 | func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 924 | return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap)) 925 | } 926 | 927 | const startSize = 10 // The size at which to start a slice in the 'All' routines. 
928 | 929 | // FindAll is the 'All' version of Find; it returns a slice of all successive 930 | // matches of the expression, as defined by the 'All' description in the 931 | // package comment. 932 | // A return value of nil indicates no match. 933 | func (re *Regexp) FindAll(b []byte, n int) [][]byte { 934 | if n < 0 { 935 | n = len(b) + 1 936 | } 937 | result := make([][]byte, 0, startSize) 938 | re.allMatches("", b, n, func(match []int) { 939 | result = append(result, b[match[0]:match[1]]) 940 | }) 941 | if len(result) == 0 { 942 | return nil 943 | } 944 | return result 945 | } 946 | 947 | // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 948 | // successive matches of the expression, as defined by the 'All' description 949 | // in the package comment. 950 | // A return value of nil indicates no match. 951 | func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 952 | if n < 0 { 953 | n = len(b) + 1 954 | } 955 | result := make([][]int, 0, startSize) 956 | re.allMatches("", b, n, func(match []int) { 957 | result = append(result, match[0:2]) 958 | }) 959 | if len(result) == 0 { 960 | return nil 961 | } 962 | return result 963 | } 964 | 965 | // FindAllString is the 'All' version of FindString; it returns a slice of all 966 | // successive matches of the expression, as defined by the 'All' description 967 | // in the package comment. 968 | // A return value of nil indicates no match. 
969 | func (re *Regexp) FindAllString(s string, n int) []string { 970 | if n < 0 { 971 | n = len(s) + 1 972 | } 973 | result := make([]string, 0, startSize) 974 | re.allMatches(s, nil, n, func(match []int) { 975 | result = append(result, s[match[0]:match[1]]) 976 | }) 977 | if len(result) == 0 { 978 | return nil 979 | } 980 | return result 981 | } 982 | 983 | // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 984 | // slice of all successive matches of the expression, as defined by the 'All' 985 | // description in the package comment. 986 | // A return value of nil indicates no match. 987 | func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 988 | if n < 0 { 989 | n = len(s) + 1 990 | } 991 | result := make([][]int, 0, startSize) 992 | re.allMatches(s, nil, n, func(match []int) { 993 | result = append(result, match[0:2]) 994 | }) 995 | if len(result) == 0 { 996 | return nil 997 | } 998 | return result 999 | } 1000 | 1001 | // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1002 | // of all successive matches of the expression, as defined by the 'All' 1003 | // description in the package comment. 1004 | // A return value of nil indicates no match. 1005 | func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1006 | if n < 0 { 1007 | n = len(b) + 1 1008 | } 1009 | result := make([][][]byte, 0, startSize) 1010 | re.allMatches("", b, n, func(match []int) { 1011 | slice := make([][]byte, len(match)/2) 1012 | for j := range slice { 1013 | if match[2*j] >= 0 { 1014 | slice[j] = b[match[2*j]:match[2*j+1]] 1015 | } 1016 | } 1017 | result = append(result, slice) 1018 | }) 1019 | if len(result) == 0 { 1020 | return nil 1021 | } 1022 | return result 1023 | } 1024 | 1025 | // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1026 | // a slice of all successive matches of the expression, as defined by the 1027 | // 'All' description in the package comment. 
1028 | // A return value of nil indicates no match. 1029 | func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1030 | if n < 0 { 1031 | n = len(b) + 1 1032 | } 1033 | result := make([][]int, 0, startSize) 1034 | re.allMatches("", b, n, func(match []int) { 1035 | result = append(result, match) 1036 | }) 1037 | if len(result) == 0 { 1038 | return nil 1039 | } 1040 | return result 1041 | } 1042 | 1043 | // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1044 | // returns a slice of all successive matches of the expression, as defined by 1045 | // the 'All' description in the package comment. 1046 | // A return value of nil indicates no match. 1047 | func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1048 | if n < 0 { 1049 | n = len(s) + 1 1050 | } 1051 | result := make([][]string, 0, startSize) 1052 | re.allMatches(s, nil, n, func(match []int) { 1053 | slice := make([]string, len(match)/2) 1054 | for j := range slice { 1055 | if match[2*j] >= 0 { 1056 | slice[j] = s[match[2*j]:match[2*j+1]] 1057 | } 1058 | } 1059 | result = append(result, slice) 1060 | }) 1061 | if len(result) == 0 { 1062 | return nil 1063 | } 1064 | return result 1065 | } 1066 | 1067 | // FindAllStringSubmatchIndex is the 'All' version of 1068 | // FindStringSubmatchIndex; it returns a slice of all successive matches of 1069 | // the expression, as defined by the 'All' description in the package 1070 | // comment. 1071 | // A return value of nil indicates no match. 
1072 | func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1073 | if n < 0 { 1074 | n = len(s) + 1 1075 | } 1076 | result := make([][]int, 0, startSize) 1077 | re.allMatches(s, nil, n, func(match []int) { 1078 | result = append(result, match) 1079 | }) 1080 | if len(result) == 0 { 1081 | return nil 1082 | } 1083 | return result 1084 | } 1085 | -------------------------------------------------------------------------------- /restructure.go: -------------------------------------------------------------------------------- 1 | package restructure 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "regexp/syntax" 7 | 8 | "github.com/alexflint/go-restructure/regex" 9 | ) 10 | 11 | // Style determines whether we are in Perl or POSIX or custom mode 12 | type Style int 13 | 14 | const ( 15 | Perl Style = iota 16 | POSIX 17 | CustomStyle 18 | ) 19 | 20 | // Options represents optional parameters for compilation 21 | type Options struct { 22 | Style Style // Style can be set to Perl, POSIX, or CustomStyle 23 | SyntaxFlags syntax.Flags 24 | } 25 | 26 | type subcapture struct { 27 | begin, end int 28 | } 29 | 30 | func (r subcapture) wasMatched() bool { 31 | return r.begin != -1 && r.end != -1 32 | } 33 | 34 | type match struct { 35 | input []byte 36 | captures []subcapture 37 | } 38 | 39 | func matchFromIndices(indices []int, input []byte) *match { 40 | match := &match{ 41 | input: input, 42 | } 43 | for i := 0; i < len(indices); i += 2 { 44 | match.captures = append(match.captures, subcapture{indices[i], indices[i+1]}) 45 | } 46 | return match 47 | } 48 | 49 | // Pos represents a position within a matched region. If a matched struct contains 50 | // a field of type Pos then this field will be assigned a value indicating a position 51 | // in the input string, where the position corresponds to the index of the Pos field. 52 | type Pos int 53 | 54 | // Submatch represents a matched region. 
It is a used to determine the begin and and 55 | // position of the match corresponding to a field. This library treats fields of type 56 | // `Submatch` just like `string` or `[]byte` fields, except that the matched string 57 | // is inserted into `Submatch.Str` and its begin and end position are inserted into 58 | // `Submatch.Begin` and `Submatch.End`. 59 | type Submatch struct { 60 | Begin Pos 61 | End Pos 62 | Bytes []byte 63 | } 64 | 65 | // String gets the matched substring 66 | func (r *Submatch) String() string { 67 | return string(r.Bytes) 68 | } 69 | 70 | // Regexp is a regular expression that captures submatches into struct fields. 71 | type Regexp struct { 72 | st *Struct 73 | re *regex.Regexp 74 | t reflect.Type 75 | opts Options 76 | } 77 | 78 | // Find attempts to match the regular expression against the input string. It 79 | // returns true if there was a match, and also populates the fields of the provided 80 | // struct with the contents of each submatch. 81 | func (r *Regexp) Find(dest interface{}, s string) bool { 82 | v := reflect.ValueOf(dest) 83 | input := []byte(s) 84 | 85 | // Check the type 86 | expected := reflect.PtrTo(r.t) 87 | if v.Type() != expected { 88 | panic(fmt.Errorf("expected destination to be *%s but got %T", r.t.String(), dest)) 89 | } 90 | 91 | // Execute the regular expression 92 | indices := r.re.FindSubmatchIndex(input) 93 | if indices == nil { 94 | return false 95 | } 96 | 97 | // Inflate matches into original struct 98 | match := matchFromIndices(indices, input) 99 | 100 | err := inflateStruct(v, match, r.st) 101 | if err != nil { 102 | panic(err) 103 | } 104 | return true 105 | } 106 | 107 | // FindAll attempts to match the regular expression against the input string. It returns true 108 | // if there was at least one match. 
109 | func (r *Regexp) FindAll(dest interface{}, s string, limit int) { 110 | // Check the type 111 | v := reflect.ValueOf(dest) 112 | t := v.Type() 113 | if t.Kind() != reflect.Ptr { 114 | panic(fmt.Errorf("parameter to FindAll should be a pointer to a slice but got %T", dest)) 115 | } 116 | 117 | sliceType := t.Elem() 118 | if sliceType.Kind() != reflect.Slice { 119 | panic(fmt.Errorf("parameter to FindAll should be a pointer to a slice but got %T", dest)) 120 | } 121 | 122 | itemType := sliceType.Elem() 123 | if itemType != r.t && itemType != reflect.PtrTo(r.t) { 124 | panic(fmt.Errorf("expected the slice element to be %s or *%s but it was %s", r.t, r.t, t)) 125 | } 126 | 127 | // Execute the regular expression 128 | input := []byte(s) 129 | matches := r.re.FindAllSubmatchIndex(input, limit) 130 | 131 | // Allocate a slice with the desired length 132 | v.Elem().Set(reflect.MakeSlice(sliceType, len(matches), len(matches))) 133 | 134 | // Inflate the matches into the slice elements 135 | for i, indices := range matches { 136 | // Get the i-th element of the slice 137 | destItem := v.Elem().Index(i) 138 | if itemType.Kind() != reflect.Ptr { 139 | destItem = destItem.Addr() 140 | } 141 | 142 | // Create the match object 143 | match := matchFromIndices(indices, input) 144 | 145 | // Inflate the match into the dest item 146 | err := inflateStruct(destItem, match, r.st) 147 | if err != nil { 148 | panic(err) 149 | } 150 | } 151 | } 152 | 153 | // String returns a string representation of the regular expression 154 | func (r *Regexp) String() string { 155 | return r.re.String() 156 | } 157 | 158 | // Compile constructs a regular expression from the struct fields on the 159 | // provided struct. 160 | func Compile(proto interface{}, opts Options) (*Regexp, error) { 161 | return CompileType(reflect.TypeOf(proto), opts) 162 | } 163 | 164 | // CompileType is like Compile but takes a reflect.Type instead. 
165 | func CompileType(t reflect.Type, opts Options) (*Regexp, error) { 166 | // We do this so that the zero value for Options gives us Perl mode, 167 | // which is also the default used by the standard library regexp package 168 | switch opts.Style { 169 | case Perl: 170 | opts.SyntaxFlags = syntax.Perl 171 | case POSIX: 172 | opts.SyntaxFlags = syntax.POSIX 173 | } 174 | 175 | if t.Kind() == reflect.Ptr { 176 | t = t.Elem() 177 | } 178 | 179 | // Traverse the struct 180 | b := newBuilder(opts) 181 | st, expr, err := b.structure(t) 182 | if err != nil { 183 | return nil, err 184 | } 185 | 186 | // Compile regular expression 187 | re, err := regex.CompileSyntax(expr) 188 | if err != nil { 189 | return nil, err 190 | } 191 | 192 | // Return 193 | return &Regexp{ 194 | st: st, 195 | re: re, 196 | t: t, 197 | opts: opts, 198 | }, nil 199 | } 200 | 201 | // MustCompile is like Compile but panics if there is a compilation error 202 | func MustCompile(proto interface{}, opts Options) *Regexp { 203 | re, err := Compile(proto, opts) 204 | if err != nil { 205 | panic(err) 206 | } 207 | return re 208 | } 209 | 210 | // MustCompileType is like CompileType but panics if there is a compilation error 211 | func MustCompileType(t reflect.Type, opts Options) *Regexp { 212 | re, err := CompileType(t, opts) 213 | if err != nil { 214 | panic(err) 215 | } 216 | return re 217 | } 218 | 219 | // Find constructs a regular expression from the given struct and executes it on the 220 | // given string, placing submatches into the fields of the struct. The first parameter 221 | // must be a non-nil struct pointer. It returns true if the match succeeded. The only 222 | // errors that are returned are compilation errors. 
223 | func Find(dest interface{}, s string) (bool, error) { 224 | re, err := Compile(dest, Options{}) 225 | if err != nil { 226 | return false, err 227 | } 228 | return re.Find(dest, s), nil 229 | } 230 | -------------------------------------------------------------------------------- /restructure_test.go: -------------------------------------------------------------------------------- 1 | package restructure 2 | 3 | import ( 4 | "encoding/json" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | "github.com/stretchr/testify/require" 9 | ) 10 | 11 | func assertRegion(t *testing.T, s string, begin int, end int, r *Submatch) { 12 | assert.NotNil(t, r) 13 | assert.Equal(t, s, string(r.Bytes)) 14 | assert.EqualValues(t, begin, r.Begin) 15 | assert.EqualValues(t, end, r.End) 16 | } 17 | 18 | type DotName struct { 19 | Dot string `regexp:"\\."` 20 | Name string `regexp:"\\w+"` 21 | } 22 | 23 | type DotExpr struct { 24 | _ struct{} `regexp:"^"` 25 | Head string `regexp:"\\w+"` 26 | Tail *DotName `regexp:"?"` 27 | _ struct{} `regexp:"$"` 28 | } 29 | 30 | func TestMatchNameDotName(t *testing.T) { 31 | pattern, err := Compile(DotExpr{}, Options{}) 32 | require.NoError(t, err) 33 | 34 | var v DotExpr 35 | assert.True(t, pattern.Find(&v, "foo.bar")) 36 | assert.Equal(t, "foo", v.Head) 37 | require.NotNil(t, v.Tail) 38 | assert.Equal(t, ".", v.Tail.Dot) 39 | assert.Equal(t, "bar", v.Tail.Name) 40 | } 41 | 42 | func TestMatchNameDotNameHeadOnly(t *testing.T) { 43 | pattern, err := Compile(DotExpr{}, Options{}) 44 | require.NoError(t, err) 45 | 46 | var v DotExpr 47 | assert.True(t, pattern.Find(&v, "head")) 48 | assert.Equal(t, "head", v.Head) 49 | assert.Nil(t, v.Tail) 50 | } 51 | 52 | func TestMatchNameDotNameFails(t *testing.T) { 53 | pattern, err := Compile(DotExpr{}, Options{}) 54 | require.NoError(t, err) 55 | 56 | var v DotExpr 57 | assert.False(t, pattern.Find(&v, ".oops")) 58 | } 59 | 60 | type URL struct { 61 | _ string `regexp:"^"` 62 | Scheme string 
`regexp:"[[:alpha:]]+" json:"scheme"` 63 | _ string `regexp:"://"` 64 | Host string `regexp:".*" json:"host"` 65 | _ string `regexp:"$"` 66 | } 67 | 68 | func TestMatchURL(t *testing.T) { 69 | pattern, err := Compile(URL{}, Options{}) 70 | require.NoError(t, err) 71 | 72 | var v URL 73 | require.True(t, pattern.Find(&v, "http://example.com")) 74 | assert.Equal(t, "http", v.Scheme) 75 | assert.Equal(t, "example.com", v.Host) 76 | } 77 | 78 | func TestCombinationWithJSONTags(t *testing.T) { 79 | pattern, err := Compile(URL{}, Options{}) 80 | require.NoError(t, err) 81 | 82 | var v URL 83 | require.True(t, pattern.Find(&v, "http://example.com")) 84 | 85 | js, err := json.Marshal(&v) 86 | require.NoError(t, err) 87 | 88 | assert.Equal(t, "{\"scheme\":\"http\",\"host\":\"example.com\"}", string(js)) 89 | } 90 | 91 | type PtrURL struct { 92 | _ struct{} `regexp:"^"` 93 | Scheme *string `regexp:"[[:alpha:]]+"` 94 | _ struct{} `regexp:"://"` 95 | Host *string `regexp:".*"` 96 | _ struct{} `regexp:"$"` 97 | } 98 | 99 | func TestMatchPtrURL(t *testing.T) { 100 | pattern, err := Compile(PtrURL{}, Options{}) 101 | require.NoError(t, err) 102 | 103 | var v PtrURL 104 | require.True(t, pattern.Find(&v, "http://example.com")) 105 | require.NotNil(t, v.Scheme) 106 | require.NotNil(t, v.Host) 107 | assert.Equal(t, "http", *v.Scheme) 108 | assert.Equal(t, "example.com", *v.Host) 109 | } 110 | 111 | func TestMatchPtrURLFailed(t *testing.T) { 112 | pattern, err := Compile(PtrURL{}, Options{}) 113 | require.NoError(t, err) 114 | 115 | var v PtrURL 116 | require.False(t, pattern.Find(&v, "oops")) 117 | assert.Nil(t, v.Scheme) 118 | assert.Nil(t, v.Host) 119 | } 120 | 121 | type NakedURL struct { 122 | _ string `^` 123 | Scheme string `[[:alpha:]]+` 124 | _ string `://` 125 | Host string `.*` 126 | _ string `$` 127 | } 128 | 129 | func TestMatchNakedURL(t *testing.T) { 130 | pattern, err := Compile(NakedURL{}, Options{}) 131 | require.NoError(t, err) 132 | 133 | var v NakedURL 134 | 
require.True(t, pattern.Find(&v, "http://example.com")) 135 | assert.Equal(t, "http", v.Scheme) 136 | assert.Equal(t, "example.com", v.Host) 137 | } 138 | 139 | type Nothing struct { 140 | X string 141 | } 142 | 143 | func TestEmptyPattern(t *testing.T) { 144 | pattern, err := Compile(Nothing{}, Options{}) 145 | require.NoError(t, err) 146 | 147 | var v Nothing 148 | require.True(t, pattern.Find(&v, "abc")) 149 | } 150 | 151 | type Malformed struct { 152 | X string `regexp:"\w"` // this is malformed because \w is not a valid escape sequence 153 | } 154 | 155 | func TestErrorOnMalformedTag(t *testing.T) { 156 | _, err := Compile(Malformed{}, Options{}) 157 | assert.Error(t, err) 158 | } 159 | 160 | type HasSubcaptures struct { 161 | Name string `a(bc)?d` 162 | } 163 | 164 | func TestRemoveSubcaptures(t *testing.T) { 165 | pattern, err := Compile(HasSubcaptures{}, Options{}) 166 | require.NoError(t, err) 167 | 168 | var v HasSubcaptures 169 | require.True(t, pattern.Find(&v, "abcd")) 170 | assert.Equal(t, "abcd", v.Name) 171 | } 172 | 173 | type DotNameRegion struct { 174 | Dot *Submatch `regexp:"\\."` 175 | Name *Submatch `regexp:"\\w+"` 176 | } 177 | 178 | type DotExprRegion struct { 179 | _ struct{} `regexp:"^"` 180 | Head Submatch `regexp:"\\w+"` 181 | Tail *DotNameRegion `regexp:"?"` 182 | _ struct{} `regexp:"$"` 183 | } 184 | 185 | func TestMatchNameDotNameRegion(t *testing.T) { 186 | pattern, err := Compile(DotExprRegion{}, Options{}) 187 | require.NoError(t, err) 188 | 189 | var v DotExprRegion 190 | assert.True(t, pattern.Find(&v, "foo.bar")) 191 | assertRegion(t, "foo", 0, 3, &v.Head) 192 | assert.NotNil(t, v.Tail) 193 | assertRegion(t, ".", 3, 4, v.Tail.Dot) 194 | assertRegion(t, "bar", 4, 7, v.Tail.Name) 195 | } 196 | 197 | type DotNamePos struct { 198 | Begin Pos 199 | Dot string `regexp:"\\."` 200 | Middle Pos 201 | Name string `regexp:"\\w+"` 202 | End Pos 203 | } 204 | 205 | type DotExprPos struct { 206 | Begin Pos 207 | _ struct{} `regexp:"^"` 208 | 
Head string `regexp:"\\w+"` 209 | Middle Pos 210 | Tail *DotNamePos `regexp:"?"` 211 | _ struct{} `regexp:"$"` 212 | End Pos 213 | } 214 | 215 | func TestMatchNameDotNamePos(t *testing.T) { 216 | pattern, err := Compile(DotExprPos{}, Options{}) 217 | require.NoError(t, err) 218 | 219 | var v DotExprPos 220 | assert.True(t, pattern.Find(&v, "foo.bar")) 221 | assert.EqualValues(t, 0, v.Begin) 222 | assert.EqualValues(t, 3, v.Middle) 223 | assert.EqualValues(t, 3, v.Tail.Begin) 224 | assert.EqualValues(t, 4, v.Tail.Middle) 225 | assert.EqualValues(t, 7, v.Tail.End) 226 | assert.EqualValues(t, 7, v.End) 227 | } 228 | 229 | type DegeneratePos struct { 230 | X Pos 231 | Y Pos 232 | } 233 | 234 | func TestDegeneratePos(t *testing.T) { 235 | // This tests what happens if there are degenerate position captures 236 | pattern, err := Compile(DegeneratePos{}, Options{}) 237 | require.NoError(t, err) 238 | var v DegeneratePos 239 | assert.True(t, pattern.Find(&v, "abc")) 240 | assert.EqualValues(t, 0, v.X) 241 | assert.EqualValues(t, 0, v.Y) 242 | } 243 | 244 | type UnexportedPos struct { 245 | Exported Pos 246 | unexported Pos 247 | _ struct{} `regexp:"$"` 248 | } 249 | 250 | func TestUnexportedPos(t *testing.T) { 251 | // This tests what happens if there are non-exported Pos fields 252 | pattern, err := Compile(UnexportedPos{}, Options{}) 253 | require.NoError(t, err) 254 | var v UnexportedPos 255 | assert.True(t, pattern.Find(&v, "abc")) 256 | assert.EqualValues(t, 3, v.Exported) 257 | assert.EqualValues(t, 0, v.unexported) // should be ignored 258 | } 259 | 260 | type Word struct { 261 | S string `\w+` 262 | } 263 | 264 | func TestFindAllWords_Simple(t *testing.T) { 265 | pattern := MustCompile(Word{}, Options{}) 266 | var words []Word 267 | pattern.FindAll(&words, "ham is spam", -1) 268 | require.Len(t, words, 3) 269 | assert.EqualValues(t, "ham", words[0].S) 270 | assert.EqualValues(t, "is", words[1].S) 271 | assert.EqualValues(t, "spam", words[2].S) 272 | } 273 | 274 | 
func TestFindAllWords_Ptr(t *testing.T) { 275 | pattern := MustCompile(Word{}, Options{}) 276 | var words []*Word 277 | pattern.FindAll(&words, "ham is spam", -1) 278 | require.Len(t, words, 3) 279 | assert.EqualValues(t, "ham", words[0].S) 280 | assert.EqualValues(t, "is", words[1].S) 281 | assert.EqualValues(t, "spam", words[2].S) 282 | } 283 | 284 | func TestFindAllWords_NoMatches(t *testing.T) { 285 | pattern := MustCompile(Word{}, Options{}) 286 | var words []*Word 287 | pattern.FindAll(&words, "*&!", -1) 288 | require.Empty(t, words) 289 | } 290 | 291 | func TestFindAllWords_ByValueSlicePanics(t *testing.T) { 292 | pattern := MustCompile(Word{}, Options{}) 293 | var words []*Word 294 | // This should panic because words is passed by value not by pointer: 295 | assert.Panics(t, func() { pattern.FindAll(words, "*&!", -1) }) 296 | } 297 | 298 | type WordSubmatch struct { 299 | S *Submatch `\w+` 300 | } 301 | 302 | func TestFindAllWords_Regions(t *testing.T) { 303 | pattern := MustCompile(WordSubmatch{}, Options{}) 304 | var words []*WordSubmatch 305 | pattern.FindAll(&words, "ham is spam", -1) 306 | require.Len(t, words, 3) 307 | assertRegion(t, "ham", 0, 3, words[0].S) 308 | assertRegion(t, "is", 4, 6, words[1].S) 309 | assertRegion(t, "spam", 7, 11, words[2].S) 310 | } 311 | 312 | type ExprWithInt struct { 313 | Number int `regexp:"^\\d+"` 314 | _ string `regexp:"\\s+"` 315 | Animal string `regexp:"\\w+$"` 316 | } 317 | 318 | func TestMatchWithInt(t *testing.T) { 319 | pattern, err := Compile(ExprWithInt{}, Options{}) 320 | require.NoError(t, err) 321 | 322 | var v ExprWithInt 323 | assert.True(t, pattern.Find(&v, "4 wombats")) 324 | assert.Equal(t, 4, v.Number) 325 | assert.Equal(t, "wombats", v.Animal) 326 | } 327 | -------------------------------------------------------------------------------- /samples/email-address/email-address.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 
5 | 6 | "github.com/alexflint/go-restructure" 7 | ) 8 | 9 | type Hostname struct { 10 | Domain string `\w+` 11 | _ struct{} `\.` 12 | TLD string `\w+` 13 | } 14 | 15 | type EmailAddress struct { 16 | _ struct{} `^` 17 | User string `[a-zA-Z0-9._%+-]+` 18 | _ struct{} `@` 19 | Host *Hostname 20 | _ struct{} `$` 21 | } 22 | 23 | func main() { 24 | var addr EmailAddress 25 | success, _ := restructure.Find(&addr, "joe@example.com") 26 | if success { 27 | fmt.Println(addr.User) // prints "joe" 28 | fmt.Println(addr.Host.Domain) // prints "example" 29 | fmt.Println(addr.Host.TLD) // prints "com" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /samples/find-all-floats/find-all-floats.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/alexflint/go-restructure" 7 | ) 8 | 9 | var src = ` 10 | The US economy went through an economic downturn following the financial 11 | crisis of 2007–08, with output as late as 2013 still below potential 12 | according to the Congressional Budget Office.[57] The economy, however, 13 | began to recover in the second half of 2009, and as of November 2015, 14 | unemployment had declined from a high of 10% to 5%; the government's 15 | broader U-6 unemployment rate, which includes the part-time underemployed, 16 | was 9.8% (it had reached 16% in 2009).[13] At 11.3%, the U.S. 
has one of 17 | the lowest labor union participation rates in the OECD.[58] Households 18 | living on less than $2 per day before government benefits, doubled from 19 | 1996 levels to 1.5 million households in 2011, including 2.8 million 20 | children.[59] The gap in income between rich and poor is greater in the 21 | United States than in any other developed country.[60] Total public and 22 | private debt was $50 trillion at the end of the first quarter of 2010, 23 | or 3.5 times GDP.[61] In December 2014, public debt was slightly more 24 | than 100% of GDP.[62] Domestic financial assets totaled $131 trillion 25 | and domestic financial liabilities totaled $106 trillion.[63] 26 | ` 27 | 28 | var floatRegexp = restructure.MustCompile(Float{}, restructure.Options{}) 29 | 30 | // Matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123" 31 | type Float struct { 32 | Begin restructure.Pos 33 | Sign *Sign `?` 34 | Whole string `[0-9]*` 35 | Period struct{} `\.?` 36 | Frac string `[0-9]+` 37 | Exponent *Exponent `?` 38 | End restructure.Pos 39 | } 40 | 41 | // Matches "+" or "-" 42 | type Sign struct { 43 | Ch string `[+-]` 44 | } 45 | 46 | // Matches "e+4", "E6", "e-03" 47 | type Exponent struct { 48 | _ struct{} `[eE]` 49 | Sign *Sign `?` 50 | Num string `[0-9]+` 51 | } 52 | 53 | func main() { 54 | var floats []Float 55 | floatRegexp.FindAll(&floats, src, -1) 56 | for _, f := range floats { 57 | fmt.Println(src[f.Begin:f.End]) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /samples/floating-point/floating-point.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | 7 | "github.com/alexflint/go-restructure" 8 | ) 9 | 10 | var floatRegexp = restructure.MustCompile(Float{}, restructure.Options{}) 11 | 12 | // Matches "123", "1.23", "1.23e-4", "-12.3E+5", ".123" 13 | type Float struct { 14 | Sign *Sign `?` 15 | Whole string 
`[0-9]*` 16 | Period struct{} `\.?` 17 | Frac string `[0-9]+` 18 | Exponent *Exponent `?` 19 | } 20 | 21 | // Matches "+" or "-" 22 | type Sign struct { 23 | Ch string `[+-]` 24 | } 25 | 26 | // Matches "e+4", "E6", "e-03" 27 | type Exponent struct { 28 | _ struct{} `[eE]` 29 | Sign *Sign `?` 30 | Num string `[0-9]+` 31 | } 32 | 33 | func prettyPrint(x interface{}) string { 34 | buf, err := json.MarshalIndent(x, "", " ") 35 | if err != nil { 36 | return err.Error() 37 | } 38 | return string(buf) 39 | } 40 | 41 | func main() { 42 | var f Float 43 | for _, str := range []string{"1.23", "1.23e+45", ".123", "12e3"} { 44 | floatRegexp.Find(&f, str) 45 | fmt.Printf("\"%s\" -> %s\n\n", str, prettyPrint(f)) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /samples/name-dot-name/name-dot-name.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "log" 7 | 8 | "github.com/alexflint/go-arg" 9 | "github.com/alexflint/go-restructure" 10 | ) 11 | 12 | type DotName struct { 13 | Dot string `\.` 14 | Name string `\w+` 15 | } 16 | 17 | type DotExpr struct { 18 | _ struct{} `^` 19 | Head string `foo` 20 | Tail *DotName `?` 21 | _ struct{} `$` 22 | } 23 | 24 | func prettyPrint(x interface{}) string { 25 | buf, err := json.MarshalIndent(x, "", " ") 26 | if err != nil { 27 | return err.Error() 28 | } 29 | return string(buf) 30 | } 31 | 32 | func main() { 33 | var args struct { 34 | Str string `arg:"positional"` 35 | } 36 | arg.MustParse(&args) 37 | 38 | // Construct the regular expression 39 | pattern, err := restructure.Compile(&DotExpr{}, restructure.Options{}) 40 | if err != nil { 41 | log.Fatal(err) 42 | } 43 | 44 | // Match 45 | var v DotExpr 46 | fmt.Println(pattern.Find(&v, args.Str)) 47 | } 48 | -------------------------------------------------------------------------------- /samples/python-import/python-import.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/alexflint/go-restructure" 7 | ) 8 | 9 | var importRegexp = restructure.MustCompile(Import{}, restructure.Options{}) 10 | 11 | // Import matches "import foo" and "import foo as bar" 12 | type Import struct { 13 | _ struct{} `^import\s+` 14 | Package restructure.Submatch `\w+` 15 | Alias *AsName `?` 16 | _ struct{} `$` 17 | } 18 | 19 | // AsName matches "as xyz" 20 | type AsName struct { 21 | _ struct{} `\s+as\s+` 22 | Name restructure.Submatch `\w+` 23 | } 24 | 25 | func main() { 26 | var imp Import 27 | importRegexp.Find(&imp, "import foo as bar") 28 | fmt.Printf("IMPORT %s (bytes %d...%d)\n", imp.Package.String(), imp.Package.Begin, imp.Package.End) 29 | fmt.Printf(" AS %s (bytes %d...%d)\n", imp.Alias.Name.String(), imp.Alias.Name.Begin, imp.Alias.Name.End) 30 | } 31 | -------------------------------------------------------------------------------- /samples/quaternion-in-json/quaternion-in-json.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | 7 | "github.com/alexflint/go-restructure" 8 | ) 9 | 10 | var quaternionRegexp = restructure.MustCompile(QuotedQuaternion{}, restructure.Options{}) 11 | 12 | type RealPart struct { 13 | Sign string `regexp:"[+-]?"` 14 | Real string `regexp:"[0-9]+"` 15 | } 16 | 17 | type SignedInt struct { 18 | Sign string `regexp:"[+-]"` 19 | Real string `regexp:"[0-9]+"` 20 | } 21 | 22 | type IPart struct { 23 | Magnitude SignedInt 24 | _ struct{} `regexp:"i"` 25 | } 26 | 27 | type JPart struct { 28 | Magnitude SignedInt 29 | _ struct{} `regexp:"j"` 30 | } 31 | 32 | type KPart struct { 33 | Magnitude SignedInt 34 | _ struct{} `regexp:"k"` 35 | } 36 | 37 | // matches "1+2i+3j+4k", "-1+2k", "-1", etc 38 | type Quaternion struct { 39 | Real *RealPart 40 | I *IPart `regexp:"?"` 41 | J *JPart 
`regexp:"?"` 42 | K *KPart `regexp:"?"` 43 | } 44 | 45 | // matches the quoted strings `"-1+2i+3j+4k"`, `"3-4k"`, `"12+34i"`, etc 46 | type QuotedQuaternion struct { 47 | _ struct{} `regexp:"^"` 48 | _ struct{} `regexp:"\""` 49 | Quaternion *Quaternion 50 | _ struct{} `regexp:"\""` 51 | _ struct{} `regexp:"$"` 52 | } 53 | 54 | func (c *QuotedQuaternion) UnmarshalJSON(b []byte) error { 55 | if !quaternionRegexp.Find(c, string(b)) { 56 | return fmt.Errorf("%s is not a quaternion number", string(b)) 57 | } 58 | return nil 59 | } 60 | 61 | // this struct is handled by JSON 62 | type Var struct { 63 | Name string 64 | Value *QuotedQuaternion 65 | } 66 | 67 | func prettyPrint(x interface{}) string { 68 | buf, err := json.MarshalIndent(x, "", " ") 69 | if err != nil { 70 | return err.Error() 71 | } 72 | return string(buf) 73 | } 74 | 75 | func main() { 76 | src := `{"name": "foo", "value": "1+2i+3j+4k"}` 77 | var v Var 78 | err := json.Unmarshal([]byte(src), &v) 79 | if err != nil { 80 | fmt.Println(err) 81 | } 82 | fmt.Println(prettyPrint(v)) 83 | } 84 | -------------------------------------------------------------------------------- /samples/simple-email/simple-email.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/alexflint/go-restructure" 7 | ) 8 | 9 | type EmailAddress struct { 10 | _ struct{} `^` 11 | User string `\w+` 12 | _ struct{} `@` 13 | Host string `[^@]+` 14 | _ struct{} `$` 15 | } 16 | 17 | func main() { 18 | var addr EmailAddress 19 | success, err := restructure.Find(&addr, "joe@example.com") 20 | if err != nil { 21 | fmt.Println(err) 22 | } 23 | if success { 24 | fmt.Println(addr.User) // prints "joe" 25 | fmt.Println(addr.Host) // prints "example.com" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /transform.go: -------------------------------------------------------------------------------- 1 | package 
restructure 2 | 3 | import "regexp/syntax" 4 | 5 | type transformer func(expr *syntax.Regexp) ([]*syntax.Regexp, error) 6 | 7 | // transform replaces each node in a regex AST with the return value of the given function 8 | // it processes the children of a node before the node itself 9 | func transform(expr *syntax.Regexp, f transformer) (*syntax.Regexp, error) { 10 | var newchildren []*syntax.Regexp 11 | for _, child := range expr.Sub { 12 | newchild, err := transform(child, f) 13 | if err != nil { 14 | return nil, err 15 | } 16 | replacements, err := f(newchild) 17 | if err != nil { 18 | return nil, err 19 | } 20 | newchildren = append(newchildren, replacements...) 21 | } 22 | expr.Sub = newchildren 23 | return expr, nil 24 | } 25 | --------------------------------------------------------------------------------