├── .gitignore ├── .travis.yml ├── interface.go ├── LICENSE ├── README.md ├── pcre2_bench_test.go ├── pcre2_test.go └── pcre2.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.5 4 | - tip 5 | sudo: true 6 | before_install: 7 | - wget ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-10.20.tar.gz -O /tmp/pcre2-10.20.tar.gz 8 | - cd /tmp && tar -xvzf pcre2-10.20.tar.gz && cd /tmp/pcre2-10.20 && ./configure --enable-pcre2-32 --disable-pcre2-8 --prefix=/usr && sudo make install 9 | install: 10 | - cd $HOME/gopath/src/github.com/lestrrat/go-pcre2 11 | - go get -t -v ./... 12 | script: 13 | - cd $HOME/gopath/src/github.com/lestrrat/go-pcre2 14 | - go test -v ./... 15 | -------------------------------------------------------------------------------- /interface.go: -------------------------------------------------------------------------------- 1 | package pcre2 2 | 3 | import "errors" 4 | 5 | // Regexp represents a compiled regular expression. Internally 6 | // it wraps a reference to `pcre2_code` type. 
type Regexp struct {
	// pattern is the source text the expression was compiled from; String
	// returns it.
	pattern string
	// ptr is the address of the compiled pattern. Free resets it to 0, and a
	// zero value is reported as ErrInvalidRegexp.
	ptr uintptr // *C.pcre2_code
}

var (
	// ErrInvalidRegexp is returned when the provided Regexp is
	// not backed by a proper C pointer to pcre2_code
	ErrInvalidRegexp = errors.New("invalid regexp")
	// ErrInvalidUTF8String is returned when the input string cannot
	// be decoded into runes
	ErrInvalidUTF8String = errors.New("invalid utf8 string")
)

// ErrCompile is returned when compiling the regular expression fails.
type ErrCompile struct {
	message string // error text obtained from PCRE2 for the failure
	offset  int    // error offset reported by pcre2_compile
	pattern string // the pattern that failed to compile
}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 lestrrat

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-pcre2 2 | (Work In Progress) PCRE2 binding for Go 3 | 4 | ## Benchmarks 5 | 6 | ``` 7 | shoebill% go test -v -run=none -benchmem -benchtime=5s -bench . 8 | PASS 9 | BenchmarkGoRegexpMatch-4 300000 27505 ns/op 46400 B/op 95 allocs/op 10 | BenchmarkPCRE2RegexpMatch-4 500000 14602 ns/op 2632 B/op 70 allocs/op 11 | BenchmarkGoRegexpMatchString-4 300000 29537 ns/op 46304 B/op 89 allocs/op 12 | BenchmarkPCRE2RegexpMatchString-4 500000 13864 ns/op 2536 B/op 64 allocs/op 13 | BenchmarkGoFindAllIndex-4 300000 28054 ns/op 43808 B/op 52 allocs/op 14 | BenchmarkPCRE2FindAllIndex-4 500000 17762 ns/op 3112 B/op 64 allocs/op 15 | BenchmarkGoFindAllStringIndex-4 300000 27144 ns/op 43712 B/op 49 allocs/op 16 | BenchmarkPCRE2FindAllStringIndex-4 500000 18827 ns/op 3016 B/op 61 allocs/op 17 | BenchmarkGoFindSubmatchIndex-4 300000 21354 ns/op 42801 B/op 43 allocs/op 18 | BenchmarkPCRE2FindSubmatchIndex-4 500000 12326 ns/op 2776 B/op 55 allocs/op 19 | BenchmarkGoFindStringSubmatchIndex-4 300000 21164 ns/op 42705 B/op 40 allocs/op 20 | BenchmarkPCRE2FindStringSubmatchIndex-4 1000000 11843 ns/op 2680 B/op 52 allocs/op 21 | BenchmarkGoFindAllSubmatchIndex-4 300000 28490 ns/op 43808 B/op 52 allocs/op 22 | BenchmarkPCRE2FindAllSubmatchIndex-4 300000 20977 ns/op 3688 B/op 73 allocs/op 23 | BenchmarkGoFindAllStringSubmatchIndex-4 300000 27409 ns/op 43712 B/op 49 allocs/op 24 | BenchmarkPCRE2FindAllStringSubmatchIndex-4 500000 18764 ns/op 3592 B/op 70 allocs/op 25 | ok github.com/lestrrat/go-pcre2 137.179s 26 | ``` 27 | -------------------------------------------------------------------------------- /pcre2_bench_test.go: -------------------------------------------------------------------------------- 1 | package pcre2_test 2 | 3 | import ( 4 | "regexp" 5 | "testing" 6 | 7 | "github.com/lestrrat/go-pcre2" 
8 | ) 9 | 10 | type regexper interface { 11 | Match([]byte) bool 12 | MatchString(string) bool 13 | FindAllIndex([]byte, int) [][]int 14 | FindAllSubmatchIndex([]byte, int) [][]int 15 | FindAllStringIndex(string, int) [][]int 16 | FindAllStringSubmatchIndex(string, int) [][]int 17 | FindSubmatchIndex([]byte) []int 18 | FindStringSubmatchIndex(string) []int 19 | } 20 | 21 | func benchMatch(b *testing.B, re regexper, dos bool) { 22 | patterns := []string{`Hello World!`, `Hello Friend!`, `Hello 友達!`} 23 | for _, pat := range patterns { 24 | var rv bool 25 | if dos { 26 | rv = re.MatchString(pat) 27 | } else { 28 | rv = re.Match([]byte(pat)) 29 | } 30 | if !rv { 31 | b.Errorf("Expected to match, failed") 32 | return 33 | } 34 | } 35 | 36 | patterns = []string{`Goodbye World!`, `Hell no!`, `HelloWorld!`} 37 | for _, pat := range patterns { 38 | var rv bool 39 | if dos { 40 | rv = re.MatchString(pat) 41 | } else { 42 | rv = re.Match([]byte(pat)) 43 | } 44 | 45 | if rv { 46 | b.Errorf("Expected to NOT match, matched") 47 | return 48 | } 49 | } 50 | } 51 | 52 | func benchFindAllIndex(b *testing.B, re regexper, dos bool) { 53 | patterns := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 54 | for _, pat := range patterns { 55 | var matches [][]int 56 | if dos { 57 | matches = re.FindAllStringIndex(pat, -1) 58 | } else { 59 | matches = re.FindAllIndex([]byte(pat), -1) 60 | } 61 | 62 | if len(matches) != 3 { 63 | b.Errorf("Expected to match '%s' against '%#v', got %d", pat, re, len(matches)) 64 | b.Logf("%#v", matches) 65 | return 66 | } 67 | } 68 | } 69 | 70 | func benchFindSubmatchIndex(b *testing.B, re regexper, dos bool) { 71 | patterns := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 72 | for _, pat := range patterns { 73 | var matches []int 74 | if dos { 75 | matches = re.FindStringSubmatchIndex(pat) 76 | } else { 77 | matches = re.FindSubmatchIndex([]byte(pat)) 78 | } 79 | 80 | if 
len(matches) != 6 { 81 | b.Errorf("Expected to match '%s' against '%#v', got %d", pat, re, len(matches)) 82 | b.Logf("%#v", matches) 83 | return 84 | } 85 | } 86 | } 87 | 88 | func benchFindAllSubmatchIndex(b *testing.B, re regexper, dos bool) { 89 | patterns := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 90 | for _, pat := range patterns { 91 | var matches [][]int 92 | if dos { 93 | matches = re.FindAllStringSubmatchIndex(pat, -1) 94 | } else { 95 | matches = re.FindAllSubmatchIndex([]byte(pat), -1) 96 | } 97 | 98 | if len(matches) != 3 { 99 | b.Errorf("Expected to match '%s' against '%#v', got %d", pat, re, len(matches)) 100 | b.Logf("%#v", matches) 101 | return 102 | } 103 | } 104 | } 105 | 106 | func makeBenchFunc(b *testing.B, which bool, dos bool, pattern string, f func(*testing.B, regexper, bool)) func() { 107 | // Forcing a function call so that we have chance to 108 | // run garbage collection for each iteration 109 | return func() { 110 | var re regexper 111 | var err error 112 | if which { // true == pcre2 113 | re, err = pcre2.Compile(pattern) 114 | } else { 115 | re, err = regexp.Compile(pattern) 116 | } 117 | if err != nil { 118 | b.Errorf("compile failed: %s", err) 119 | return 120 | } 121 | f(b, re, dos) 122 | } 123 | } 124 | 125 | const ( 126 | UseGoRegexp = false 127 | UsePCRE2Regexp = true 128 | UseBytes = false 129 | UseString = true 130 | ) 131 | 132 | // Match, MatchString 133 | const RegexpMatchRegex = `^Hello (.+)!$` 134 | 135 | func BenchmarkGoRegexpMatch(b *testing.B) { 136 | benchf := makeBenchFunc(b, UseGoRegexp, UseBytes, RegexpMatchRegex, benchMatch) 137 | for i := 0; i < b.N; i++ { 138 | benchf() 139 | } 140 | } 141 | 142 | func BenchmarkPCRE2RegexpMatch(b *testing.B) { 143 | benchf := makeBenchFunc(b, UsePCRE2Regexp, UseBytes, RegexpMatchRegex, benchMatch) 144 | for i := 0; i < b.N; i++ { 145 | benchf() 146 | } 147 | } 148 | 149 | func BenchmarkGoRegexpMatchString(b *testing.B) { 150 | 
benchf := makeBenchFunc(b, UseGoRegexp, UseString, RegexpMatchRegex, benchMatch) 151 | for i := 0; i < b.N; i++ { 152 | benchf() 153 | } 154 | } 155 | 156 | func BenchmarkPCRE2RegexpMatchString(b *testing.B) { 157 | benchf := makeBenchFunc(b, UsePCRE2Regexp, UseString, RegexpMatchRegex, benchMatch) 158 | for i := 0; i < b.N; i++ { 159 | benchf() 160 | } 161 | } 162 | 163 | // FindAllIndex, FindAllStringIndex 164 | const FindAllIndexRegex = `(\S+):(\S+)` 165 | 166 | func BenchmarkGoFindAllIndex(b *testing.B) { 167 | benchf := makeBenchFunc(b, UseGoRegexp, UseBytes, FindAllIndexRegex, benchFindAllIndex) 168 | for i := 0; i < b.N; i++ { 169 | benchf() 170 | } 171 | } 172 | 173 | func BenchmarkPCRE2FindAllIndex(b *testing.B) { 174 | benchf := makeBenchFunc(b, UsePCRE2Regexp, UseBytes, FindAllIndexRegex, benchFindAllIndex) 175 | for i := 0; i < b.N; i++ { 176 | benchf() 177 | } 178 | } 179 | 180 | func BenchmarkGoFindAllStringIndex(b *testing.B) { 181 | benchf := makeBenchFunc(b, UseGoRegexp, UseString, FindAllIndexRegex, benchFindAllIndex) 182 | for i := 0; i < b.N; i++ { 183 | benchf() 184 | } 185 | } 186 | 187 | func BenchmarkPCRE2FindAllStringIndex(b *testing.B) { 188 | benchf := makeBenchFunc(b, UsePCRE2Regexp, UseString, FindAllIndexRegex, benchFindAllIndex) 189 | for i := 0; i < b.N; i++ { 190 | benchf() 191 | } 192 | } 193 | 194 | // FindSubmatchIndex, FindStringSubmatchIndex 195 | const FindSubmatchIndexRegex = `(\S+):(\S+)` 196 | 197 | func BenchmarkGoFindSubmatchIndex(b *testing.B) { 198 | benchf := makeBenchFunc(b, UseGoRegexp, UseBytes, FindSubmatchIndexRegex, benchFindSubmatchIndex) 199 | for i := 0; i < b.N; i++ { 200 | benchf() 201 | } 202 | } 203 | 204 | func BenchmarkPCRE2FindSubmatchIndex(b *testing.B) { 205 | benchf := makeBenchFunc(b, UsePCRE2Regexp, UseBytes, FindSubmatchIndexRegex, benchFindSubmatchIndex) 206 | for i := 0; i < b.N; i++ { 207 | benchf() 208 | } 209 | } 210 | 211 | func BenchmarkGoFindStringSubmatchIndex(b *testing.B) { 212 | benchf 
:= makeBenchFunc(b, UseGoRegexp, UseString, FindSubmatchIndexRegex, benchFindSubmatchIndex) 213 | for i := 0; i < b.N; i++ { 214 | benchf() 215 | } 216 | } 217 | 218 | func BenchmarkPCRE2FindStringSubmatchIndex(b *testing.B) { 219 | benchf := makeBenchFunc(b, UsePCRE2Regexp, UseString, FindSubmatchIndexRegex, benchFindSubmatchIndex) 220 | for i := 0; i < b.N; i++ { 221 | benchf() 222 | } 223 | } 224 | 225 | // FindAllSubmatchIndex, FindAllStringSubmatchIndex 226 | const FindAllSubmatchIndexRegex = `(\S+):(\S+)` 227 | 228 | func BenchmarkGoFindAllSubmatchIndex(b *testing.B) { 229 | benchf := makeBenchFunc(b, UseGoRegexp, UseBytes, FindAllSubmatchIndexRegex, benchFindAllSubmatchIndex) 230 | for i := 0; i < b.N; i++ { 231 | benchf() 232 | } 233 | } 234 | 235 | func BenchmarkPCRE2FindAllSubmatchIndex(b *testing.B) { 236 | benchf := makeBenchFunc(b, UsePCRE2Regexp, UseBytes, FindAllSubmatchIndexRegex, benchFindAllSubmatchIndex) 237 | for i := 0; i < b.N; i++ { 238 | benchf() 239 | } 240 | } 241 | 242 | func BenchmarkGoFindAllStringSubmatchIndex(b *testing.B) { 243 | benchf := makeBenchFunc(b, UseGoRegexp, UseString, FindAllSubmatchIndexRegex, benchFindAllSubmatchIndex) 244 | for i := 0; i < b.N; i++ { 245 | benchf() 246 | } 247 | } 248 | 249 | func BenchmarkPCRE2FindAllStringSubmatchIndex(b *testing.B) { 250 | benchf := makeBenchFunc(b, UsePCRE2Regexp, UseString, FindAllSubmatchIndexRegex, benchFindAllSubmatchIndex) 251 | for i := 0; i < b.N; i++ { 252 | benchf() 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /pcre2_test.go: -------------------------------------------------------------------------------- 1 | package pcre2_test 2 | 3 | import ( 4 | "regexp" 5 | "testing" 6 | 7 | "github.com/lestrrat/go-pcre2" 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestBadPattern(t *testing.T) { 12 | re, err := pcre2.Compile(`^Hello [World!$`) 13 | t.Logf("%s", err) 14 | if !assert.Error(t, err, "Compile 
works") { 15 | return 16 | } 17 | defer re.Free() 18 | } 19 | 20 | func TestBasic(t *testing.T) { 21 | re, err := pcre2.Compile(`^Hello (.+)!$`) 22 | if !assert.NoError(t, err, "Compile works") { 23 | return 24 | } 25 | defer re.Free() 26 | 27 | patterns := []string{`Hello World!`, `Hello Friend!`, `Hello 友達!`} 28 | for _, pat := range patterns { 29 | t.Logf("Matching against []byte '%s' (expect MATCH)", pat) 30 | if !assert.True(t, re.Match([]byte(pat)), "Match succeeds for %s", pat) { 31 | return 32 | } 33 | 34 | t.Logf("Matching against string '%s' (expect MATCH)", pat) 35 | if !assert.True(t, re.MatchString(pat), "MatchString succeeds for %s", pat) { 36 | return 37 | } 38 | } 39 | 40 | patterns = []string{`Goodbye World!`, `Hell no!`, `HelloWorld!`} 41 | for _, pat := range patterns { 42 | t.Logf("Matching against []byte '%s' (expect FAIL)", pat) 43 | if !assert.False(t, re.Match([]byte(pat)), "Match fails for %s", pat) { 44 | return 45 | } 46 | 47 | t.Logf("Matching against string '%s' (expect FAIL)", pat) 48 | if !assert.False(t, re.MatchString(pat), "MatchString fails for %s", pat) { 49 | return 50 | } 51 | } 52 | } 53 | 54 | func TestFind(t *testing.T) { 55 | pattern := `(\S+):(\S+)` 56 | gore, err := regexp.Compile(pattern) 57 | if !assert.NoError(t, err, "Compile works (Go)") { 58 | return 59 | } 60 | 61 | re, err := pcre2.Compile(pattern) 62 | if !assert.NoError(t, err, "Compile works (pcre2)") { 63 | return 64 | } 65 | defer re.Free() 66 | 67 | data := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 68 | for _, doString := range []bool{true, false} { 69 | var methodName string 70 | if doString { 71 | methodName = "FindString" 72 | } else { 73 | methodName = "Find" 74 | } 75 | 76 | var expected interface{} 77 | var ret interface{} 78 | 79 | for _, subject := range data { 80 | t.Logf(`%s("%s")`, methodName, subject) 81 | if doString { 82 | expected = gore.FindString(subject) 83 | ret = re.FindString(subject) 84 | } 
else { 85 | expected = gore.Find([]byte(subject)) 86 | ret = re.Find([]byte(subject)) 87 | } 88 | 89 | if !assert.Equal(t, expected, ret, "returned byte sequence should match") { 90 | return 91 | } 92 | } 93 | } 94 | } 95 | 96 | func TestFindIndex(t *testing.T) { 97 | pattern := `(\S+):(\S+)` 98 | gore, err := regexp.Compile(pattern) 99 | if !assert.NoError(t, err, "Compile works (Go)") { 100 | return 101 | } 102 | 103 | re, err := pcre2.Compile(pattern) 104 | if !assert.NoError(t, err, "Compile works (pcre2)") { 105 | return 106 | } 107 | defer re.Free() 108 | 109 | data := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 110 | for _, doString := range []bool{true, false} { 111 | var methodName string 112 | if doString { 113 | methodName = "FindStringIndex" 114 | } else { 115 | methodName = "FindIndex" 116 | } 117 | 118 | var expected interface{} 119 | var ret interface{} 120 | 121 | for _, subject := range data { 122 | t.Logf(`%s("%s")`, methodName, subject) 123 | if doString { 124 | expected = gore.FindStringIndex(subject) 125 | ret = re.FindStringIndex(subject) 126 | } else { 127 | expected = gore.FindIndex([]byte(subject)) 128 | ret = re.FindIndex([]byte(subject)) 129 | } 130 | 131 | if !assert.Equal(t, expected, ret, "returned byte sequence should match") { 132 | return 133 | } 134 | } 135 | } 136 | } 137 | 138 | func TestFindAllIndex(t *testing.T) { 139 | pattern := `(\S+):(\S+)` 140 | gore, err := regexp.Compile(pattern) 141 | if !assert.NoError(t, err, "Compile works (Go)") { 142 | return 143 | } 144 | 145 | re, err := pcre2.Compile(pattern) 146 | if !assert.NoError(t, err, "Compile works (pcre2)") { 147 | return 148 | } 149 | defer re.Free() 150 | 151 | data := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 152 | for _, doString := range []bool{true, false} { 153 | var methodName string 154 | if doString { 155 | methodName = "FindAllStringIndex" 156 | } else { 157 | methodName 
= "FindAllIndex" 158 | } 159 | 160 | for n := -1; n < 4; n++ { 161 | var expected [][]int 162 | var ret [][]int 163 | 164 | for _, subject := range data { 165 | t.Logf(`%s("%s", %d)`, methodName, subject, n) 166 | if doString { 167 | expected = gore.FindAllStringIndex(subject, n) 168 | ret = re.FindAllStringIndex(subject, n) 169 | } else { 170 | expected = gore.FindAllIndex([]byte(subject), n) 171 | ret = re.FindAllIndex([]byte(subject), n) 172 | } 173 | 174 | if !assert.Equal(t, expected, ret, "indices should match") { 175 | return 176 | } 177 | } 178 | } 179 | } 180 | } 181 | 182 | func TestFindAll(t *testing.T) { 183 | pattern := `(\S+):(\S+)` 184 | gore, err := regexp.Compile(pattern) 185 | if !assert.NoError(t, err, "Compile works (Go)") { 186 | return 187 | } 188 | 189 | re, err := pcre2.Compile(pattern) 190 | if !assert.NoError(t, err, "Compile works (pcre2)") { 191 | return 192 | } 193 | defer re.Free() 194 | 195 | data := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 196 | for _, doString := range []bool{true, false} { 197 | var methodName string 198 | if doString { 199 | methodName = "FindAllString" 200 | } else { 201 | methodName = "FindAll" 202 | } 203 | 204 | for n := -1; n < 4; n++ { 205 | var expected interface{} 206 | var ret interface{} 207 | 208 | for _, subject := range data { 209 | t.Logf(`%s("%s", %d)`, methodName, subject, n) 210 | if doString { 211 | expected = gore.FindAllString(subject, n) 212 | ret = re.FindAllString(subject, n) 213 | } else { 214 | expected = gore.FindAll([]byte(subject), n) 215 | ret = re.FindAll([]byte(subject), n) 216 | } 217 | 218 | if !assert.Equal(t, expected, ret, "indices should match") { 219 | return 220 | } 221 | } 222 | } 223 | } 224 | } 225 | 226 | func TestFindAllSubmatchIndex(t *testing.T) { 227 | pattern := `(\S+):(\S+)` 228 | gore, err := regexp.Compile(pattern) 229 | if !assert.NoError(t, err, "Compile works (Go)") { 230 | return 231 | } 232 | 233 | re, err := 
pcre2.Compile(pattern) 234 | if !assert.NoError(t, err, "Compile works (pcre2)") { 235 | return 236 | } 237 | defer re.Free() 238 | 239 | data := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 240 | for _, doString := range []bool{true, false} { 241 | var methodName string 242 | if doString { 243 | methodName = "FindAllStringSubmatchIndex" 244 | } else { 245 | methodName = "FindAllSubmatcnIndex" 246 | } 247 | 248 | for n := -1; n < 4; n++ { 249 | var expected [][]int 250 | var ret [][]int 251 | 252 | for _, subject := range data { 253 | t.Logf(`%s("%s", %d)`, methodName, subject, n) 254 | if doString { 255 | expected = gore.FindAllStringSubmatchIndex(subject, n) 256 | ret = re.FindAllStringSubmatchIndex(subject, n) 257 | } else { 258 | expected = gore.FindAllSubmatchIndex([]byte(subject), n) 259 | ret = re.FindAllSubmatchIndex([]byte(subject), n) 260 | } 261 | 262 | if !assert.Equal(t, expected, ret, "indices should match") { 263 | return 264 | } 265 | } 266 | } 267 | } 268 | } 269 | 270 | func TestFindAllSubmatch(t *testing.T) { 271 | pattern := `(\S+):(\S+)` 272 | gore, err := regexp.Compile(pattern) 273 | if !assert.NoError(t, err, "Compile works (Go)") { 274 | return 275 | } 276 | 277 | re, err := pcre2.Compile(pattern) 278 | if !assert.NoError(t, err, "Compile works (pcre2)") { 279 | return 280 | } 281 | defer re.Free() 282 | 283 | data := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 284 | for _, doString := range []bool{true, false} { 285 | var methodName string 286 | if doString { 287 | methodName = "FindAllStringSubmatch" 288 | } else { 289 | methodName = "FindAllSubmatcn" 290 | } 291 | 292 | for n := -1; n < 4; n++ { 293 | var expected interface{} 294 | var ret interface{} 295 | 296 | for _, subject := range data { 297 | t.Logf(`%s("%s", %d)`, methodName, subject, n) 298 | if doString { 299 | expected = gore.FindAllStringSubmatch(subject, n) 300 | ret = 
re.FindAllStringSubmatch(subject, n) 301 | } else { 302 | expected = gore.FindAllSubmatch([]byte(subject), n) 303 | ret = re.FindAllSubmatch([]byte(subject), n) 304 | } 305 | 306 | if !assert.Equal(t, expected, ret, "indices should match") { 307 | return 308 | } 309 | } 310 | } 311 | } 312 | } 313 | 314 | func TestFindSubmatch(t *testing.T) { 315 | pattern := `(\S+):(\S+)` 316 | gore, err := regexp.Compile(pattern) 317 | if !assert.NoError(t, err, "Compile works (Go)") { 318 | return 319 | } 320 | 321 | re, err := pcre2.Compile(pattern) 322 | if !assert.NoError(t, err, "Compile works (pcre2)") { 323 | return 324 | } 325 | defer re.Free() 326 | 327 | data := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 328 | for _, doString := range []bool{true, false} { 329 | var methodName string 330 | if doString { 331 | methodName = "FindStringSubmatch" 332 | } else { 333 | methodName = "FindSubmatcn" 334 | } 335 | 336 | var expected interface{} 337 | var ret interface{} 338 | 339 | for _, subject := range data { 340 | t.Logf(`%s("%s")`, methodName, subject) 341 | if doString { 342 | expected = gore.FindStringSubmatch(subject) 343 | ret = re.FindStringSubmatch(subject) 344 | } else { 345 | expected = gore.FindSubmatch([]byte(subject)) 346 | ret = re.FindSubmatch([]byte(subject)) 347 | } 348 | 349 | if !assert.Equal(t, expected, ret, "indices should match") { 350 | return 351 | } 352 | } 353 | } 354 | } 355 | 356 | func TestFindSubmatchIndex(t *testing.T) { 357 | pattern := `(\S+):(\S+)` 358 | gore, err := regexp.Compile(pattern) 359 | if !assert.NoError(t, err, "Compile works (Go)") { 360 | return 361 | } 362 | 363 | re, err := pcre2.Compile(pattern) 364 | if !assert.NoError(t, err, "Compile works (pcre2)") { 365 | return 366 | } 367 | defer re.Free() 368 | 369 | data := []string{`Alice:35 Bob:42 Charlie:21`, `桃:三年 栗:三年 柿:八年`, `vini:came vidi:saw vici:won`} 370 | for _, doString := range []bool{true, false} { 371 | var methodName string 
372 | if doString { 373 | methodName = "FindStringSubmatchIndex" 374 | } else { 375 | methodName = "FindSubmatcnIndex" 376 | } 377 | 378 | var expected interface{} 379 | var ret interface{} 380 | 381 | for _, subject := range data { 382 | t.Logf(`%s("%s")`, methodName, subject) 383 | if doString { 384 | expected = gore.FindStringSubmatchIndex(subject) 385 | ret = re.FindStringSubmatchIndex(subject) 386 | } else { 387 | expected = gore.FindSubmatchIndex([]byte(subject)) 388 | ret = re.FindSubmatchIndex([]byte(subject)) 389 | } 390 | 391 | if !assert.Equal(t, expected, ret, "indices should match") { 392 | return 393 | } 394 | } 395 | } 396 | } 397 | 398 | -------------------------------------------------------------------------------- /pcre2.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package pcre2 is a wrapper around PCRE2 C library. This library aims to 3 | provide compatible API as that of regexp package from Go stdlib. 4 | 5 | Note that while PCRE2 provides support for 8, 16, and 32 bit inputs, 6 | this library assumes UTF-8 (32bit) input. Therefore if you use anything 7 | other than UTF-8, matches will not succeed. 8 | */ 9 | package pcre2 10 | 11 | /* 12 | #define PCRE2_CODE_UNIT_WIDTH 32 13 | #cgo pkg-config: libpcre2-32 14 | #include 15 | #include 16 | #include 17 | 18 | #define MY_PCRE2_ERROR_MESSAGE_BUF_LEN 256 19 | static 20 | void * 21 | MY_pcre2_get_error_message(int errnum) { 22 | PCRE2_UCHAR *buf = (PCRE2_UCHAR *) malloc(sizeof(PCRE2_UCHAR) * MY_PCRE2_ERROR_MESSAGE_BUF_LEN); 23 | pcre2_get_error_message(errnum, buf, MY_PCRE2_ERROR_MESSAGE_BUF_LEN); 24 | return buf; 25 | } 26 | 27 | */ 28 | import "C" 29 | import ( 30 | "fmt" 31 | "reflect" 32 | "unicode/utf8" 33 | "unsafe" 34 | ) 35 | 36 | // Error returns the string representation of the error. 
37 | func (e ErrCompile) Error() string { 38 | return fmt.Sprintf("PCRE2 compilation failed at offset %d: %s", e.offset, e.message) 39 | } 40 | 41 | func strToRuneArray(s string) ([]rune, []int, error) { 42 | rs := []rune{} 43 | ls := []int{} // length of each rune 44 | for len(s) > 0 { 45 | r, n := utf8.DecodeRuneInString(s) 46 | if r == utf8.RuneError { 47 | return nil, nil, ErrInvalidUTF8String 48 | } 49 | s = s[n:] 50 | rs = append(rs, r) 51 | ls = append(ls, n) 52 | } 53 | return rs, ls, nil 54 | } 55 | 56 | func bytesToRuneArray(b []byte) ([]rune, []int, error) { 57 | rs := []rune{} // actual runes 58 | ls := []int{} // length of each rune 59 | for len(b) > 0 { 60 | r, n := utf8.DecodeRune(b) 61 | if r == utf8.RuneError { 62 | return nil, nil, ErrInvalidUTF8String 63 | } 64 | b = b[n:] 65 | rs = append(rs, r) 66 | ls = append(ls, n) 67 | } 68 | return rs, ls, nil 69 | } 70 | 71 | // Compile takes the input string and creates a compiled Regexp object. 72 | // Regexp objects created by Compile must be released by calling Free 73 | func Compile(pattern string) (*Regexp, error) { 74 | patc, _, err := strToRuneArray(pattern) 75 | if err != nil { 76 | return nil, err 77 | } 78 | 79 | var errnum C.int 80 | var erroff C.PCRE2_SIZE 81 | re := C.pcre2_compile( 82 | (C.PCRE2_SPTR)(unsafe.Pointer(&patc[0])), 83 | C.size_t(len(patc)), 84 | 0, 85 | &errnum, 86 | &erroff, 87 | nil, 88 | ) 89 | if re == nil { 90 | rawbytes := C.MY_pcre2_get_error_message(errnum) 91 | msg := C.GoBytes(rawbytes, 32/8*256) 92 | defer C.free(unsafe.Pointer(rawbytes)) 93 | 94 | return nil, ErrCompile{ 95 | pattern: pattern, 96 | offset: int(erroff), 97 | message: string(msg), 98 | } 99 | } 100 | return &Regexp{ 101 | pattern: pattern, 102 | ptr: uintptr(unsafe.Pointer(re)), 103 | }, nil 104 | } 105 | 106 | // MustCompile is like Compile but panics if the expression cannot be 107 | // parsed. 
108 | func MustCompile(pattern string) *Regexp { 109 | r, err := Compile(pattern) 110 | if err != nil { 111 | panic(err) 112 | } 113 | return r 114 | } 115 | 116 | func (r *Regexp) validRegexpPtr() (*C.pcre2_code, error) { 117 | if r == nil { 118 | return nil, ErrInvalidRegexp 119 | } 120 | 121 | if rptr := r.ptr; rptr != 0 { 122 | return (*C.pcre2_code)(unsafe.Pointer(rptr)), nil 123 | } 124 | return nil, ErrInvalidRegexp 125 | } 126 | 127 | // Free releases the underlying C resources 128 | func (r *Regexp) Free() error { 129 | rptr, err := r.validRegexpPtr() 130 | if err != nil { 131 | return err 132 | } 133 | C.pcre2_code_free(rptr) 134 | r.ptr = 0 135 | return nil 136 | } 137 | 138 | // String returns the source text used to compile the regular expression. 139 | func (r Regexp) String() string { 140 | return r.pattern 141 | } 142 | 143 | func (r *Regexp) Match(b []byte) bool { 144 | rs, _, err := bytesToRuneArray(b) 145 | if err != nil { 146 | return false 147 | } 148 | return r.matchRuneArray(rs, 0, 0, nil) >= 0 149 | } 150 | 151 | func (r *Regexp) MatchString(s string) bool { 152 | rs, _, err := strToRuneArray(s) 153 | if err != nil { 154 | return false 155 | } 156 | return r.matchRuneArray(rs, 0, 0, nil) >= 0 157 | } 158 | 159 | func (r *Regexp) matchRuneArray(rs []rune, offset int, options int, matchData *C.pcre2_match_data) int { 160 | rptr, err := r.validRegexpPtr() 161 | if err != nil { 162 | return -1 163 | } 164 | 165 | if matchData == nil { 166 | matchData = C.pcre2_match_data_create_from_pattern(rptr, nil) 167 | defer C.pcre2_match_data_free(matchData) 168 | } 169 | 170 | rc := C.pcre2_match( 171 | rptr, 172 | (C.PCRE2_SPTR)(unsafe.Pointer(&rs[0])), 173 | C.size_t(len(rs)), 174 | (C.PCRE2_SIZE)(offset), 175 | (C.uint32_t)(options), 176 | matchData, 177 | nil, 178 | ) 179 | 180 | return int(rc) 181 | } 182 | 183 | func pcre2GetOvectorPointer(matchData *C.pcre2_match_data, howmany int) []C.size_t { 184 | ovector := 
C.pcre2_get_ovector_pointer(matchData) 185 | // Note that by doing this SliceHeader maigc, we allow Go 186 | // slice syntax but Go doesn't own the underlying pointer. 187 | // We need to free it. In this case, it means the caller 188 | // must remember to free matchData 189 | hdr := reflect.SliceHeader{ 190 | Data: uintptr(unsafe.Pointer(ovector)), 191 | Len: howmany * 2, 192 | Cap: howmany * 2, 193 | } 194 | return *(*[]C.size_t)(unsafe.Pointer(&hdr)) 195 | } 196 | 197 | func (r *Regexp) HasOption(opt int) bool { 198 | rptr, err := r.validRegexpPtr() 199 | if err != nil { 200 | return false 201 | } 202 | 203 | var i C.uint32_t 204 | C.pcre2_pattern_info(rptr, C.PCRE2_INFO_ALLOPTIONS, unsafe.Pointer(&i)) 205 | return (uint32(i) & uint32(opt)) != 0 206 | } 207 | 208 | func (r *Regexp) isCRLFValid() bool { 209 | rptr, err := r.validRegexpPtr() 210 | if err != nil { 211 | return false 212 | } 213 | 214 | var i C.uint32_t 215 | C.pcre2_pattern_info(rptr, C.PCRE2_INFO_NEWLINE, unsafe.Pointer(&i)) 216 | switch i { 217 | case C.PCRE2_NEWLINE_ANY, C.PCRE2_NEWLINE_CRLF, C.PCRE2_NEWLINE_ANYCRLF: 218 | return true 219 | } 220 | 221 | return false 222 | } 223 | 224 | func (r *Regexp) FindIndex(b []byte) []int { 225 | rs, ls, err := bytesToRuneArray(b) 226 | if err != nil { 227 | return nil 228 | } 229 | 230 | is := r.findAllIndex(rs, ls, 1) 231 | if len(is) != 1 { 232 | return nil 233 | } 234 | return is[0] 235 | } 236 | 237 | func (r *Regexp) Find(b []byte) []byte { 238 | is := r.FindIndex(b) 239 | if is == nil { 240 | return nil 241 | } 242 | return b[is[0]:is[1]] 243 | } 244 | 245 | func (r *Regexp) FindStringIndex(s string) []int { 246 | rs, ls, err := strToRuneArray(s) 247 | if err != nil { 248 | return nil 249 | } 250 | 251 | is := r.findAllIndex(rs, ls, 1) 252 | if len(is) != 1 { 253 | return nil 254 | } 255 | return is[0] 256 | } 257 | 258 | func (r *Regexp) FindSubmatch(b []byte) [][]byte { 259 | matches := r.FindSubmatchIndex(b) 260 | if matches == nil { 261 | 
return nil 262 | } 263 | 264 | ret := make([][]byte, 0, len(matches)/2) 265 | for i := 0; i < len(matches)/2; i++ { 266 | ret = append(ret, b[matches[2*i]:matches[2*i+1]]) 267 | } 268 | return ret 269 | } 270 | 271 | func (r *Regexp) FindSubmatchIndex(b []byte) []int { 272 | rs, ls, err := bytesToRuneArray(b) 273 | if err != nil { 274 | return nil 275 | } 276 | return r.findSubmatchIndex(rs, ls) 277 | } 278 | 279 | func (r *Regexp) FindStringSubmatchIndex(s string) []int { 280 | rs, ls, err := strToRuneArray(s) 281 | if err != nil { 282 | return nil 283 | } 284 | return r.findSubmatchIndex(rs, ls) 285 | } 286 | 287 | func (r *Regexp) findSubmatchIndex(rs []rune, ls []int) []int { 288 | rptr, err := r.validRegexpPtr() 289 | if err != nil { 290 | return nil 291 | } 292 | 293 | matchData := C.pcre2_match_data_create_from_pattern(rptr, nil) 294 | defer C.pcre2_match_data_free(matchData) 295 | 296 | out := []int(nil) 297 | options := 0 298 | 299 | count := r.matchRuneArray(rs, 0, options, matchData) 300 | if count <= 0 { 301 | return nil 302 | } 303 | 304 | ovector := pcre2GetOvectorPointer(matchData, count) 305 | for i := 0; i < count; i++ { 306 | ovec0 := int(ovector[2*i]) 307 | b1 := 0 308 | for x := 0; x < ovec0; x++ { 309 | b1 += ls[x] 310 | } 311 | 312 | ovec1 := int(ovector[2*i+1]) 313 | b2 := b1 314 | for x := ovec0; x < ovec1; x++ { 315 | b2 += ls[x] 316 | } 317 | out = append(out, []int{b1, b2}...) 
318 | } 319 | 320 | return out 321 | } 322 | 323 | func (r *Regexp) FindStringSubmatch(s string) []string { 324 | matches := r.FindStringSubmatchIndex(s) 325 | if matches == nil { 326 | return nil 327 | } 328 | 329 | ret := make([]string, 0, len(matches)) 330 | for i := 0; i < len(matches)/2; i++ { 331 | ret = append(ret, s[matches[2*i]:matches[2*i+1]]) 332 | } 333 | return ret 334 | } 335 | 336 | func (r *Regexp) FindString(s string) string { 337 | is := r.FindStringIndex(s) 338 | if is == nil { 339 | return "" 340 | } 341 | return s[is[0]:is[1]] 342 | } 343 | 344 | func (r *Regexp) FindAll(b []byte, n int) [][]byte { 345 | rs, ls, err := bytesToRuneArray(b) 346 | if err != nil { 347 | return nil 348 | } 349 | ret := [][]byte(nil) 350 | for _, is := range r.findAllIndex(rs, ls, n) { 351 | ret = append(ret, b[is[0]:is[1]]) 352 | } 353 | return ret 354 | } 355 | 356 | func (r *Regexp) FindAllString(s string, n int) []string { 357 | if n == 0 { 358 | return nil 359 | } 360 | 361 | rs, ls, err := strToRuneArray(s) 362 | if err != nil { 363 | return nil 364 | } 365 | ret := []string{} 366 | for _, is := range r.findAllIndex(rs, ls, n) { 367 | ret = append(ret, s[is[0]:is[1]]) 368 | if n > 0 && len(ret) >= n { 369 | break 370 | } 371 | } 372 | return ret 373 | } 374 | 375 | func (r *Regexp) findAllIndex(rs []rune, ls []int, n int) [][]int { 376 | if n == 0 { 377 | return nil 378 | } 379 | 380 | rptr, err := r.validRegexpPtr() 381 | if err != nil { 382 | return nil 383 | } 384 | 385 | matchData := C.pcre2_match_data_create_from_pattern(rptr, nil) 386 | defer C.pcre2_match_data_free(matchData) 387 | 388 | out := [][]int(nil) 389 | offset := 0 390 | options := 0 391 | for len(rs) > 0 { 392 | count := r.matchRuneArray(rs, 0, options, matchData) 393 | if count <= 0 { 394 | break 395 | } 396 | 397 | ovector := pcre2GetOvectorPointer(matchData, count) 398 | ovec0 := int(ovector[0]) 399 | b1 := 0 400 | for x := 0; x < ovec0; x++ { 401 | b1 += ls[x] 402 | } 403 | b2 := b1 404 | 
for x := ovec0; x < int(ovector[1]); x++ { 405 | b2 += ls[x] 406 | } 407 | out = append(out, []int{offset + b1, offset + b2}) 408 | units := int(ovector[1]) 409 | for x := 0; x < units; x++ { 410 | offset += ls[x] 411 | } 412 | 413 | rs = rs[units:] 414 | ls = ls[units:] 415 | 416 | if n > 0 && len(out) >= n { 417 | break 418 | } 419 | } 420 | 421 | return out 422 | } 423 | 424 | func (r *Regexp) FindAllIndex(b []byte, n int) [][]int { 425 | rs, ls, err := bytesToRuneArray(b) 426 | if err != nil { 427 | return nil 428 | } 429 | return r.findAllIndex(rs, ls, n) 430 | } 431 | 432 | func (r *Regexp) FindAllStringIndex(s string, n int) [][]int { 433 | rs, ls, err := strToRuneArray(s) 434 | if err != nil { 435 | return nil 436 | } 437 | return r.findAllIndex(rs, ls, n) 438 | } 439 | 440 | func (r *Regexp) findAllSubmatchIndex(rs []rune, ls []int, n int) [][]int { 441 | if n == 0 { 442 | return nil 443 | } 444 | 445 | rptr, err := r.validRegexpPtr() 446 | if err != nil { 447 | return nil 448 | } 449 | 450 | matchData := C.pcre2_match_data_create_from_pattern(rptr, nil) 451 | defer C.pcre2_match_data_free(matchData) 452 | 453 | out := [][]int(nil) 454 | offset := 0 455 | options := 0 456 | for len(rs) > 0 { 457 | count := r.matchRuneArray(rs, 0, options, matchData) 458 | if count <= 0 { 459 | break 460 | } 461 | 462 | ovector := pcre2GetOvectorPointer(matchData, count) 463 | curmatch := make([]int, 0, count) 464 | for i := 0; i < count; i++ { 465 | ovec2i := int(ovector[2*i]) 466 | 467 | b1 := 0 468 | for x := 0; x < ovec2i; x++ { 469 | b1 += ls[x] 470 | } 471 | b2 := b1 472 | for x := ovec2i; x < int(ovector[2*i+1]); x++ { 473 | b2 += ls[x] 474 | } 475 | curmatch = append(curmatch, offset+b1, offset+b2) 476 | } 477 | out = append(out, curmatch) 478 | 479 | units := int(ovector[1]) 480 | for x := 0; x < units; x++ { 481 | offset += ls[x] 482 | } 483 | 484 | rs = rs[units:] 485 | ls = ls[units:] 486 | 487 | if n > 0 && len(out) >= n { 488 | break 489 | } 490 | } 491 | 492 
| return out 493 | } 494 | 495 | func (r *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 496 | rs, ls, err := bytesToRuneArray(b) 497 | if err != nil { 498 | return nil 499 | } 500 | 501 | all := r.findAllSubmatchIndex(rs, ls, n) 502 | if all == nil { 503 | return nil 504 | } 505 | 506 | ret := make([][][]byte, 0, len(all)) 507 | for _, is := range all { 508 | l := len(is) / 2 509 | cur := make([][]byte, 0, l) 510 | for i := 0; i < l; i++ { 511 | cur = append(cur, b[is[2*i]:is[2*i+1]]) 512 | } 513 | 514 | ret = append(ret, cur) 515 | if n > 0 && len(ret) >= n { 516 | break 517 | } 518 | } 519 | return ret 520 | } 521 | 522 | func (r *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 523 | rs, ls, err := strToRuneArray(s) 524 | if err != nil { 525 | return nil 526 | } 527 | 528 | all := r.findAllSubmatchIndex(rs, ls, n) 529 | if all == nil { 530 | return nil 531 | } 532 | 533 | ret := make([][]string, 0, len(all)) 534 | for _, is := range all { 535 | l := len(is) / 2 536 | cur := make([]string, 0, l) 537 | for i := 0; i < l; i++ { 538 | cur = append(cur, s[is[2*i]:is[2*i+1]]) 539 | } 540 | ret = append(ret, cur) 541 | if n > 0 && len(ret) >= n { 542 | break 543 | } 544 | } 545 | return ret 546 | } 547 | 548 | func (r *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 549 | rs, ls, err := bytesToRuneArray(b) 550 | if err != nil { 551 | return nil 552 | } 553 | return r.findAllSubmatchIndex(rs, ls, n) 554 | } 555 | 556 | func (r *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 557 | rs, ls, err := strToRuneArray(s) 558 | if err != nil { 559 | return nil 560 | } 561 | return r.findAllSubmatchIndex(rs, ls, n) 562 | } 563 | --------------------------------------------------------------------------------