├── go.mod ├── .github └── workflows │ └── main.yaml ├── _samples └── gosax-count.go ├── xmlb ├── example_test.go └── xmlb.go ├── bench_test.go ├── LICENSE ├── reader.go ├── README.md ├── compat.go ├── example_test.go └── gosax.go /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/orisano/gosax 2 | 3 | go 1.22.2 4 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: main 2 | on: push 3 | jobs: 4 | build: 5 | runs-on: ubuntu-slim 6 | strategy: 7 | matrix: 8 | go: [ '1.23', '1.24', '1.25' ] 9 | name: go ${{ matrix.go }} 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: actions/setup-go@v5 13 | with: 14 | go-version: ${{ matrix.go }} 15 | - run: go test -race -v ./... 16 | -------------------------------------------------------------------------------- /_samples/gosax-count.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "log" 7 | "os" 8 | 9 | "github.com/orisano/gosax" 10 | ) 11 | 12 | func main() { 13 | f, err := os.Open(os.Args[1]) 14 | if err != nil { 15 | log.Fatal(err) 16 | } 17 | defer f.Close() 18 | 19 | r := gosax.NewReader(f) 20 | count := 0 21 | inLocation := false 22 | for { 23 | e, err := r.Event() 24 | if err != nil { 25 | log.Fatal(err) 26 | } 27 | if e.Type() == gosax.EventEOF { 28 | break 29 | } 30 | switch e.Type() { 31 | case gosax.EventStart: 32 | name, _ := gosax.Name(e.Bytes) 33 | if string(name) == "location" { 34 | inLocation = true 35 | } else { 36 | inLocation = false 37 | } 38 | case gosax.EventEnd: 39 | inLocation = false 40 | case gosax.EventText: 41 | if inLocation { 42 | if bytes.Contains(e.Bytes, []byte("Africa")) { 43 | count++ 44 | } 45 | } 46 | default: 47 | } 48 | } 49 | fmt.Println("counter =", count) 50 | } 51 | -------------------------------------------------------------------------------- /xmlb/example_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2024, Nao Yonashiro 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | package xmlb_test 28 | 29 | import ( 30 | "fmt" 31 | "io" 32 | "strings" 33 | 34 | "github.com/orisano/gosax/xmlb" 35 | ) 36 | 37 | func Example() { 38 | r := strings.NewReader(`Value`) 39 | d := xmlb.NewDecoder(r, make([]byte, 64*1024)) 40 | for { 41 | tok, err := d.Token() 42 | if err == io.EOF { 43 | break 44 | } 45 | if err != nil { 46 | break 47 | } 48 | switch tok.Type() { 49 | case xmlb.StartElement: 50 | t, _ := tok.StartElement() 51 | fmt.Println("StartElement", t.Name.Local) 52 | case xmlb.CharData: 53 | t, _ := tok.CharData() 54 | fmt.Println("CharData", string(t)) 55 | case xmlb.EndElement: 56 | fmt.Println("EndElement", string(tok.Name().Local())) 57 | } 58 | } 59 | // Output: 60 | // StartElement root 61 | // StartElement element 62 | // CharData Value 63 | // EndElement element 64 | // EndElement root 65 | } 66 | -------------------------------------------------------------------------------- /bench_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2024, Nao Yonashiro 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | package gosax_test 28 | 29 | import ( 30 | "bytes" 31 | "os" 32 | "testing" 33 | 34 | "github.com/orisano/gosax" 35 | ) 36 | 37 | func BenchmarkReader_Event(b *testing.B) { 38 | b.ReportAllocs() 39 | for i := 0; i < b.N; i++ { 40 | if err := countAfrica(b); err != nil { 41 | b.Fatal(err) 42 | } 43 | } 44 | } 45 | 46 | func countAfrica(b *testing.B) error { 47 | f, err := os.Open("testdata/out.xml") 48 | if err != nil { 49 | return err 50 | } 51 | defer f.Close() 52 | if stat, err := f.Stat(); err == nil { 53 | b.SetBytes(stat.Size()) 54 | } 55 | 56 | r := gosax.NewReader(f) 57 | count := 0 58 | inLocation := false 59 | for { 60 | e, err := r.Event() 61 | if err != nil { 62 | return err 63 | } 64 | if e.Type() == gosax.EventEOF { 65 | break 66 | } 67 | switch e.Type() { 68 | case gosax.EventStart: 69 | name, _ := gosax.Name(e.Bytes) 70 | if string(name) == "location" { 71 | inLocation = true 72 | } else { 73 | inLocation = false 74 | } 75 | case gosax.EventEnd: 76 | inLocation = false 77 | case gosax.EventText: 78 | if inLocation { 79 | if bytes.Contains(e.Bytes, []byte("Africa")) { 80 | count++ 81 | } 82 | } 83 | default: 84 | } 85 | } 86 | return nil 87 | } 88 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024, Nao Yonashiro 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | Additional Notices: 26 | 27 | This project includes a file licensed under the BSD 2-Clause License: 28 | 29 | reader.go 30 | --------------------- 31 | 32 | Copyright (c) 2020, Dave Cheney 33 | All rights reserved. 34 | 35 | Redistribution and use in source and binary forms, with or without 36 | modification, are permitted provided that the following conditions are met: 37 | 38 | * Redistributions of source code must retain the above copyright notice, this 39 | list of conditions and the following disclaimer. 40 | 41 | * Redistributions in binary form must reproduce the above copyright notice, 42 | this list of conditions and the following disclaimer in the documentation 43 | and/or other materials provided with the distribution. 44 | 45 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 46 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 48 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 49 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 51 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 52 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 53 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 54 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 55 | -------------------------------------------------------------------------------- /reader.go: -------------------------------------------------------------------------------- 1 | /* 2 | This file is based on the source code available at https://github.com/pkg/json under BSD-2-Clause License 3 | 4 | Copyright (c) 2020, Dave Cheney 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | package gosax 30 | 31 | import "io" 32 | 33 | // A byteReader implements a sliding window over an io.Reader. 34 | type byteReader struct { 35 | data []byte 36 | offset int 37 | r io.Reader 38 | err error 39 | } 40 | 41 | // release discards n bytes from the front of the window. 42 | func (b *byteReader) release(n int) { 43 | b.offset += n 44 | } 45 | 46 | // window returns the current window. 47 | // The window is invalidated by calls to release or extend. 48 | func (b *byteReader) window() []byte { 49 | return b.data[b.offset:] 50 | } 51 | 52 | // tuning constants for byteReader.extend. 53 | const ( 54 | newBufferSize = 4096 55 | minReadSize = newBufferSize >> 2 56 | ) 57 | 58 | // extend extends the window with data from the underlying reader. 59 | func (b *byteReader) extend() int { 60 | if b.err != nil { 61 | return 0 62 | } 63 | 64 | remaining := len(b.data) - b.offset 65 | if remaining == 0 { 66 | b.data = b.data[:0] 67 | b.offset = 0 68 | } 69 | if cap(b.data)-len(b.data) >= minReadSize { 70 | // nothing to do, enough space exists between len and cap. 71 | } else if cap(b.data)-remaining >= minReadSize { 72 | // buffer has enough space if we move the data to the front. 73 | b.compact() 74 | } else { 75 | // otherwise, we must allocate/extend a new buffer 76 | b.grow() 77 | } 78 | remaining += b.offset 79 | n, err := b.r.Read(b.data[remaining:cap(b.data)]) 80 | // reduce length to the existing plus the data we read. 81 | b.data = b.data[:remaining+n] 82 | b.err = err 83 | return n 84 | } 85 | 86 | // grow grows the buffer, moving the active data to the front. 87 | func (b *byteReader) grow() { 88 | buf := make([]byte, max(cap(b.data)*2, newBufferSize)) 89 | copy(buf, b.data[b.offset:]) 90 | b.data = buf 91 | b.offset = 0 92 | } 93 | 94 | // compact moves the active data to the front of the buffer. 95 | func (b *byteReader) compact() { 96 | copy(b.data, b.data[b.offset:]) 97 | b.offset = 0 98 | } 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gosax 2 | 3 | [![Go Reference](https://pkg.go.dev/badge/github.com/orisano/gosax.svg)](https://pkg.go.dev/github.com/orisano/gosax) 4 | 5 | `gosax` is a Go library for XML SAX (Simple API for XML) parsing, supporting read-only functionality. This library is 6 | designed for efficient and memory-conscious XML parsing, drawing inspiration from various sources to provide a 7 | performant parser. 8 | 9 | ## Features 10 | 11 | - **Read-only SAX parsing**: Stream and process XML documents without loading the entire document into memory. 12 | - **Efficient parsing**: Utilizes techniques inspired by `quick-xml` and `pkg/json` for high performance. 13 | - **SWAR (SIMD Within A Register)**: Optimizations for fast text processing, inspired by `memchr`. 14 | - **Compatibility with encoding/xml**: Includes utility functions to bridge `gosax` types with `encoding/xml` types, facilitating easy integration with existing code that uses the standard library. 15 | 16 | ## Benchmark 17 | ``` 18 | goos: darwin 19 | goarch: arm64 20 | pkg: github.com/orisano/gosax 21 | BenchmarkReader_Event-12 5 211845800 ns/op 1103.30 MB/s 2097606 B/op 6 allocs/op 22 | ``` 23 | 24 | ## Installation 25 | 26 | To install `gosax`, use `go get`: 27 | 28 | ```bash 29 | go get github.com/orisano/gosax 30 | ``` 31 | 32 | ## Usage 33 | 34 | Here is a basic example of how to use `gosax` to parse an XML document: 35 | 36 | ```go 37 | package main 38 | 39 | import ( 40 | "fmt" 41 | "log" 42 | "strings" 43 | 44 | "github.com/orisano/gosax" 45 | ) 46 | 47 | func main() { 48 | xmlData := `Value` 49 | reader := strings.NewReader(xmlData) 50 | 51 | r := gosax.NewReader(reader) 52 | for { 53 | e, err := r.Event() 54 | if err != nil { 55 | log.Fatal(err) 56 | } 57 | if e.Type() == gosax.EventEOF { 58 | break 59 | } 60 | fmt.Println(string(e.Bytes)) 61 | } 62 | // Output: 63 | // 64 | // 65 | // Value 66 | // 67 | // 68 | } 69 | 70 | ``` 71 | 72 | ### Bridging with encoding/xml 73 | 74 | **Important Note for encoding/xml Users:** 75 | > When migrating from `encoding/xml` to `gosax`, note that self-closing tags are handled differently. To mimic `encoding/xml` behavior, set `gosax.Reader.EmitSelfClosingTag` to `true`. This ensures self-closing tags are recognized and processed correctly. 76 | 77 | #### Using TokenE 78 | If you are used to `encoding/xml`'s `Token`, start with `gosax.TokenE`. 79 | **Note:** Using `gosax.TokenE` and `gosax.Token` involves memory allocation due to interfaces. 80 | 81 | **Before:** 82 | ```go 83 | var dec *xml.Decoder 84 | for { 85 | tok, err := dec.Token() 86 | if err == io.EOF { 87 | break 88 | } 89 | // ... 90 | } 91 | ``` 92 | 93 | **After:** 94 | ```go 95 | var dec *gosax.Reader 96 | for { 97 | tok, err := gosax.TokenE(dec.Event()) 98 | if err == io.EOF { 99 | break 100 | } 101 | // ... 102 | } 103 | ``` 104 | 105 | #### Utilizing xmlb 106 | `xmlb` is an extension for `gosax` to simplify rewriting code from `encoding/xml`. It provides a higher-performance bridge for XML parsing and processing. 107 | 108 | **Before:** 109 | ```go 110 | var dec *xml.Decoder 111 | for { 112 | tok, err := dec.Token() 113 | if err == io.EOF { 114 | break 115 | } 116 | switch t := tok.(type) { 117 | case xml.StartElement: 118 | // ... 119 | case xml.CharData: 120 | // ... 121 | case xml.EndElement: 122 | // ... 123 | } 124 | } 125 | ``` 126 | 127 | **After:** 128 | ```go 129 | var dec *xmlb.Decoder 130 | for { 131 | tok, err := dec.Token() 132 | if err == io.EOF { 133 | break 134 | } 135 | switch tok.Type() { 136 | case xmlb.StartElement: 137 | t, _ := tok.StartElement() 138 | // ... 139 | case xmlb.CharData: 140 | t, _ := tok.CharData() 141 | // ... 142 | case xmlb.EndElement: 143 | t := tok.EndElement() 144 | // ... 145 | } 146 | } 147 | ``` 148 | 149 | ## License 150 | 151 | This library is licensed under the terms specified in the LICENSE file. 152 | 153 | ## Acknowledgements 154 | 155 | `gosax` is inspired by the following projects and resources: 156 | 157 | - [Dave Cheney's GopherCon SG 2023 Talk](https://dave.cheney.net/paste/gophercon-sg-2023.html) 158 | - [quick-xml](https://github.com/tafia/quick-xml) 159 | - [memchr](https://github.com/BurntSushi/memchr) (SWAR part) 160 | 161 | ## Contributing 162 | 163 | Contributions are welcome! Please fork the repository and submit pull requests. 164 | 165 | ## Contact 166 | 167 | For any questions or feedback, feel free to open an issue on the GitHub repository. 168 | -------------------------------------------------------------------------------- /compat.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2024, Nao Yonashiro 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | // This file contains utility functions to bridge gosax with encoding/xml. 28 | // These functions provide convenient ways to convert gosax types to encoding/xml types, 29 | // facilitating interoperability between the two packages. 30 | 31 | package gosax 32 | 33 | import ( 34 | "bytes" 35 | "encoding/xml" 36 | "errors" 37 | "io" 38 | ) 39 | 40 | var errSyntaxError = errors.New("syntax error") 41 | 42 | // StartElement converts a byte slice to an xml.StartElement. 43 | func StartElement(b []byte) (xml.StartElement, error) { 44 | name, b := Name(b) 45 | e := xml.StartElement{ 46 | Name: xmlName(name), 47 | } 48 | for len(b) > 0 { 49 | var attr Attribute 50 | var err error 51 | attr, b, err = NextAttribute(b) 52 | if err != nil { 53 | return xml.StartElement{}, err 54 | } 55 | if len(attr.Key) == 0 { 56 | break 57 | } 58 | if len(attr.Value) == 0 { 59 | return e, errSyntaxError 60 | } 61 | value, err := Unescape(attr.Value[1 : len(attr.Value)-1]) 62 | if err != nil { 63 | return xml.StartElement{}, err 64 | } 65 | e.Attr = append(e.Attr, xml.Attr{ 66 | Name: xmlName(attr.Key), 67 | Value: string(value), 68 | }) 69 | } 70 | return e, nil 71 | } 72 | 73 | // EndElement converts a byte slice to an xml.EndElement. 74 | func EndElement(b []byte) xml.EndElement { 75 | name, _ := Name(b) 76 | return xml.EndElement{ 77 | Name: xmlName(name), 78 | } 79 | } 80 | 81 | // CharData converts a byte slice to xml.CharData. 82 | func CharData(b []byte) (xml.CharData, error) { 83 | return Unescape(b) 84 | } 85 | 86 | // Comment converts a byte slice to an xml.Comment. 87 | func Comment(b []byte) xml.Comment { 88 | return trim(b, "") 89 | } 90 | 91 | // ProcInst converts a byte slice to an xml.ProcInst. 92 | func ProcInst(b []byte) xml.ProcInst { 93 | name, b := Name(b) 94 | return xml.ProcInst{ 95 | Target: string(name[1:]), 96 | Inst: b[:len(b)-1], 97 | } 98 | } 99 | 100 | // Directive converts a byte slice to an xml.Directive. 101 | func Directive(b []byte) xml.Directive { 102 | return trim(b, "") 103 | } 104 | 105 | // Token converts an Event to an xml.Token. 106 | // This function is provided for convenience, but it may allocate memory. 107 | // 108 | // Note: For performance-critical applications, it's recommended to use 109 | // the direct conversion functions (StartElement, EndElement, CharData, etc.) 110 | // instead of Token, as they allow better control over memory allocations. 111 | func Token(e Event) (xml.Token, error) { 112 | switch e.Type() { 113 | case EventStart: 114 | return StartElement(e.Bytes) 115 | case EventEnd: 116 | return EndElement(e.Bytes), nil 117 | case EventText: 118 | return CharData(e.Bytes) 119 | case EventCData: 120 | return xml.CharData(trim(e.Bytes, "")), nil 121 | case EventComment: 122 | return Comment(e.Bytes), nil 123 | case EventProcessingInstruction: 124 | return ProcInst(e.Bytes), nil 125 | case EventDocType: 126 | return Directive(e.Bytes), nil 127 | case EventEOF: 128 | return nil, io.EOF 129 | default: 130 | panic("unknown event type") 131 | } 132 | } 133 | 134 | // TokenE returns an xml.Token from an Event or an error if one is passed. 135 | // If err is not nil, it immediately returns nil and the provided error. 136 | func TokenE(e Event, err error) (xml.Token, error) { 137 | if err != nil { 138 | return nil, err 139 | } 140 | return Token(e) 141 | } 142 | 143 | // Skip advances the XML reader to the end of the current nested scope, returning an error if encountered. 144 | func Skip(r *Reader) error { 145 | var depth int64 146 | for { 147 | ev, err := r.Event() 148 | if err != nil { 149 | return err 150 | } 151 | switch ev.Type() { 152 | case EventStart: 153 | depth++ 154 | case EventEnd: 155 | if depth == 0 { 156 | return nil 157 | } 158 | depth-- 159 | default: 160 | } 161 | } 162 | } 163 | 164 | func xmlName(b []byte) xml.Name { 165 | if i := bytes.IndexByte(b, ':'); i >= 0 { 166 | return xml.Name{ 167 | Space: string(b[:i]), 168 | Local: string(b[i+1:]), 169 | } 170 | } else { 171 | return xml.Name{ 172 | Local: string(b), 173 | } 174 | } 175 | } 176 | 177 | func trim(b []byte, prefix, suffix string) []byte { 178 | return bytes.TrimSuffix(bytes.TrimPrefix(b, []byte(prefix)), []byte(suffix)) 179 | } 180 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2024, Nao Yonashiro 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | package gosax_test 28 | 29 | import ( 30 | "encoding/xml" 31 | "fmt" 32 | "log" 33 | "strings" 34 | "sync" 35 | 36 | "github.com/orisano/gosax" 37 | ) 38 | 39 | func ExampleReader_Event() { 40 | xmlData := `Value` 41 | reader := strings.NewReader(xmlData) 42 | 43 | r := gosax.NewReader(reader) 44 | for { 45 | e, err := r.Event() 46 | if err != nil { 47 | log.Fatal(err) 48 | } 49 | if e.Type() == gosax.EventEOF { 50 | break 51 | } 52 | fmt.Println(string(e.Bytes)) 53 | } 54 | // Output: 55 | // 56 | // 57 | // Value 58 | // 59 | // 60 | } 61 | 62 | func ExampleNewReaderBuf() { 63 | xmlData := `Value` 64 | reader := strings.NewReader(xmlData) 65 | 66 | var buf [4096]byte 67 | r := gosax.NewReaderBuf(reader, buf[:]) 68 | for { 69 | e, err := r.Event() 70 | if err != nil { 71 | log.Fatal(err) 72 | } 73 | if e.Type() == gosax.EventEOF { 74 | break 75 | } 76 | fmt.Println(string(e.Bytes)) 77 | } 78 | // Output: 79 | // 80 | // 81 | // Value 82 | // 83 | // 84 | } 85 | 86 | func ExampleReader_Reset() { 87 | pool := sync.Pool{ 88 | New: func() any { 89 | return gosax.NewReaderSize(nil, 16*1024) 90 | }, 91 | } 92 | func(p *sync.Pool) { 93 | xmlData := `Value` 94 | reader := strings.NewReader(xmlData) 95 | 96 | r := p.Get().(*gosax.Reader) 97 | defer p.Put(r) 98 | r.Reset(reader) 99 | for { 100 | e, err := r.Event() 101 | if err != nil { 102 | log.Fatal(err) 103 | } 104 | if e.Type() == gosax.EventEOF { 105 | break 106 | } 107 | fmt.Println(string(e.Bytes)) 108 | } 109 | }(&pool) 110 | // Output: 111 | // 112 | // 113 | // Value 114 | // 115 | // 116 | } 117 | 118 | func ExampleToken() { 119 | xmlData := `Value` 120 | reader := strings.NewReader(xmlData) 121 | 122 | r := gosax.NewReader(reader) 123 | for { 124 | e, err := r.Event() 125 | if err != nil { 126 | log.Fatal(err) 127 | } 128 | if e.Type() == gosax.EventEOF { 129 | break 130 | } 131 | t, err := gosax.Token(e) 132 | if err != nil { 133 | log.Fatal(err) 134 | } 135 | switch t := t.(type) { 136 | case xml.StartElement: 137 | fmt.Println("StartElement", t.Name.Local) 138 | for _, attr := range t.Attr { 139 | fmt.Println("Attr", attr.Name.Local, attr.Value) 140 | } 141 | case xml.EndElement: 142 | fmt.Println("EndElement", t.Name.Local) 143 | case xml.CharData: 144 | fmt.Println("CharData", string(t)) 145 | } 146 | } 147 | // Output: 148 | // StartElement root 149 | // StartElement element 150 | // Attr foo 151 | // Attr bar qux 152 | // CharData Value 153 | // EndElement element 154 | // EndElement root 155 | } 156 | 157 | func ExampleReader_EmitSelfClosingTag() { 158 | xmlData := `Value` 159 | reader := strings.NewReader(xmlData) 160 | 161 | r := gosax.NewReader(reader) 162 | r.EmitSelfClosingTag = true 163 | for { 164 | e, err := r.Event() 165 | if err != nil { 166 | log.Fatal(err) 167 | } 168 | if e.Type() == gosax.EventEOF { 169 | break 170 | } 171 | switch e.Type() { 172 | case gosax.EventStart: 173 | name, _ := gosax.Name(e.Bytes) 174 | fmt.Println("EventStart", string(name)) 175 | case gosax.EventEnd: 176 | name, _ := gosax.Name(e.Bytes) 177 | fmt.Println("EventEnd", string(name)) 178 | case gosax.EventText: 179 | fmt.Println("EventText", string(e.Bytes)) 180 | default: 181 | } 182 | } 183 | // Output: 184 | // EventStart root 185 | // EventStart element 186 | // EventText Value 187 | // EventEnd element 188 | // EventStart selfclosing 189 | // EventEnd selfclosing 190 | // EventEnd root 191 | } 192 | 193 | func ExampleUnescape() { 194 | xmlData := "Line1\r\nLine2\rLine3\nLine4\r\nLine5\r\n" 195 | b, _ := gosax.Unescape([]byte(xmlData)) 196 | fmt.Printf("%q", string(b)) 197 | // Output: 198 | // "Line1\nLine2\nLine3\nLine4\nLine5\n" 199 | } 200 | 201 | func ExampleStartElement() { 202 | xmlData := ` 205 | ` 206 | reader := strings.NewReader(xmlData) 207 | 208 | r := gosax.NewReader(reader) 209 | for { 210 | e, err := r.Event() 211 | if err != nil { 212 | log.Fatal(err) 213 | } 214 | if e.Type() == gosax.EventEOF { 215 | break 216 | } 217 | t, err := gosax.Token(e) 218 | if err != nil { 219 | log.Fatal(err) 220 | } 221 | switch t := t.(type) { 222 | case xml.StartElement: 223 | fmt.Println("StartElement", t.Name.Local) 224 | for _, attr := range t.Attr { 225 | fmt.Println("Attr", attr.Name.Local, attr.Value) 226 | } 227 | case xml.EndElement: 228 | fmt.Println("EndElement", t.Name.Local) 229 | case xml.CharData: 230 | continue 231 | } 232 | } 233 | // Output: 234 | // StartElement root 235 | // StartElement element 236 | // Attr foo bar 237 | // EndElement element 238 | // EndElement root 239 | } 240 | -------------------------------------------------------------------------------- /xmlb/xmlb.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2024, Nao Yonashiro 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | // Package xmlb provides a high-performance bridge between the gosax library and encoding/xml. 28 | // It is designed to facilitate the rewriting of code that uses encoding/xml, offering a more efficient 29 | // and memory-conscious approach to XML parsing. 30 | // 31 | // While gosax provides a low-level bridge with encoding/xml through various utility functions, 32 | // xmlb offers a higher-performance bridge intended for rewriting. 33 | package xmlb 34 | 35 | import ( 36 | "bytes" 37 | "encoding/xml" 38 | "errors" 39 | "io" 40 | 41 | "github.com/orisano/gosax" 42 | ) 43 | 44 | const ( 45 | StartElement = iota + 1 46 | EndElement 47 | CharData 48 | ProcInst 49 | Comment 50 | Directive 51 | ) 52 | 53 | type Decoder struct { 54 | r *gosax.Reader 55 | 56 | t *Token 57 | err error 58 | } 59 | 60 | func NewDecoder(r io.Reader, buf []byte) *Decoder { 61 | gr := gosax.NewReaderBuf(r, buf) 62 | gr.EmitSelfClosingTag = true 63 | return &Decoder{r: gr} 64 | } 65 | 66 | func (d *Decoder) Token() (Token, error) { 67 | if d.err != nil { 68 | return Token{}, d.err 69 | } 70 | if d.t != nil { 71 | t := *d.t 72 | d.t = nil 73 | return t, nil 74 | } 75 | ev, err := d.r.Event() 76 | if err == nil && ev.Type() == gosax.EventEOF { 77 | err = io.EOF 78 | } 79 | if err != nil { 80 | return Token{}, err 81 | } 82 | return Token(ev), nil 83 | } 84 | 85 | func (d *Decoder) Peek() (Token, error) { 86 | if d.err != nil { 87 | return Token{}, d.err 88 | } 89 | if d.t == nil { 90 | ev, err := d.r.Event() 91 | if err == nil && ev.Type() == gosax.EventEOF { 92 | d.err = io.EOF 93 | } else { 94 | d.err = err 95 | } 96 | if d.err != nil { 97 | return Token{}, d.err 98 | } 99 | t := Token(ev) 100 | d.t = &t 101 | } 102 | return *d.t, nil 103 | } 104 | 105 | func (d *Decoder) Text() (string, error) { 106 | t, err := d.Peek() 107 | if err != nil { 108 | return "", err 109 | } 110 | if t.Type() != CharData { 111 | return "", nil 112 | } 113 | cd, err := t.CharData() 114 | if err != nil { 115 | return "", err 116 | } 117 | return string(cd), nil 118 | } 119 | 120 | func (d *Decoder) Skip() error { 121 | var depth int64 122 | for { 123 | tok, err := d.Token() 124 | if err != nil { 125 | return err 126 | } 127 | switch tok.Type() { 128 | case StartElement: 129 | depth++ 130 | case EndElement: 131 | if depth == 0 { 132 | return nil 133 | } 134 | depth-- 135 | default: 136 | } 137 | } 138 | } 139 | 140 | type Token gosax.Event 141 | 142 | func (t Token) Type() uint8 { 143 | switch gosax.Event(t).Type() { 144 | case gosax.EventStart: 145 | return StartElement 146 | case gosax.EventEnd: 147 | return EndElement 148 | case gosax.EventText: 149 | return CharData 150 | case gosax.EventCData: 151 | return CharData 152 | case gosax.EventProcessingInstruction: 153 | return ProcInst 154 | case gosax.EventComment: 155 | return Comment 156 | case gosax.EventDocType: 157 | return Directive 158 | case gosax.EventEOF: 159 | return 0 160 | default: 161 | panic("unreachable") 162 | } 163 | } 164 | 165 | type NameBytes struct { 166 | b []byte 167 | p int 168 | } 169 | 170 | func (n NameBytes) Space() []byte { 171 | if n.p == 0 { 172 | return nil 173 | } 174 | return n.b[:n.p-1] 175 | } 176 | 177 | func (n NameBytes) Local() []byte { 178 | return n.b[n.p:] 179 | } 180 | 181 | var ErrNoAttributes = errors.New("no attributes") 182 | 183 | type AttributesBytes []byte 184 | 185 | func (a AttributesBytes) Get(key string) ([]byte, error) { 186 | b := []byte(a) 187 | for len(b) > 0 { 188 | attr, b2, err := gosax.NextAttribute(b) 189 | if err != nil { 190 | return nil, err 191 | } 192 | b = b2 193 | if string(attr.Key) != key { 194 | continue 195 | } 196 | v, err := gosax.Unescape(attr.Value[1 : len(attr.Value)-1]) 197 | if err != nil { 198 | return nil, err 199 | } 200 | return v, nil 201 | } 202 | return nil, ErrNoAttributes 203 | } 204 | 205 | type StartElementBytes struct { 206 | Name NameBytes 207 | Attrs AttributesBytes 208 | } 209 | 210 | func (t Token) Name() NameBytes { 211 | name, _ := gosax.Name(t.Bytes) 212 | p := bytes.IndexByte(name, ':') 213 | if p < 0 { 214 | return NameBytes{name, 0} 215 | } 216 | return NameBytes{name, p + 1} 217 | } 218 | 219 | func (t Token) StartElement() (xml.StartElement, error) { 220 | return gosax.StartElement(t.Bytes) 221 | } 222 | 223 | func (t Token) StartElementBytes() StartElementBytes { 224 | name, attrs := gosax.Name(t.Bytes) 225 | p := bytes.IndexByte(name, ':') 226 | if p < 0 { 227 | p = 0 228 | } else { 229 | p += 1 230 | } 231 | return StartElementBytes{NameBytes{name, p}, attrs} 232 | } 233 | 234 | func (t Token) EndElement() xml.EndElement { 235 | return gosax.EndElement(t.Bytes) 236 | } 237 | 238 | func (t Token) CharData() (xml.CharData, error) { 239 | switch gosax.Event(t).Type() { 240 | case gosax.EventText: 241 | return gosax.CharData(t.Bytes) 242 | case gosax.EventCData: 243 | return bytes.TrimSuffix(bytes.TrimPrefix(t.Bytes, []byte("")), nil 244 | default: 245 | panic("unreachable") 246 | } 247 | } 248 | 249 | func (t Token) ProcInst() xml.ProcInst { 250 | return gosax.ProcInst(t.Bytes) 251 | } 252 | 253 | func (t Token) Comment() xml.Comment { 254 | return gosax.Comment(t.Bytes) 255 | } 256 | 257 | func (t Token) Directive() xml.Directive { 258 | return gosax.Directive(t.Bytes) 259 | } 260 | -------------------------------------------------------------------------------- /gosax.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2024, Nao Yonashiro 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | // Package gosax provides a Simple API for XML (SAX) parser for Go. 28 | // It offers efficient, read-only XML parsing with streaming capabilities, 29 | // inspired by quick-xml and other high-performance parsing techniques. 30 | package gosax 31 | 32 | import ( 33 | "bytes" 34 | "encoding/binary" 35 | "fmt" 36 | "io" 37 | "strconv" 38 | "unicode/utf8" 39 | ) 40 | 41 | const ( 42 | eventUnknown = iota 43 | EventStart 44 | EventEnd 45 | EventText 46 | EventCData 47 | EventComment 48 | EventProcessingInstruction 49 | EventDocType 50 | EventEOF 51 | ) 52 | 53 | type Event struct { 54 | Bytes []byte 55 | value uint32 56 | } 57 | 58 | func (e Event) Type() uint8 { 59 | return uint8(e.value) 60 | } 61 | 62 | type Reader struct { 63 | reader byteReader 64 | state func(*Reader) (Event, error) 65 | 66 | EmitSelfClosingTag bool 67 | selfClosingEnd int 68 | } 69 | 70 | func NewReader(r io.Reader) *Reader { 71 | return NewReaderSize(r, 2*1024*1024) 72 | } 73 | 74 | func NewReaderSize(r io.Reader, bufSize int) *Reader { 75 | return NewReaderBuf(r, make([]byte, 0, bufSize)) 76 | } 77 | 78 | func NewReaderBuf(r io.Reader, buf []byte) *Reader { 79 | var xr Reader 80 | xr.reader.data = buf 81 | xr.Reset(r) 82 | return &xr 83 | } 84 | 85 | // Event returns the next Event from the XML stream. 86 | // It returns an Event and any error encountered. 87 | // 88 | // Note: The returned Event object is only valid until the next call to Event. 89 | // The underlying byte slice may be overwritten by subsequent calls. 90 | // If you need to retain the Event data, make a copy before the next Event call. 91 | func (r *Reader) Event() (Event, error) { 92 | return r.state(r) 93 | } 94 | 95 | func (r *Reader) Reset(reader io.Reader) { 96 | data := r.reader.data 97 | if data != nil { 98 | data = data[:0] 99 | } 100 | r.reader = byteReader{ 101 | data: data, 102 | r: reader, 103 | } 104 | r.state = (*Reader).stateInit 105 | r.EmitSelfClosingTag = false 106 | } 107 | 108 | func (r *Reader) stateInit() (Event, error) { 109 | // remove_utf8_bom 110 | return r.stateInsideText() 111 | } 112 | 113 | func (r *Reader) stateInsideText() (Event, error) { 114 | end, err := readText(&r.reader) 115 | if err == io.EOF { 116 | r.state = (*Reader).stateDone 117 | if end == 0 { 118 | return Event{ 119 | value: EventEOF, 120 | }, nil 121 | } else { 122 | w := r.reader.window() 123 | r.reader.offset += len(w) 124 | return Event{ 125 | Bytes: w, 126 | value: EventText, 127 | }, nil 128 | } 129 | } 130 | if err != nil { 131 | return Event{}, err 132 | } 133 | if end == 0 { 134 | return r.stateInsideMarkup() 135 | } else { 136 | r.state = (*Reader).stateInsideMarkup 137 | w := r.reader.window()[:end] 138 | r.reader.offset += len(w) 139 | return Event{ 140 | Bytes: w, 141 | value: EventText, 142 | }, nil 143 | } 144 | } 145 | 146 | var stateChangeMarker = [256]bool{ 147 | '"': true, 148 | '\'': true, 149 | '>': true, 150 | } 151 | 152 | func (r *Reader) stateInsideMarkup() (Event, error) { 153 | r.state = (*Reader).stateInsideText 154 | rr := &r.reader 155 | if rr.offset+1 >= len(rr.data) { 156 | if rr.extend() == 0 { 157 | return Event{}, rr.err 158 | } 159 | } 160 | switch w := rr.window(); w[1] { 161 | case '!': 162 | if len(w) < 3 { 163 | if rr.extend() == 0 { 164 | return Event{}, rr.err 165 | } 166 | w = rr.window() 167 | } 168 | switch w[2] { 169 | case '[': // CData 170 | offset := 3 171 | for { 172 | if i := bytes.Index(w[offset:], []byte("]]>")); i >= 0 { 173 | r.reader.offset += offset + i + 3 174 | return Event{ 175 | Bytes: w[:offset+i+3], 176 | value: EventCData, 177 | }, nil 178 | } 179 | offset = len(w) - 2 180 | if rr.extend() == 0 { 181 | return Event{}, rr.err 182 | } 183 | w = rr.window() 184 | } 185 | case '-': // Comment 186 | offset := 3 187 | for { 188 | if i := bytes.Index(w[offset:], []byte("-->")); i >= 0 { 189 | r.reader.offset += offset + i + 3 190 | return Event{ 191 | Bytes: w[:offset+i+3], 192 | value: EventComment, 193 | }, nil 194 | } 195 | offset = len(w) - 2 196 | if rr.extend() == 0 { 197 | return Event{}, rr.err 198 | } 199 | w = rr.window() 200 | } 201 | case 'D', 'd': // DocType 202 | offset := 2 203 | for { 204 | lv := 1 205 | for i, c := range w[offset:] { 206 | if c == '>' { 207 | lv-- 208 | if lv == 0 { 209 | r.reader.offset += offset + i + 1 210 | return Event{ 211 | Bytes: w[:offset+i+1], 212 | value: EventDocType, 213 | }, nil 214 | } 215 | } else if c == '<' { 216 | lv++ 217 | } 218 | } 219 | offset = len(w) 220 | if rr.extend() == 0 { 221 | return Event{}, rr.err 222 | } 223 | w = rr.window() 224 | } 225 | default: 226 | return Event{}, fmt.Errorf("unknown bang type: %c", w[1]) 227 | } 228 | case '/': // close tag 229 | offset := 2 230 | for { 231 | if i := bytes.IndexByte(w[offset:], '>'); i >= 0 { 232 | r.reader.offset += offset + i + 1 233 | return Event{ 234 | Bytes: w[:offset+i+1], 235 | value: EventEnd, 236 | }, nil 237 | } 238 | offset = len(w) 239 | if rr.extend() == 0 { 240 | return Event{}, rr.err 241 | } 242 | w = rr.window() 243 | } 244 | case '?': // processing instructions 245 | offset := 2 246 | for { 247 | if i := bytes.Index(w[offset:], []byte("?>")); i >= 0 { 248 | r.reader.offset += offset + i + 2 249 | return Event{ 250 | Bytes: w[:offset+i+2], 251 | value: EventProcessingInstruction, 252 | }, nil 253 | } 254 | offset = len(w) - 1 255 | if rr.extend() == 0 { 256 | return Event{}, rr.err 257 | } 258 | w = rr.window() 259 | } 260 | default: 261 | const ( 262 | splat uint64 = 0x0101010101010101 263 | v1 = '"' * splat 264 | v2 = '>' * splat 265 | v3 = '\'' * splat 266 | ) 267 | state := byte('>') 268 | offset := 1 269 | for { 270 | for offset < len(w) { 271 | if state == '>' { 272 | for ; offset+8 < len(w); offset += 8 { 273 | v := binary.LittleEndian.Uint64(w[offset : offset+8]) 274 | if hasZeroByte(v^v1) || hasZeroByte(v^v2) || hasZeroByte(v^v3) { 275 | break 276 | } 277 | } 278 | p := -1 279 | var ch byte 280 | for i, c := range w[offset:] { 281 | if stateChangeMarker[c] { 282 | p = i 283 | ch = c 284 | break 285 | } 286 | } 287 | if p >= 0 { 288 | if ch == '>' { 289 | if r.EmitSelfClosingTag && w[offset+p-1] == '/' { 290 | r.selfClosingEnd = offset + p 291 | r.state = (*Reader).stateSelfClosingTag 292 | } else { 293 | r.reader.offset += offset + p + 1 294 | } 295 | return Event{ 296 | Bytes: w[:offset+p+1], 297 | value: EventStart, 298 | }, nil 299 | } else { 300 | state = ch 301 | offset += p + 1 302 | } 303 | } else { 304 | break 305 | } 306 | } else { 307 | if i := bytes.IndexByte(w[offset:], state); i >= 0 { 308 | offset += i + 1 309 | state = '>' 310 | } else { 311 | break 312 | } 313 | } 314 | } 315 | offset = len(w) 316 | if rr.extend() == 0 { 317 | return Event{}, rr.err 318 | } 319 | w = rr.window() 320 | } 321 | } 322 | } 323 | 324 | func (r *Reader) stateSelfClosingTag() (Event, error) { 325 | r.state = (*Reader).stateInsideText 326 | w := r.reader.window() 327 | r.reader.offset += r.selfClosingEnd + 1 328 | return Event{ 329 | Bytes: w[:r.selfClosingEnd+1], 330 | value: EventEnd, 331 | }, nil 332 | } 333 | 334 | func (r *Reader) stateDone() (Event, error) { 335 | return Event{ 336 | value: EventEOF, 337 | }, nil 338 | } 339 | 340 | func hasZeroByte(x uint64) bool { 341 | const ( 342 | lo uint64 = 0x0101010101010101 343 | hi uint64 = 0x8080808080808080 344 | ) 345 | return (x-lo) & ^x & hi != 0 346 | } 347 | 348 | func readText(r *byteReader) (int, error) { 349 | offset := 0 350 | for { 351 | w := r.window() 352 | if i := bytes.IndexByte(w[offset:], '<'); i >= 0 { 353 | return offset + i, nil 354 | } 355 | offset = len(w) 356 | if r.extend() == 0 { 357 | return offset, r.err 358 | } 359 | } 360 | } 361 | 362 | // Name extracts the name from an XML tag. 363 | // It returns the name and the remaining bytes. 364 | func Name(b []byte) ([]byte, []byte) { 365 | if len(b) > 1 && b[0] == '<' { 366 | b = b[1:] 367 | } 368 | if len(b) > 1 && b[0] == '/' { 369 | b = b[1:] 370 | } 371 | if len(b) > 1 && b[len(b)-1] == '>' { 372 | b = b[:len(b)-1] 373 | } 374 | if len(b) > 1 && b[len(b)-1] == '/' { 375 | b = b[:len(b)-1] 376 | } 377 | for i, c := range b { 378 | if whitespace[c] { 379 | return b[:i], b[i+1:] 380 | } 381 | } 382 | return b, nil 383 | } 384 | 385 | type Attribute struct { 386 | Key []byte 387 | Value []byte 388 | } 389 | 390 | // NextAttribute extracts the next attribute from an XML tag. 391 | // It returns the Attribute and the remaining bytes. 392 | func NextAttribute(b []byte) (Attribute, []byte, error) { 393 | i := 0 394 | for ; i < len(b) && whitespace[b[i]]; i++ { 395 | } 396 | if i == len(b) { 397 | return Attribute{}, nil, nil 398 | } 399 | keyStart := i 400 | for ; i < len(b) && !whitespace[b[i]] && b[i] != '='; i++ { 401 | } 402 | if i == len(b) { 403 | return Attribute{Key: b[keyStart:]}, nil, nil 404 | } 405 | key := b[keyStart:i] 406 | for ; i < len(b) && whitespace[b[i]]; i++ { 407 | } 408 | if i == len(b) { 409 | return Attribute{Key: key}, nil, nil 410 | } 411 | if b[i] != '=' { 412 | return Attribute{Key: key}, b[i:], nil 413 | } 414 | i++ 415 | for ; i < len(b) && whitespace[b[i]]; i++ { 416 | } 417 | if i == len(b) { 418 | return Attribute{Key: key}, nil, fmt.Errorf("attribute value not found") 419 | } 420 | if b[i] == '"' { 421 | valueEnd := i + 1 + bytes.IndexByte(b[i+1:], '"') + 1 422 | value := b[i:valueEnd] 423 | return Attribute{Key: key, Value: value}, b[valueEnd:], nil 424 | } 425 | if b[i] == '\'' { 426 | valueEnd := i + 1 + bytes.IndexByte(b[i+1:], '\'') + 1 427 | value := b[i:valueEnd] 428 | return Attribute{Key: key, Value: value}, b[valueEnd:], nil 429 | } 430 | return Attribute{}, nil, fmt.Errorf("invalid attribute value: %c", b[i]) 431 | } 432 | 433 | var whitespace = [256]bool{ 434 | ' ': true, 435 | '\r': true, 436 | '\n': true, 437 | '\t': true, 438 | } 439 | 440 | // Unescape decodes XML entity references in a byte slice. 441 | // It returns the unescaped bytes and any error encountered. 442 | func Unescape(b []byte) ([]byte, error) { 443 | p := indexUnescape(b) 444 | if p < 0 { 445 | return b, nil 446 | } 447 | begin := 0 448 | cur := p 449 | for { 450 | if b[p] == '&' { 451 | var escaped []byte 452 | for i := 2; i < 13 && p+i < len(b); i++ { 453 | if b[p+i] == ';' { 454 | escaped = b[p+1 : p+i] 455 | break 456 | } 457 | } 458 | if len(escaped) <= 1 { 459 | return nil, fmt.Errorf("invalid escape sequence") 460 | } 461 | if cur != p && begin != p { 462 | cur += copy(b[cur:], b[begin:p]) 463 | } 464 | if escaped[0] == '#' { 465 | var x uint64 466 | var err error 467 | if escaped[1] == 'x' { 468 | x, err = strconv.ParseUint(string(escaped[2:]), 16, 32) 469 | } else { 470 | x, err = strconv.ParseUint(string(escaped[1:]), 10, 32) 471 | } 472 | if err != nil { 473 | return nil, fmt.Errorf("invalid char reference: %w", err) 474 | } 475 | cur += utf8.EncodeRune(b[cur:], rune(x)) 476 | } else { 477 | switch string(escaped) { 478 | case "lt": 479 | b[cur] = '<' 480 | case "gt": 481 | b[cur] = '>' 482 | case "amp": 483 | b[cur] = '&' 484 | case "apos": 485 | b[cur] = '\'' 486 | case "quot": 487 | b[cur] = '"' 488 | default: 489 | return nil, fmt.Errorf("invalid escape sequence: %q", string(escaped)) 490 | } 491 | cur++ 492 | } 493 | begin = p + len(escaped) + 2 494 | } else { 495 | if cur != p && begin != p { 496 | cur += copy(b[cur:], b[begin:p]) 497 | } 498 | b[cur] = '\n' 499 | cur++ 500 | begin = p + 1 501 | if p+1 < len(b) && b[p+1] == '\n' { 502 | begin += 1 503 | } 504 | } 505 | if i := indexUnescape(b[begin:]); i >= 0 { 506 | p = begin + i 507 | } else { 508 | break 509 | } 510 | } 511 | if len(b) != begin { 512 | cur += copy(b[cur:], b[begin:]) 513 | } 514 | return b[:cur], nil 515 | } 516 | 517 | func indexUnescape(s []byte) int { 518 | const ( 519 | splat uint64 = 0x0101010101010101 520 | v1 = '&' * splat 521 | v2 = '\r' * splat 522 | ) 523 | offset := 0 524 | for len(s) >= 8 { 525 | v := binary.LittleEndian.Uint64(s[:8]) 526 | if hasZeroByte(v^v1) || hasZeroByte(v^v2) { 527 | break 528 | } 529 | s = s[8:] 530 | offset += 8 531 | } 532 | for i, c := range s { 533 | if c == '&' || c == '\r' { 534 | return offset + i 535 | } 536 | } 537 | return -1 538 | } 539 | --------------------------------------------------------------------------------