├── go.mod
├── .github
│   └── workflows
│       └── main.yaml
├── _samples
│   └── gosax-count.go
├── xmlb
│   ├── example_test.go
│   └── xmlb.go
├── bench_test.go
├── LICENSE
├── reader.go
├── README.md
├── compat.go
├── example_test.go
└── gosax.go
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/orisano/gosax
2 |
3 | go 1.22.2
4 |
--------------------------------------------------------------------------------
/.github/workflows/main.yaml:
--------------------------------------------------------------------------------
1 | name: main
2 | on: push
3 | jobs:
4 | build:
5 | runs-on: ubuntu-slim
6 | strategy:
7 | matrix:
8 | go: [ '1.23', '1.24', '1.25' ]
9 | name: go ${{ matrix.go }}
10 | steps:
11 | - uses: actions/checkout@v4
12 | - uses: actions/setup-go@v5
13 | with:
14 | go-version: ${{ matrix.go }}
15 | - run: go test -race -v ./...
16 |
--------------------------------------------------------------------------------
/_samples/gosax-count.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "log"
7 | "os"
8 |
9 | "github.com/orisano/gosax"
10 | )
11 |
12 | func main() {
13 | f, err := os.Open(os.Args[1])
14 | if err != nil {
15 | log.Fatal(err)
16 | }
17 | defer f.Close()
18 |
19 | r := gosax.NewReader(f)
20 | count := 0
21 | inLocation := false
22 | for {
23 | e, err := r.Event()
24 | if err != nil {
25 | log.Fatal(err)
26 | }
27 | if e.Type() == gosax.EventEOF {
28 | break
29 | }
30 | switch e.Type() {
31 | case gosax.EventStart:
32 | name, _ := gosax.Name(e.Bytes)
33 | if string(name) == "location" {
34 | inLocation = true
35 | } else {
36 | inLocation = false
37 | }
38 | case gosax.EventEnd:
39 | inLocation = false
40 | case gosax.EventText:
41 | if inLocation {
42 | if bytes.Contains(e.Bytes, []byte("Africa")) {
43 | count++
44 | }
45 | }
46 | default:
47 | }
48 | }
49 | fmt.Println("counter =", count)
50 | }
51 |
--------------------------------------------------------------------------------
/xmlb/example_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2024, Nao Yonashiro
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | */
26 |
27 | package xmlb_test
28 |
29 | import (
30 | "fmt"
31 | "io"
32 | "strings"
33 |
34 | "github.com/orisano/gosax/xmlb"
35 | )
36 |
37 | func Example() {
38 | r := strings.NewReader(`Value`)
39 | d := xmlb.NewDecoder(r, make([]byte, 64*1024))
40 | for {
41 | tok, err := d.Token()
42 | if err == io.EOF {
43 | break
44 | }
45 | if err != nil {
46 | break
47 | }
48 | switch tok.Type() {
49 | case xmlb.StartElement:
50 | t, _ := tok.StartElement()
51 | fmt.Println("StartElement", t.Name.Local)
52 | case xmlb.CharData:
53 | t, _ := tok.CharData()
54 | fmt.Println("CharData", string(t))
55 | case xmlb.EndElement:
56 | fmt.Println("EndElement", string(tok.Name().Local()))
57 | }
58 | }
59 | // Output:
60 | // StartElement root
61 | // StartElement element
62 | // CharData Value
63 | // EndElement element
64 | // EndElement root
65 | }
66 |
--------------------------------------------------------------------------------
/bench_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2024, Nao Yonashiro
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | */
26 |
27 | package gosax_test
28 |
29 | import (
30 | "bytes"
31 | "os"
32 | "testing"
33 |
34 | "github.com/orisano/gosax"
35 | )
36 |
37 | func BenchmarkReader_Event(b *testing.B) {
38 | b.ReportAllocs()
39 | for i := 0; i < b.N; i++ {
40 | if err := countAfrica(b); err != nil {
41 | b.Fatal(err)
42 | }
43 | }
44 | }
45 |
46 | func countAfrica(b *testing.B) error {
47 | f, err := os.Open("testdata/out.xml")
48 | if err != nil {
49 | return err
50 | }
51 | defer f.Close()
52 | if stat, err := f.Stat(); err == nil {
53 | b.SetBytes(stat.Size())
54 | }
55 |
56 | r := gosax.NewReader(f)
57 | count := 0
58 | inLocation := false
59 | for {
60 | e, err := r.Event()
61 | if err != nil {
62 | return err
63 | }
64 | if e.Type() == gosax.EventEOF {
65 | break
66 | }
67 | switch e.Type() {
68 | case gosax.EventStart:
69 | name, _ := gosax.Name(e.Bytes)
70 | if string(name) == "location" {
71 | inLocation = true
72 | } else {
73 | inLocation = false
74 | }
75 | case gosax.EventEnd:
76 | inLocation = false
77 | case gosax.EventText:
78 | if inLocation {
79 | if bytes.Contains(e.Bytes, []byte("Africa")) {
80 | count++
81 | }
82 | }
83 | default:
84 | }
85 | }
86 | return nil
87 | }
88 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024, Nao Yonashiro
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | Additional Notices:
26 |
27 | This project includes a file licensed under the BSD 2-Clause License:
28 |
29 | reader.go
30 | ---------------------
31 |
32 | Copyright (c) 2020, Dave Cheney
33 | All rights reserved.
34 |
35 | Redistribution and use in source and binary forms, with or without
36 | modification, are permitted provided that the following conditions are met:
37 |
38 | * Redistributions of source code must retain the above copyright notice, this
39 | list of conditions and the following disclaimer.
40 |
41 | * Redistributions in binary form must reproduce the above copyright notice,
42 | this list of conditions and the following disclaimer in the documentation
43 | and/or other materials provided with the distribution.
44 |
45 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
46 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
48 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
49 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
51 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
52 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
53 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
54 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55 |
--------------------------------------------------------------------------------
/reader.go:
--------------------------------------------------------------------------------
1 | /*
2 | This file is based on the source code available at https://github.com/pkg/json under BSD-2-Clause License
3 |
4 | Copyright (c) 2020, Dave Cheney
5 | All rights reserved.
6 |
7 | Redistribution and use in source and binary forms, with or without
8 | modification, are permitted provided that the following conditions are met:
9 |
10 | * Redistributions of source code must retain the above copyright notice, this
11 | list of conditions and the following disclaimer.
12 |
13 | * Redistributions in binary form must reproduce the above copyright notice,
14 | this list of conditions and the following disclaimer in the documentation
15 | and/or other materials provided with the distribution.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | */
28 |
29 | package gosax
30 |
31 | import "io"
32 |
// byteReader is a sliding window over the bytes produced by an io.Reader.
// The live window is data[offset:]; bytes before offset have been released
// and may be overwritten or dropped by the next call to extend.
type byteReader struct {
	data   []byte    // backing buffer; data[offset:] is the current window
	offset int       // index in data of the first unreleased byte
	r      io.Reader // underlying source
	err    error     // result of the latest Read; once non-nil, extend does nothing
}

// Tuning constants for byteReader.extend.
const (
	// newBufferSize is the smallest buffer grow will allocate.
	newBufferSize = 4096
	// minReadSize is the least free space extend wants before it reads.
	minReadSize = newBufferSize >> 2
)

// window returns the current window.
// The returned slice is invalidated by calls to release or extend.
func (b *byteReader) window() []byte {
	return b.data[b.offset:]
}

// release discards n bytes from the front of the window.
func (b *byteReader) release(n int) {
	b.offset += n
}

// extend appends data from the underlying reader to the window and returns
// the number of bytes added. It returns 0 without reading once a previous
// Read has reported an error.
func (b *byteReader) extend() int {
	if b.err != nil {
		return 0
	}

	live := len(b.data) - b.offset
	if live == 0 {
		// Nothing is pending, so the whole buffer can be reused from the start.
		b.data, b.offset = b.data[:0], 0
	}

	switch capacity := cap(b.data); {
	case capacity-len(b.data) >= minReadSize:
		// Enough room already exists between len and cap.
	case capacity-live >= minReadSize:
		// Enough room appears once the live bytes are moved to the front.
		b.compact()
	default:
		// The buffer is too small either way; switch to a bigger one.
		b.grow()
	}

	// compact and grow reset offset to 0, so the live bytes always end here.
	end := b.offset + live
	n, err := b.r.Read(b.data[end:cap(b.data)])
	// Shrink the length to the live bytes plus what was just read.
	b.data = b.data[:end+n]
	b.err = err
	return n
}

// grow replaces the buffer with one of twice the capacity (at least
// newBufferSize) and moves the live bytes to its front.
func (b *byteReader) grow() {
	size := max(2*cap(b.data), newBufferSize)
	next := make([]byte, size)
	copy(next, b.window())
	b.data, b.offset = next, 0
}

// compact moves the live bytes to the front of the existing buffer.
func (b *byteReader) compact() {
	copy(b.data, b.window())
	b.offset = 0
}
99 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gosax
2 |
3 | [![Go Reference](https://pkg.go.dev/badge/github.com/orisano/gosax.svg)](https://pkg.go.dev/github.com/orisano/gosax)
4 |
5 | `gosax` is a Go library for XML SAX (Simple API for XML) parsing, supporting read-only functionality. This library is
6 | designed for efficient and memory-conscious XML parsing, drawing inspiration from various sources to provide a
7 | performant parser.
8 |
9 | ## Features
10 |
11 | - **Read-only SAX parsing**: Stream and process XML documents without loading the entire document into memory.
12 | - **Efficient parsing**: Utilizes techniques inspired by `quick-xml` and `pkg/json` for high performance.
13 | - **SWAR (SIMD Within A Register)**: Optimizations for fast text processing, inspired by `memchr`.
14 | - **Compatibility with encoding/xml**: Includes utility functions to bridge `gosax` types with `encoding/xml` types, facilitating easy integration with existing code that uses the standard library.
15 |
16 | ## Benchmark
17 | ```
18 | goos: darwin
19 | goarch: arm64
20 | pkg: github.com/orisano/gosax
21 | BenchmarkReader_Event-12 5 211845800 ns/op 1103.30 MB/s 2097606 B/op 6 allocs/op
22 | ```
23 |
24 | ## Installation
25 |
26 | To install `gosax`, use `go get`:
27 |
28 | ```bash
29 | go get github.com/orisano/gosax
30 | ```
31 |
32 | ## Usage
33 |
34 | Here is a basic example of how to use `gosax` to parse an XML document:
35 |
36 | ```go
37 | package main
38 |
39 | import (
40 | "fmt"
41 | "log"
42 | "strings"
43 |
44 | "github.com/orisano/gosax"
45 | )
46 |
47 | func main() {
48 | xmlData := `<root><element>Value</element></root>`
49 | reader := strings.NewReader(xmlData)
50 |
51 | r := gosax.NewReader(reader)
52 | for {
53 | e, err := r.Event()
54 | if err != nil {
55 | log.Fatal(err)
56 | }
57 | if e.Type() == gosax.EventEOF {
58 | break
59 | }
60 | fmt.Println(string(e.Bytes))
61 | }
62 | // Output:
63 | // <root>
64 | // <element>
65 | // Value
66 | // </element>
67 | // </root>
68 | }
69 |
70 | ```
71 |
72 | ### Bridging with encoding/xml
73 |
74 | **Important Note for encoding/xml Users:**
75 | > When migrating from `encoding/xml` to `gosax`, note that self-closing tags are handled differently. To mimic `encoding/xml` behavior, set `gosax.Reader.EmitSelfClosingTag` to `true`. This ensures self-closing tags are recognized and processed correctly.
76 |
77 | #### Using TokenE
78 | If you are used to `encoding/xml`'s `Token`, start with `gosax.TokenE`.
79 | **Note:** Using `gosax.TokenE` and `gosax.Token` involves memory allocation due to interfaces.
80 |
81 | **Before:**
82 | ```go
83 | var dec *xml.Decoder
84 | for {
85 | tok, err := dec.Token()
86 | if err == io.EOF {
87 | break
88 | }
89 | // ...
90 | }
91 | ```
92 |
93 | **After:**
94 | ```go
95 | var dec *gosax.Reader
96 | for {
97 | tok, err := gosax.TokenE(dec.Event())
98 | if err == io.EOF {
99 | break
100 | }
101 | // ...
102 | }
103 | ```
104 |
105 | #### Utilizing xmlb
106 | `xmlb` is an extension for `gosax` to simplify rewriting code from `encoding/xml`. It provides a higher-performance bridge for XML parsing and processing.
107 |
108 | **Before:**
109 | ```go
110 | var dec *xml.Decoder
111 | for {
112 | tok, err := dec.Token()
113 | if err == io.EOF {
114 | break
115 | }
116 | switch t := tok.(type) {
117 | case xml.StartElement:
118 | // ...
119 | case xml.CharData:
120 | // ...
121 | case xml.EndElement:
122 | // ...
123 | }
124 | }
125 | ```
126 |
127 | **After:**
128 | ```go
129 | var dec *xmlb.Decoder
130 | for {
131 | tok, err := dec.Token()
132 | if err == io.EOF {
133 | break
134 | }
135 | switch tok.Type() {
136 | case xmlb.StartElement:
137 | t, _ := tok.StartElement()
138 | // ...
139 | case xmlb.CharData:
140 | t, _ := tok.CharData()
141 | // ...
142 | case xmlb.EndElement:
143 | t := tok.EndElement()
144 | // ...
145 | }
146 | }
147 | ```
148 |
149 | ## License
150 |
151 | This library is licensed under the terms specified in the LICENSE file.
152 |
153 | ## Acknowledgements
154 |
155 | `gosax` is inspired by the following projects and resources:
156 |
157 | - [Dave Cheney's GopherCon SG 2023 Talk](https://dave.cheney.net/paste/gophercon-sg-2023.html)
158 | - [quick-xml](https://github.com/tafia/quick-xml)
159 | - [memchr](https://github.com/BurntSushi/memchr) (SWAR part)
160 |
161 | ## Contributing
162 |
163 | Contributions are welcome! Please fork the repository and submit pull requests.
164 |
165 | ## Contact
166 |
167 | For any questions or feedback, feel free to open an issue on the GitHub repository.
168 |
--------------------------------------------------------------------------------
/compat.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2024, Nao Yonashiro
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | */
26 |
27 | // This file contains utility functions to bridge gosax with encoding/xml.
28 | // These functions provide convenient ways to convert gosax types to encoding/xml types,
29 | // facilitating interoperability between the two packages.
30 |
31 | package gosax
32 |
33 | import (
34 | "bytes"
35 | "encoding/xml"
36 | "errors"
37 | "io"
38 | )
39 |
40 | var errSyntaxError = errors.New("syntax error")
41 |
42 | // StartElement converts a byte slice to an xml.StartElement.
43 | func StartElement(b []byte) (xml.StartElement, error) {
44 | name, b := Name(b)
45 | e := xml.StartElement{
46 | Name: xmlName(name),
47 | }
48 | for len(b) > 0 {
49 | var attr Attribute
50 | var err error
51 | attr, b, err = NextAttribute(b)
52 | if err != nil {
53 | return xml.StartElement{}, err
54 | }
55 | if len(attr.Key) == 0 {
56 | break
57 | }
58 | if len(attr.Value) == 0 {
59 | return e, errSyntaxError
60 | }
61 | value, err := Unescape(attr.Value[1 : len(attr.Value)-1])
62 | if err != nil {
63 | return xml.StartElement{}, err
64 | }
65 | e.Attr = append(e.Attr, xml.Attr{
66 | Name: xmlName(attr.Key),
67 | Value: string(value),
68 | })
69 | }
70 | return e, nil
71 | }
72 |
73 | // EndElement converts a byte slice to an xml.EndElement.
74 | func EndElement(b []byte) xml.EndElement {
75 | name, _ := Name(b)
76 | return xml.EndElement{
77 | Name: xmlName(name),
78 | }
79 | }
80 |
81 | // CharData converts a byte slice to xml.CharData.
82 | func CharData(b []byte) (xml.CharData, error) {
83 | return Unescape(b)
84 | }
85 |
86 | // Comment converts a byte slice to an xml.Comment.
87 | func Comment(b []byte) xml.Comment {
88 | return trim(b, "")
89 | }
90 |
91 | // ProcInst converts a byte slice to an xml.ProcInst.
92 | func ProcInst(b []byte) xml.ProcInst {
93 | name, b := Name(b)
94 | return xml.ProcInst{
95 | Target: string(name[1:]),
96 | Inst: b[:len(b)-1],
97 | }
98 | }
99 |
100 | // Directive converts a byte slice to an xml.Directive.
101 | func Directive(b []byte) xml.Directive {
102 | return trim(b, "")
103 | }
104 |
105 | // Token converts an Event to an xml.Token.
106 | // This function is provided for convenience, but it may allocate memory.
107 | //
108 | // Note: For performance-critical applications, it's recommended to use
109 | // the direct conversion functions (StartElement, EndElement, CharData, etc.)
110 | // instead of Token, as they allow better control over memory allocations.
111 | func Token(e Event) (xml.Token, error) {
112 | switch e.Type() {
113 | case EventStart:
114 | return StartElement(e.Bytes)
115 | case EventEnd:
116 | return EndElement(e.Bytes), nil
117 | case EventText:
118 | return CharData(e.Bytes)
119 | case EventCData:
120 | return xml.CharData(trim(e.Bytes, "")), nil
121 | case EventComment:
122 | return Comment(e.Bytes), nil
123 | case EventProcessingInstruction:
124 | return ProcInst(e.Bytes), nil
125 | case EventDocType:
126 | return Directive(e.Bytes), nil
127 | case EventEOF:
128 | return nil, io.EOF
129 | default:
130 | panic("unknown event type")
131 | }
132 | }
133 |
134 | // TokenE returns an xml.Token from an Event or an error if one is passed.
135 | // If err is not nil, it immediately returns nil and the provided error.
136 | func TokenE(e Event, err error) (xml.Token, error) {
137 | if err != nil {
138 | return nil, err
139 | }
140 | return Token(e)
141 | }
142 |
143 | // Skip advances the XML reader to the end of the current nested scope, returning an error if encountered.
144 | func Skip(r *Reader) error {
145 | var depth int64
146 | for {
147 | ev, err := r.Event()
148 | if err != nil {
149 | return err
150 | }
151 | switch ev.Type() {
152 | case EventStart:
153 | depth++
154 | case EventEnd:
155 | if depth == 0 {
156 | return nil
157 | }
158 | depth--
159 | default:
160 | }
161 | }
162 | }
163 |
// xmlName splits a raw XML name at its first ':' into an xml.Name.
// The part before the colon becomes Space and the part after it Local;
// a name without a colon is returned as Local with an empty Space.
func xmlName(b []byte) xml.Name {
	sep := bytes.IndexByte(b, ':')
	if sep < 0 {
		return xml.Name{Local: string(b)}
	}
	return xml.Name{
		Space: string(b[:sep]),
		Local: string(b[sep+1:]),
	}
}
176 |
// trim returns b without the leading prefix and then without the trailing
// suffix. Either one is left in place when b does not start or end with it.
func trim(b []byte, prefix, suffix string) []byte {
	b = bytes.TrimPrefix(b, []byte(prefix))
	return bytes.TrimSuffix(b, []byte(suffix))
}
180 |
--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2024, Nao Yonashiro
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | */
26 |
27 | package gosax_test
28 |
29 | import (
30 | "encoding/xml"
31 | "fmt"
32 | "log"
33 | "strings"
34 | "sync"
35 |
36 | "github.com/orisano/gosax"
37 | )
38 |
39 | func ExampleReader_Event() {
40 | xmlData := `Value`
41 | reader := strings.NewReader(xmlData)
42 |
43 | r := gosax.NewReader(reader)
44 | for {
45 | e, err := r.Event()
46 | if err != nil {
47 | log.Fatal(err)
48 | }
49 | if e.Type() == gosax.EventEOF {
50 | break
51 | }
52 | fmt.Println(string(e.Bytes))
53 | }
54 | // Output:
55 | //
56 | //
57 | // Value
58 | //
59 | //
60 | }
61 |
62 | func ExampleNewReaderBuf() {
63 | xmlData := `Value`
64 | reader := strings.NewReader(xmlData)
65 |
66 | var buf [4096]byte
67 | r := gosax.NewReaderBuf(reader, buf[:])
68 | for {
69 | e, err := r.Event()
70 | if err != nil {
71 | log.Fatal(err)
72 | }
73 | if e.Type() == gosax.EventEOF {
74 | break
75 | }
76 | fmt.Println(string(e.Bytes))
77 | }
78 | // Output:
79 | //
80 | //
81 | // Value
82 | //
83 | //
84 | }
85 |
86 | func ExampleReader_Reset() {
87 | pool := sync.Pool{
88 | New: func() any {
89 | return gosax.NewReaderSize(nil, 16*1024)
90 | },
91 | }
92 | func(p *sync.Pool) {
93 | xmlData := `Value`
94 | reader := strings.NewReader(xmlData)
95 |
96 | r := p.Get().(*gosax.Reader)
97 | defer p.Put(r)
98 | r.Reset(reader)
99 | for {
100 | e, err := r.Event()
101 | if err != nil {
102 | log.Fatal(err)
103 | }
104 | if e.Type() == gosax.EventEOF {
105 | break
106 | }
107 | fmt.Println(string(e.Bytes))
108 | }
109 | }(&pool)
110 | // Output:
111 | //
112 | //
113 | // Value
114 | //
115 | //
116 | }
117 |
118 | func ExampleToken() {
119 | xmlData := `Value`
120 | reader := strings.NewReader(xmlData)
121 |
122 | r := gosax.NewReader(reader)
123 | for {
124 | e, err := r.Event()
125 | if err != nil {
126 | log.Fatal(err)
127 | }
128 | if e.Type() == gosax.EventEOF {
129 | break
130 | }
131 | t, err := gosax.Token(e)
132 | if err != nil {
133 | log.Fatal(err)
134 | }
135 | switch t := t.(type) {
136 | case xml.StartElement:
137 | fmt.Println("StartElement", t.Name.Local)
138 | for _, attr := range t.Attr {
139 | fmt.Println("Attr", attr.Name.Local, attr.Value)
140 | }
141 | case xml.EndElement:
142 | fmt.Println("EndElement", t.Name.Local)
143 | case xml.CharData:
144 | fmt.Println("CharData", string(t))
145 | }
146 | }
147 | // Output:
148 | // StartElement root
149 | // StartElement element
150 | // Attr foo
151 | // Attr bar qux
152 | // CharData Value
153 | // EndElement element
154 | // EndElement root
155 | }
156 |
157 | func ExampleReader_EmitSelfClosingTag() {
158 | xmlData := `Value`
159 | reader := strings.NewReader(xmlData)
160 |
161 | r := gosax.NewReader(reader)
162 | r.EmitSelfClosingTag = true
163 | for {
164 | e, err := r.Event()
165 | if err != nil {
166 | log.Fatal(err)
167 | }
168 | if e.Type() == gosax.EventEOF {
169 | break
170 | }
171 | switch e.Type() {
172 | case gosax.EventStart:
173 | name, _ := gosax.Name(e.Bytes)
174 | fmt.Println("EventStart", string(name))
175 | case gosax.EventEnd:
176 | name, _ := gosax.Name(e.Bytes)
177 | fmt.Println("EventEnd", string(name))
178 | case gosax.EventText:
179 | fmt.Println("EventText", string(e.Bytes))
180 | default:
181 | }
182 | }
183 | // Output:
184 | // EventStart root
185 | // EventStart element
186 | // EventText Value
187 | // EventEnd element
188 | // EventStart selfclosing
189 | // EventEnd selfclosing
190 | // EventEnd root
191 | }
192 |
193 | func ExampleUnescape() {
194 | xmlData := "Line1\r\nLine2\rLine3\nLine4\r\nLine5\r\n"
195 | b, _ := gosax.Unescape([]byte(xmlData))
196 | fmt.Printf("%q", string(b))
197 | // Output:
198 | // "Line1\nLine2\nLine3\nLine4\nLine5\n"
199 | }
200 |
201 | func ExampleStartElement() {
202 | xmlData := `
205 | `
206 | reader := strings.NewReader(xmlData)
207 |
208 | r := gosax.NewReader(reader)
209 | for {
210 | e, err := r.Event()
211 | if err != nil {
212 | log.Fatal(err)
213 | }
214 | if e.Type() == gosax.EventEOF {
215 | break
216 | }
217 | t, err := gosax.Token(e)
218 | if err != nil {
219 | log.Fatal(err)
220 | }
221 | switch t := t.(type) {
222 | case xml.StartElement:
223 | fmt.Println("StartElement", t.Name.Local)
224 | for _, attr := range t.Attr {
225 | fmt.Println("Attr", attr.Name.Local, attr.Value)
226 | }
227 | case xml.EndElement:
228 | fmt.Println("EndElement", t.Name.Local)
229 | case xml.CharData:
230 | continue
231 | }
232 | }
233 | // Output:
234 | // StartElement root
235 | // StartElement element
236 | // Attr foo bar
237 | // EndElement element
238 | // EndElement root
239 | }
240 |
--------------------------------------------------------------------------------
/xmlb/xmlb.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2024, Nao Yonashiro
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | */
26 |
27 | // Package xmlb provides a high-performance bridge between the gosax library and encoding/xml.
28 | // It is designed to facilitate the rewriting of code that uses encoding/xml, offering a more efficient
29 | // and memory-conscious approach to XML parsing.
30 | //
31 | // While gosax provides a low-level bridge with encoding/xml through various utility functions,
32 | // xmlb offers a higher-performance bridge intended for rewriting.
33 | package xmlb
34 |
35 | import (
36 | "bytes"
37 | "encoding/xml"
38 | "errors"
39 | "io"
40 |
41 | "github.com/orisano/gosax"
42 | )
43 |
// Token kinds reported by Token.Type. The values start at 1; Token.Type
// returns 0 for an end-of-input event.
const (
	StartElement = iota + 1 // start tag
	EndElement              // end tag
	CharData                // character data; Token.Type reports this for both text and CDATA events
	ProcInst                // processing instruction
	Comment                 // comment
	Directive               // directive; Token.Type reports this for DOCTYPE events
)
52 |
// Decoder reads XML tokens from a gosax.Reader and offers one token of
// lookahead through Peek.
type Decoder struct {
	r *gosax.Reader // source of events

	t   *Token // token buffered by Peek and not yet handed out by Token; nil when empty
	err error  // error recorded by Peek (io.EOF included); once set, Token and Peek keep returning it
}
59 |
// NewDecoder returns a Decoder that reads from r. buf is handed to
// gosax.NewReaderBuf as the reader's buffer, and the underlying reader is
// configured with EmitSelfClosingTag enabled.
func NewDecoder(r io.Reader, buf []byte) *Decoder {
	gr := gosax.NewReaderBuf(r, buf)
	// Set after construction: NewReaderBuf calls Reset, which clears this flag.
	gr.EmitSelfClosingTag = true
	return &Decoder{r: gr}
}
65 |
66 | func (d *Decoder) Token() (Token, error) {
67 | if d.err != nil {
68 | return Token{}, d.err
69 | }
70 | if d.t != nil {
71 | t := *d.t
72 | d.t = nil
73 | return t, nil
74 | }
75 | ev, err := d.r.Event()
76 | if err == nil && ev.Type() == gosax.EventEOF {
77 | err = io.EOF
78 | }
79 | if err != nil {
80 | return Token{}, err
81 | }
82 | return Token(ev), nil
83 | }
84 |
85 | func (d *Decoder) Peek() (Token, error) {
86 | if d.err != nil {
87 | return Token{}, d.err
88 | }
89 | if d.t == nil {
90 | ev, err := d.r.Event()
91 | if err == nil && ev.Type() == gosax.EventEOF {
92 | d.err = io.EOF
93 | } else {
94 | d.err = err
95 | }
96 | if d.err != nil {
97 | return Token{}, d.err
98 | }
99 | t := Token(ev)
100 | d.t = &t
101 | }
102 | return *d.t, nil
103 | }
104 |
105 | func (d *Decoder) Text() (string, error) {
106 | t, err := d.Peek()
107 | if err != nil {
108 | return "", err
109 | }
110 | if t.Type() != CharData {
111 | return "", nil
112 | }
113 | cd, err := t.CharData()
114 | if err != nil {
115 | return "", err
116 | }
117 | return string(cd), nil
118 | }
119 |
120 | func (d *Decoder) Skip() error {
121 | var depth int64
122 | for {
123 | tok, err := d.Token()
124 | if err != nil {
125 | return err
126 | }
127 | switch tok.Type() {
128 | case StartElement:
129 | depth++
130 | case EndElement:
131 | if depth == 0 {
132 | return nil
133 | }
134 | depth--
135 | default:
136 | }
137 | }
138 | }
139 |
// Token is a single XML token: a gosax.Event exposed through the
// encoding/xml-style accessors defined in this package.
type Token gosax.Event
141 |
142 | func (t Token) Type() uint8 {
143 | switch gosax.Event(t).Type() {
144 | case gosax.EventStart:
145 | return StartElement
146 | case gosax.EventEnd:
147 | return EndElement
148 | case gosax.EventText:
149 | return CharData
150 | case gosax.EventCData:
151 | return CharData
152 | case gosax.EventProcessingInstruction:
153 | return ProcInst
154 | case gosax.EventComment:
155 | return Comment
156 | case gosax.EventDocType:
157 | return Directive
158 | case gosax.EventEOF:
159 | return 0
160 | default:
161 | panic("unreachable")
162 | }
163 | }
164 |
// NameBytes is an XML name kept as raw bytes and split at its first ':'.
type NameBytes struct {
	b []byte // the whole name, prefix included
	p int    // index in b where the local part begins; 0 when the name has no prefix
}

// Space returns the part of the name before the ':' separator, or nil when
// the name has no prefix.
func (n NameBytes) Space() []byte {
	if n.p != 0 {
		// p points just past the ':'; drop the separator itself.
		return n.b[:n.p-1]
	}
	return nil
}

// Local returns the part of the name after the prefix, or the whole name when
// there is no prefix.
func (n NameBytes) Local() []byte {
	local := n.b[n.p:]
	return local
}
180 |
// ErrNoAttributes is returned by AttributesBytes.Get when no attribute with
// the requested key is found.
var ErrNoAttributes = errors.New("no attributes")

// AttributesBytes is the raw attribute section of a start tag, i.e. the bytes
// that follow the element name.
type AttributesBytes []byte
184 |
185 | func (a AttributesBytes) Get(key string) ([]byte, error) {
186 | b := []byte(a)
187 | for len(b) > 0 {
188 | attr, b2, err := gosax.NextAttribute(b)
189 | if err != nil {
190 | return nil, err
191 | }
192 | b = b2
193 | if string(attr.Key) != key {
194 | continue
195 | }
196 | v, err := gosax.Unescape(attr.Value[1 : len(attr.Value)-1])
197 | if err != nil {
198 | return nil, err
199 | }
200 | return v, nil
201 | }
202 | return nil, ErrNoAttributes
203 | }
204 |
// StartElementBytes is a start tag split into its name and its raw,
// still-unparsed attributes.
type StartElementBytes struct {
	Name  NameBytes       // element name, split at the prefix separator
	Attrs AttributesBytes // raw attribute bytes following the name; look up values with Get
}
209 |
210 | func (t Token) Name() NameBytes {
211 | name, _ := gosax.Name(t.Bytes)
212 | p := bytes.IndexByte(name, ':')
213 | if p < 0 {
214 | return NameBytes{name, 0}
215 | }
216 | return NameBytes{name, p + 1}
217 | }
218 |
// StartElement converts the token's raw bytes to an xml.StartElement using
// gosax.StartElement.
func (t Token) StartElement() (xml.StartElement, error) {
	return gosax.StartElement(t.Bytes)
}
222 |
223 | func (t Token) StartElementBytes() StartElementBytes {
224 | name, attrs := gosax.Name(t.Bytes)
225 | p := bytes.IndexByte(name, ':')
226 | if p < 0 {
227 | p = 0
228 | } else {
229 | p += 1
230 | }
231 | return StartElementBytes{NameBytes{name, p}, attrs}
232 | }
233 |
// EndElement converts the token's raw bytes to an xml.EndElement using
// gosax.EndElement.
func (t Token) EndElement() xml.EndElement {
	return gosax.EndElement(t.Bytes)
}
237 |
238 | func (t Token) CharData() (xml.CharData, error) {
239 | switch gosax.Event(t).Type() {
240 | case gosax.EventText:
241 | return gosax.CharData(t.Bytes)
242 | case gosax.EventCData:
243 | return bytes.TrimSuffix(bytes.TrimPrefix(t.Bytes, []byte("")), nil
244 | default:
245 | panic("unreachable")
246 | }
247 | }
248 |
// ProcInst converts the token's raw bytes to an xml.ProcInst using
// gosax.ProcInst.
func (t Token) ProcInst() xml.ProcInst {
	return gosax.ProcInst(t.Bytes)
}
252 |
// Comment converts the token's raw bytes to an xml.Comment using
// gosax.Comment.
func (t Token) Comment() xml.Comment {
	return gosax.Comment(t.Bytes)
}
256 |
// Directive converts the token's raw bytes to an xml.Directive using
// gosax.Directive.
func (t Token) Directive() xml.Directive {
	return gosax.Directive(t.Bytes)
}
260 |
--------------------------------------------------------------------------------
/gosax.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2024, Nao Yonashiro
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | */
26 |
27 | // Package gosax provides a Simple API for XML (SAX) parser for Go.
28 | // It offers efficient, read-only XML parsing with streaming capabilities,
29 | // inspired by quick-xml and other high-performance parsing techniques.
30 | package gosax
31 |
32 | import (
33 | "bytes"
34 | "encoding/binary"
35 | "fmt"
36 | "io"
37 | "strconv"
38 | "unicode/utf8"
39 | )
40 |
// Event types reported by Event.Type.
const (
	eventUnknown               = iota // type of the zero Event
	EventStart                        // start tag
	EventEnd                          // end tag
	EventText                         // text between markup
	EventCData                        // CDATA section
	EventComment                      // comment
	EventProcessingInstruction        // processing instruction
	EventDocType                      // DOCTYPE declaration
	EventEOF                          // end of input
)
52 |
// Event is one item produced by Reader.Event.
type Event struct {
	Bytes []byte // raw bytes of the event as they appear in the input
	value uint32 // the low 8 bits hold the event type
}

// Type returns the event type, one of the Event* constants.
func (e Event) Type() uint8 {
	return uint8(e.value & 0xFF)
}
61 |
// Reader is a streaming XML tokenizer. Each call to Event runs the current
// state function, which returns the next event and selects the state for the
// following call.
type Reader struct {
	reader byteReader                   // buffered input
	state  func(*Reader) (Event, error) // state function run by the next call to Event

	// EmitSelfClosingTag, when true, makes a start tag that ends in "/>"
	// produce an EventStart followed by an EventEnd carrying the same bytes.
	// Reset sets it back to false.
	EmitSelfClosingTag bool
	selfClosingEnd     int // index, within the current window, of the '>' closing the pending self-closing tag
}
69 |
// NewReader returns a Reader that reads from r using a buffer with an initial
// capacity of 2 MiB.
func NewReader(r io.Reader) *Reader {
	return NewReaderSize(r, 2*1024*1024)
}
73 |
// NewReaderSize returns a Reader that reads from r using a newly allocated
// buffer with an initial capacity of bufSize bytes.
func NewReaderSize(r io.Reader, bufSize int) *Reader {
	return NewReaderBuf(r, make([]byte, 0, bufSize))
}
77 |
78 | func NewReaderBuf(r io.Reader, buf []byte) *Reader {
79 | var xr Reader
80 | xr.reader.data = buf
81 | xr.Reset(r)
82 | return &xr
83 | }
84 |
// Event returns the next Event from the XML stream.
// It returns an Event and any error encountered.
// The end of input is reported as an Event of type EventEOF with a nil error,
// and further calls keep returning that event.
//
// Note: The returned Event object is only valid until the next call to Event.
// The underlying byte slice may be overwritten by subsequent calls.
// If you need to retain the Event data, make a copy before the next Event call.
func (r *Reader) Event() (Event, error) {
	return r.state(r)
}
94 |
95 | func (r *Reader) Reset(reader io.Reader) {
96 | data := r.reader.data
97 | if data != nil {
98 | data = data[:0]
99 | }
100 | r.reader = byteReader{
101 | data: data,
102 | r: reader,
103 | }
104 | r.state = (*Reader).stateInit
105 | r.EmitSelfClosingTag = false
106 | }
107 |
// stateInit is the state installed by Reset. It currently delegates straight
// to stateInsideText.
func (r *Reader) stateInit() (Event, error) {
	// remove_utf8_bom
	// NOTE(review): the note above reads as a placeholder; no BOM handling is
	// performed here.
	return r.stateInsideText()
}
112 |
113 | func (r *Reader) stateInsideText() (Event, error) {
114 | end, err := readText(&r.reader)
115 | if err == io.EOF {
116 | r.state = (*Reader).stateDone
117 | if end == 0 {
118 | return Event{
119 | value: EventEOF,
120 | }, nil
121 | } else {
122 | w := r.reader.window()
123 | r.reader.offset += len(w)
124 | return Event{
125 | Bytes: w,
126 | value: EventText,
127 | }, nil
128 | }
129 | }
130 | if err != nil {
131 | return Event{}, err
132 | }
133 | if end == 0 {
134 | return r.stateInsideMarkup()
135 | } else {
136 | r.state = (*Reader).stateInsideMarkup
137 | w := r.reader.window()[:end]
138 | r.reader.offset += len(w)
139 | return Event{
140 | Bytes: w,
141 | value: EventText,
142 | }, nil
143 | }
144 | }
145 |
// stateChangeMarker flags the bytes that matter while scanning a start tag
// outside of a quoted attribute value: either quote character (which opens a
// quoted value) and '>' (which ends the tag).
var stateChangeMarker = [256]bool{
	'"':  true,
	'\'': true,
	'>':  true,
}
151 |
152 | func (r *Reader) stateInsideMarkup() (Event, error) {
153 | r.state = (*Reader).stateInsideText
154 | rr := &r.reader
155 | if rr.offset+1 >= len(rr.data) {
156 | if rr.extend() == 0 {
157 | return Event{}, rr.err
158 | }
159 | }
160 | switch w := rr.window(); w[1] {
161 | case '!':
162 | if len(w) < 3 {
163 | if rr.extend() == 0 {
164 | return Event{}, rr.err
165 | }
166 | w = rr.window()
167 | }
168 | switch w[2] {
169 | case '[': // CData
170 | offset := 3
171 | for {
172 | if i := bytes.Index(w[offset:], []byte("]]>")); i >= 0 {
173 | r.reader.offset += offset + i + 3
174 | return Event{
175 | Bytes: w[:offset+i+3],
176 | value: EventCData,
177 | }, nil
178 | }
179 | offset = len(w) - 2
180 | if rr.extend() == 0 {
181 | return Event{}, rr.err
182 | }
183 | w = rr.window()
184 | }
185 | case '-': // Comment
186 | offset := 3
187 | for {
188 | if i := bytes.Index(w[offset:], []byte("-->")); i >= 0 {
189 | r.reader.offset += offset + i + 3
190 | return Event{
191 | Bytes: w[:offset+i+3],
192 | value: EventComment,
193 | }, nil
194 | }
195 | offset = len(w) - 2
196 | if rr.extend() == 0 {
197 | return Event{}, rr.err
198 | }
199 | w = rr.window()
200 | }
201 | case 'D', 'd': // DocType
202 | offset := 2
203 | for {
204 | lv := 1
205 | for i, c := range w[offset:] {
206 | if c == '>' {
207 | lv--
208 | if lv == 0 {
209 | r.reader.offset += offset + i + 1
210 | return Event{
211 | Bytes: w[:offset+i+1],
212 | value: EventDocType,
213 | }, nil
214 | }
215 | } else if c == '<' {
216 | lv++
217 | }
218 | }
219 | offset = len(w)
220 | if rr.extend() == 0 {
221 | return Event{}, rr.err
222 | }
223 | w = rr.window()
224 | }
225 | default:
226 | return Event{}, fmt.Errorf("unknown bang type: %c", w[1])
227 | }
228 | case '/': // close tag
229 | offset := 2
230 | for {
231 | if i := bytes.IndexByte(w[offset:], '>'); i >= 0 {
232 | r.reader.offset += offset + i + 1
233 | return Event{
234 | Bytes: w[:offset+i+1],
235 | value: EventEnd,
236 | }, nil
237 | }
238 | offset = len(w)
239 | if rr.extend() == 0 {
240 | return Event{}, rr.err
241 | }
242 | w = rr.window()
243 | }
244 | case '?': // processing instructions
245 | offset := 2
246 | for {
247 | if i := bytes.Index(w[offset:], []byte("?>")); i >= 0 {
248 | r.reader.offset += offset + i + 2
249 | return Event{
250 | Bytes: w[:offset+i+2],
251 | value: EventProcessingInstruction,
252 | }, nil
253 | }
254 | offset = len(w) - 1
255 | if rr.extend() == 0 {
256 | return Event{}, rr.err
257 | }
258 | w = rr.window()
259 | }
260 | default:
261 | const (
262 | splat uint64 = 0x0101010101010101
263 | v1 = '"' * splat
264 | v2 = '>' * splat
265 | v3 = '\'' * splat
266 | )
267 | state := byte('>')
268 | offset := 1
269 | for {
270 | for offset < len(w) {
271 | if state == '>' {
272 | for ; offset+8 < len(w); offset += 8 {
273 | v := binary.LittleEndian.Uint64(w[offset : offset+8])
274 | if hasZeroByte(v^v1) || hasZeroByte(v^v2) || hasZeroByte(v^v3) {
275 | break
276 | }
277 | }
278 | p := -1
279 | var ch byte
280 | for i, c := range w[offset:] {
281 | if stateChangeMarker[c] {
282 | p = i
283 | ch = c
284 | break
285 | }
286 | }
287 | if p >= 0 {
288 | if ch == '>' {
289 | if r.EmitSelfClosingTag && w[offset+p-1] == '/' {
290 | r.selfClosingEnd = offset + p
291 | r.state = (*Reader).stateSelfClosingTag
292 | } else {
293 | r.reader.offset += offset + p + 1
294 | }
295 | return Event{
296 | Bytes: w[:offset+p+1],
297 | value: EventStart,
298 | }, nil
299 | } else {
300 | state = ch
301 | offset += p + 1
302 | }
303 | } else {
304 | break
305 | }
306 | } else {
307 | if i := bytes.IndexByte(w[offset:], state); i >= 0 {
308 | offset += i + 1
309 | state = '>'
310 | } else {
311 | break
312 | }
313 | }
314 | }
315 | offset = len(w)
316 | if rr.extend() == 0 {
317 | return Event{}, rr.err
318 | }
319 | w = rr.window()
320 | }
321 | }
322 | }
323 |
324 | func (r *Reader) stateSelfClosingTag() (Event, error) {
325 | r.state = (*Reader).stateInsideText
326 | w := r.reader.window()
327 | r.reader.offset += r.selfClosingEnd + 1
328 | return Event{
329 | Bytes: w[:r.selfClosingEnd+1],
330 | value: EventEnd,
331 | }, nil
332 | }
333 |
// stateDone is the terminal state entered at end of input. It returns an
// EventEOF event on every call and never changes state.
func (r *Reader) stateDone() (Event, error) {
	return Event{
		value: EventEOF,
	}, nil
}
339 |
// hasZeroByte reports whether any of the eight bytes of x is zero.
//
// Subtracting 1 from every byte sets a byte's high bit when that byte was
// zero or had its high bit set already. Masking with ^x drops the latter
// case, so a surviving high bit means a zero byte is present.
func hasZeroByte(x uint64) bool {
	const (
		ones  uint64 = 0x0101010101010101
		highs uint64 = 0x8080808080808080
	)
	borrowed := x - ones
	clearHigh := ^x
	return borrowed&clearHigh&highs != 0
}
347 |
348 | func readText(r *byteReader) (int, error) {
349 | offset := 0
350 | for {
351 | w := r.window()
352 | if i := bytes.IndexByte(w[offset:], '<'); i >= 0 {
353 | return offset + i, nil
354 | }
355 | offset = len(w)
356 | if r.extend() == 0 {
357 | return offset, r.err
358 | }
359 | }
360 | }
361 |
362 | // Name extracts the name from an XML tag.
363 | // It returns the name and the remaining bytes.
364 | func Name(b []byte) ([]byte, []byte) {
365 | if len(b) > 1 && b[0] == '<' {
366 | b = b[1:]
367 | }
368 | if len(b) > 1 && b[0] == '/' {
369 | b = b[1:]
370 | }
371 | if len(b) > 1 && b[len(b)-1] == '>' {
372 | b = b[:len(b)-1]
373 | }
374 | if len(b) > 1 && b[len(b)-1] == '/' {
375 | b = b[:len(b)-1]
376 | }
377 | for i, c := range b {
378 | if whitespace[c] {
379 | return b[:i], b[i+1:]
380 | }
381 | }
382 | return b, nil
383 | }
384 |
// Attribute is one attribute as returned by NextAttribute. Both fields are
// sub-slices of the input.
type Attribute struct {
	Key   []byte // attribute name
	Value []byte // raw value including its surrounding quotes; empty when the attribute has no value
}
389 |
390 | // NextAttribute extracts the next attribute from an XML tag.
391 | // It returns the Attribute and the remaining bytes.
392 | func NextAttribute(b []byte) (Attribute, []byte, error) {
393 | i := 0
394 | for ; i < len(b) && whitespace[b[i]]; i++ {
395 | }
396 | if i == len(b) {
397 | return Attribute{}, nil, nil
398 | }
399 | keyStart := i
400 | for ; i < len(b) && !whitespace[b[i]] && b[i] != '='; i++ {
401 | }
402 | if i == len(b) {
403 | return Attribute{Key: b[keyStart:]}, nil, nil
404 | }
405 | key := b[keyStart:i]
406 | for ; i < len(b) && whitespace[b[i]]; i++ {
407 | }
408 | if i == len(b) {
409 | return Attribute{Key: key}, nil, nil
410 | }
411 | if b[i] != '=' {
412 | return Attribute{Key: key}, b[i:], nil
413 | }
414 | i++
415 | for ; i < len(b) && whitespace[b[i]]; i++ {
416 | }
417 | if i == len(b) {
418 | return Attribute{Key: key}, nil, fmt.Errorf("attribute value not found")
419 | }
420 | if b[i] == '"' {
421 | valueEnd := i + 1 + bytes.IndexByte(b[i+1:], '"') + 1
422 | value := b[i:valueEnd]
423 | return Attribute{Key: key, Value: value}, b[valueEnd:], nil
424 | }
425 | if b[i] == '\'' {
426 | valueEnd := i + 1 + bytes.IndexByte(b[i+1:], '\'') + 1
427 | value := b[i:valueEnd]
428 | return Attribute{Key: key, Value: value}, b[valueEnd:], nil
429 | }
430 | return Attribute{}, nil, fmt.Errorf("invalid attribute value: %c", b[i])
431 | }
432 |
// whitespace is a lookup table flagging the bytes treated as whitespace when
// splitting tag names and attributes: space, carriage return, line feed and
// tab.
var whitespace = [256]bool{
	' ':  true,
	'\r': true,
	'\n': true,
	'\t': true,
}
439 |
// Unescape decodes XML entity references in a byte slice.
// It returns the unescaped bytes and any error encountered.
//
// The five predefined entities (lt, gt, amp, apos, quot) and numeric
// character references (decimal "&#N;" and hexadecimal "&#xN;") are decoded.
// In addition, "\r\n" and a lone "\r" are each replaced by "\n".
//
// Decoding happens in place: b is overwritten and the result is a prefix of
// b. When b needs no change it is returned as is. On error the result is nil
// and b may already have been partially rewritten.
func Unescape(b []byte) ([]byte, error) {
	p := indexUnescape(b)
	if p < 0 {
		// Fast path: nothing to decode.
		return b, nil
	}
	begin := 0 // read position: start of the input not yet copied to the output
	cur := p   // write position: the bytes before the first match are already in place
	for {
		if b[p] == '&' {
			var escaped []byte
			// Look for the terminating ';' between 2 and 12 bytes after the '&'.
			for i := 2; i < 13 && p+i < len(b); i++ {
				if b[p+i] == ';' {
					escaped = b[p+1 : p+i]
					break
				}
			}
			// A reference body shorter than two bytes (or no ';' found) is rejected.
			if len(escaped) <= 1 {
				return nil, fmt.Errorf("invalid escape sequence")
			}
			// Move the literal text between the previous match and this one
			// down to the write position, unless it is already there or empty.
			if cur != p && begin != p {
				cur += copy(b[cur:], b[begin:p])
			}
			if escaped[0] == '#' {
				// Numeric character reference: hexadecimal after "#x", decimal otherwise.
				var x uint64
				var err error
				if escaped[1] == 'x' {
					x, err = strconv.ParseUint(string(escaped[2:]), 16, 32)
				} else {
					x, err = strconv.ParseUint(string(escaped[1:]), 10, 32)
				}
				if err != nil {
					return nil, fmt.Errorf("invalid char reference: %w", err)
				}
				cur += utf8.EncodeRune(b[cur:], rune(x))
			} else {
				switch string(escaped) {
				case "lt":
					b[cur] = '<'
				case "gt":
					b[cur] = '>'
				case "amp":
					b[cur] = '&'
				case "apos":
					b[cur] = '\''
				case "quot":
					b[cur] = '"'
				default:
					return nil, fmt.Errorf("invalid escape sequence: %q", string(escaped))
				}
				cur++
			}
			// Resume reading after "&" + body + ";".
			begin = p + len(escaped) + 2
		} else {
			// b[p] is '\r': normalise the line ending to a single '\n'.
			if cur != p && begin != p {
				cur += copy(b[cur:], b[begin:p])
			}
			b[cur] = '\n'
			cur++
			begin = p + 1
			if p+1 < len(b) && b[p+1] == '\n' {
				// "\r\n": also skip the '\n' so the pair collapses to one byte.
				begin += 1
			}
		}
		if i := indexUnescape(b[begin:]); i >= 0 {
			p = begin + i
		} else {
			break
		}
	}
	// Copy whatever follows the last match.
	if len(b) != begin {
		cur += copy(b[cur:], b[begin:])
	}
	return b[:cur], nil
}
516 |
517 | func indexUnescape(s []byte) int {
518 | const (
519 | splat uint64 = 0x0101010101010101
520 | v1 = '&' * splat
521 | v2 = '\r' * splat
522 | )
523 | offset := 0
524 | for len(s) >= 8 {
525 | v := binary.LittleEndian.Uint64(s[:8])
526 | if hasZeroByte(v^v1) || hasZeroByte(v^v2) {
527 | break
528 | }
529 | s = s[8:]
530 | offset += 8
531 | }
532 | for i, c := range s {
533 | if c == '&' || c == '\r' {
534 | return offset + i
535 | }
536 | }
537 | return -1
538 | }
539 |
--------------------------------------------------------------------------------