├── .github
├── FUNDING.yml
├── dependabot.yml
└── workflows
│ └── ci.yml
├── testdata
├── self_closing.xml
├── copyright_header.xml
├── corrupted
│ └── cdata_truncated.xml
├── cdata.xml
├── cdata_clrf.xml
├── dtd.xml
├── xlsx_sheet1.xml
└── long_comment_token.xml
├── go.mod
├── internal
├── README.md
├── gpx
│ ├── schema
│ │ ├── xml.go
│ │ ├── gpx.go
│ │ ├── extensions.go
│ │ ├── metadata.go
│ │ └── track.go
│ └── unmarshal.go
├── xlsx
│ ├── unmarshal.go
│ └── schema
│ │ └── sheet.go
└── main.go
├── go.sum
├── codecov.yml
├── CONTRIBUTING.md
├── LICENCE
├── README.md
├── token.go
├── benchmark_test.go
├── token_test.go
├── tokenizer_internal_test.go
├── docs
└── USAGE.md
├── tokenizer.go
└── tokenizer_test.go
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [muktihari]
2 |
--------------------------------------------------------------------------------
/testdata/self_closing.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/muktihari/xmltokenizer
2 |
3 | go 1.21
4 |
5 | require github.com/google/go-cmp v0.7.0
6 |
--------------------------------------------------------------------------------
/testdata/copyright_header.xml:
--------------------------------------------------------------------------------
1 |
4 |
--------------------------------------------------------------------------------
/testdata/corrupted/cdata_truncated.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
2 |
3 |
4 |
5 |
6 |
7 | text]]>
8 |
9 |
10 | text
12 | ]]>
13 |
14 |
--------------------------------------------------------------------------------
/testdata/cdata_clrf.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | text]]>
8 |
9 |
10 | text
12 | ]]>
13 |
14 |
15 |
--------------------------------------------------------------------------------
/internal/gpx/schema/xml.go:
--------------------------------------------------------------------------------
1 | package schema
2 |
3 | import (
4 | "encoding/xml"
5 | "fmt"
6 | )
7 |
8 | func getCharData(dec *xml.Decoder) (xml.CharData, error) {
9 | token, err := dec.Token()
10 | if err != nil {
11 | return nil, err
12 | }
13 | v, ok := token.(xml.CharData)
14 | if !ok {
15 | return nil, fmt.Errorf("not a chardata")
16 | }
17 | return v, nil
18 | }
19 |
--------------------------------------------------------------------------------
/testdata/dtd.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | ]>
7 |
8 |
9 | Tove
10 | Jani
11 | Reminder
12 | Don't forget me this weekend!
13 |
14 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5 |
6 | version: 2
7 |
8 | updates:
9 | - package-ecosystem: gomod
10 | directory: /
11 | schedule:
12 | interval: monthly
13 |
14 | - package-ecosystem: github-actions
15 | directory: /
16 | schedule:
17 | interval: monthly
18 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Hi, thank you for showing interest in this project. I would be grateful to receive feedback and any form of help.
4 |
5 | If you have a trivial fix or improvement, go ahead and create a [pull request][prs] and mention me to review the changes.
6 |
7 | If you plan to make major changes, here are a few guidelines to follow:
8 |
9 | 1. Check the [open issues][issues] and [pull requests][prs] for existing discussions.
10 | 1. Open an [issue][issues] first, to discuss new feature or enhancement.
11 | 1. Write the code with tests, and make sure it passes locally and on CI.
12 | 1. Open a pull request, and reference the relevant issue(s).
13 | 1. After receiving feedback, squash your commits and wrap them in an informative message.
14 |
15 | Have fun!
16 |
17 | [issues]: https://github.com/muktihari/xmltokenizer/issues
18 | [prs]: https://github.com/muktihari/xmltokenizer/pulls
19 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | # This workflow will build a golang project
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go
3 |
4 | name: CI
5 |
6 | on:
7 | push:
8 | branches: ["master"]
9 |
10 | pull_request:
11 | branches: ["master"]
12 |
13 | permissions: {}
14 |
15 | jobs:
16 | build:
17 | runs-on: ubuntu-latest
18 | steps:
19 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
20 |
21 | - name: Set up Go
22 | uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
23 | with:
24 | go-version: "stable"
25 |
26 | - name: Test
27 | run: go test -v -cover -coverprofile=coverage.coverprofile ./...
28 |
29 | - name: Upload coverage reports to Codecov
30 | uses: codecov/codecov-action@0565863a31f2c772f9f0395002a31e3f06189574 # v5.4.0
31 | env:
32 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
33 |
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Hikmatulloh Hari Mukti
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
--------------------------------------------------------------------------------
/internal/gpx/unmarshal.go:
--------------------------------------------------------------------------------
1 | package gpx
2 |
3 | import (
4 | "encoding/xml"
5 | "io"
6 |
7 | "github.com/muktihari/xmltokenizer"
8 | "github.com/muktihari/xmltokenizer/internal/gpx/schema"
9 | )
10 |
11 | func UnmarshalWithXMLTokenizer(f io.Reader) (schema.GPX, error) {
12 | tok := xmltokenizer.New(f)
13 | var gpx schema.GPX
14 | loop:
15 | for {
16 | token, err := tok.Token()
17 | if err == io.EOF {
18 | break
19 | }
20 | if err != nil {
21 | return gpx, err
22 | }
23 |
24 | switch string(token.Name.Local) {
25 | case "gpx":
26 | se := xmltokenizer.GetToken().Copy(token)
27 | err = gpx.UnmarshalToken(tok, se)
28 | xmltokenizer.PutToken(se)
29 | if err != nil {
30 | return gpx, err
31 | }
32 | break loop
33 | }
34 | }
35 |
36 | return gpx, nil
37 | }
38 |
39 | func UnmarshalWithStdlibXML(f io.Reader) (schema.GPX, error) {
40 | dec := xml.NewDecoder(f)
41 | var gpx schema.GPX
42 | loop:
43 | for {
44 | token, err := dec.Token()
45 | if err == io.EOF {
46 | break
47 | }
48 | if err != nil {
49 | return gpx, err
50 | }
51 |
52 | se, ok := token.(xml.StartElement)
53 | if !ok {
54 | continue
55 | }
56 | switch se.Name.Local {
57 | case "gpx":
58 | if err = gpx.UnmarshalXML(dec, se); err != nil {
59 | return gpx, err
60 | }
61 | break loop
62 | }
63 | }
64 |
65 | return gpx, nil
66 | }
67 |
--------------------------------------------------------------------------------
/internal/xlsx/unmarshal.go:
--------------------------------------------------------------------------------
1 | package xlsx
2 |
3 | import (
4 | "encoding/xml"
5 | "io"
6 |
7 | "github.com/muktihari/xmltokenizer"
8 | "github.com/muktihari/xmltokenizer/internal/xlsx/schema"
9 | )
10 |
11 | func UnmarshalWithXMLTokenizer(r io.Reader) (schema.SheetData, error) {
12 | tok := xmltokenizer.New(r)
13 | var sheetData schema.SheetData
14 | loop:
15 | for {
16 | token, err := tok.Token()
17 | if err == io.EOF {
18 | break
19 | }
20 | if err != nil {
21 | return sheetData, err
22 | }
23 |
24 | switch string(token.Name.Local) {
25 | case "sheetData":
26 | se := xmltokenizer.GetToken().Copy(token)
27 | err = sheetData.UnmarshalToken(tok, se)
28 | xmltokenizer.PutToken(se)
29 | if err != nil {
30 | return sheetData, err
31 | }
32 | break loop
33 | }
34 | }
35 |
36 | return sheetData, nil
37 | }
38 |
39 | func UnmarshalWithStdlibXML(r io.Reader) (schema.SheetData, error) {
40 | dec := xml.NewDecoder(r)
41 | var sheetData schema.SheetData
42 | for {
43 | token, err := dec.Token()
44 | if err == io.EOF {
45 | break
46 | }
47 | if err != nil {
48 | return sheetData, err
49 | }
50 |
51 | switch elem := token.(type) {
52 | case xml.StartElement:
53 | if elem.Name.Local == "sheetData" {
54 | if err = dec.DecodeElement(&sheetData, &elem); err != nil {
55 | return sheetData, err
56 | }
57 | }
58 | }
59 | }
60 | return sheetData, nil
61 | }
62 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # XML Tokenizer
2 |
3 | 
4 | [](https://pkg.go.dev/github.com/muktihari/xmltokenizer)
5 | [](https://codecov.io/gh/muktihari/xmltokenizer)
6 | [](https://goreportcard.com/report/github.com/muktihari/xmltokenizer)
7 |
8 | XML Tokenizer is a low-memory high performance non-namespace parser library for parsing simple XML 1.0. This is an alternative option to the standard library's xml when speed is your main concern and you are willing to sacrifice certain features, such as handling the namespace, in favor of speed ([discussion](https://www.reddit.com/r/golang/comments/1drdji3/xml_tokenizer_thats_4x_faster_than_stdlibs_xml/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button)). This may not cover all XML files, but it can cover typical XML files.
9 |
10 | # Motivation
11 |
12 | Go provides a standard library for [XML](https://pkg.go.dev/encoding/xml) parsing; however, I've found it to be slow for my use case. I work with a lot of GPX files in my personal project to retrieve my workout data; GPX is an XML-based file format. When parsing my 14MB GPX file containing a 208km ride using the standard library's xml, it takes roughly 600ms, which is super slow, and it needs 2.8 million allocations! I need an alternative library for parsing XML that's faster than the standard library's `xml`, suitable for typical XML parsing tasks, with no unsafe code.
13 |
14 | # Usage
15 |
16 | Please see [USAGE.md](./docs/USAGE.md).
17 |
18 | # Benchmark
19 |
20 | ```js
21 | goos: darwin; goarch: amd64; pkg: xmltokenizer
22 | cpu: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz
23 | Benchmark/stdlib.xml:"ride_sembalun.gpx"-4 2 605913816 ns/op 110562568 B/op 2806823 allocs/op
24 | Benchmark/xmltokenizer:"ride_sembalun.gpx"-4 8 141616068 ns/op 17143609 B/op 85 allocs/op
25 | ```
26 |
27 | Approx. 4 times faster!
28 |
--------------------------------------------------------------------------------
/internal/gpx/schema/gpx.go:
--------------------------------------------------------------------------------
1 | package schema
2 |
3 | import (
4 | "encoding/xml"
5 | "fmt"
6 |
7 | "github.com/muktihari/xmltokenizer"
8 | )
9 |
// GPX is GPX schema (simplified).
type GPX struct {
	Creator  string   `xml:"creator,attr"`       // value of the "creator" attribute
	Version  string   `xml:"version,attr"`       // value of the "version" attribute
	Metadata Metadata `xml:"metadata,omitempty"` // decoded from the <metadata> child
	Tracks   []Track  `xml:"trk,omitempty"`      // decoded from <trk> children, in document order
}
17 |
18 | func (g *GPX) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
19 | for i := range se.Attrs {
20 | attr := &se.Attrs[i]
21 | switch string(attr.Name.Local) {
22 | case "creator":
23 | g.Creator = string(attr.Value)
24 | case "version":
25 | g.Version = string(attr.Value)
26 | }
27 | }
28 |
29 | for {
30 | token, err := tok.Token()
31 | if err != nil {
32 | return fmt.Errorf("gpx: %w", err)
33 | }
34 |
35 | if token.IsEndElementOf(se) {
36 | return nil
37 | }
38 | if token.IsEndElement {
39 | continue
40 | }
41 |
42 | switch string(token.Name.Local) {
43 | case "metadata":
44 | se := xmltokenizer.GetToken().Copy(token)
45 | err = g.Metadata.UnmarshalToken(tok, se)
46 | xmltokenizer.PutToken(se)
47 | if err != nil {
48 | return fmt.Errorf("metadata: %w", err)
49 | }
50 | case "trk":
51 | var track Track
52 | se := xmltokenizer.GetToken().Copy(token)
53 | err = track.UnmarshalToken(tok, se)
54 | xmltokenizer.PutToken(se)
55 | if err != nil {
56 | return fmt.Errorf("track: %w", err)
57 | }
58 | g.Tracks = append(g.Tracks, track)
59 | }
60 | }
61 | }
62 |
63 | func (g *GPX) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
64 | for i := range se.Attr {
65 | attr := &se.Attr[i]
66 | switch attr.Name.Local {
67 | case "creator":
68 | g.Creator = attr.Value
69 | case "version":
70 | g.Version = attr.Value
71 | }
72 | }
73 |
74 | for {
75 | token, err := dec.Token()
76 | if err != nil {
77 | return fmt.Errorf("gpx: %w", err)
78 | }
79 |
80 | switch elem := token.(type) {
81 | case xml.StartElement:
82 | switch elem.Name.Local {
83 | case "metadata":
84 | if err := g.Metadata.UnmarshalXML(dec, elem); err != nil {
85 | return fmt.Errorf("metadata: %w", err)
86 | }
87 | case "trk":
88 | var track Track
89 | if err := track.UnmarshalXML(dec, elem); err != nil {
90 | return fmt.Errorf("track: %w", err)
91 | }
92 | g.Tracks = append(g.Tracks, track)
93 | }
94 |
95 | case xml.EndElement:
96 | if elem == se.End() {
97 | return nil
98 | }
99 | }
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/token.go:
--------------------------------------------------------------------------------
1 | package xmltokenizer
2 |
3 | import "sync"
4 |
// pool recycles Token instances so repeated Copy/consume cycles do not
// allocate a fresh Token each time.
var pool = sync.Pool{New: func() any { return new(Token) }}

// GetToken gets a token from the pool, don't forget to put it back.
func GetToken() *Token { return pool.Get().(*Token) }

// PutToken puts the token back to the pool; the caller must not use t afterwards.
func PutToken(t *Token) { pool.Put(t) }
12 |
// Token represents a single XML token: a start element, an end element,
// character data (CharData or CDATA), a comment, a processing
// instruction, or a directive such as a DOCTYPE declaration.
//
// Token includes CharData or CDATA in the Data field when it appears
// right after the start element.
//
// NOTE(review): the original declaration's comments were garbled in
// this copy; field docs below are reconstructed from how the fields are
// used by Copy and IsEndElementOf — confirm against upstream.
type Token struct {
	Name         Name   // element name; empty for non-element tokens
	Attrs        []Attr // start element attributes, if any
	Data         []byte // CharData/CDATA content, or raw token bytes for non-element tokens
	SelfClosing  bool   // true for a self-closing element, e.g. <c/>
	IsEndElement bool   // true for an end element (closing tag), e.g. </gpx>
}

// IsEndElementOf checks whether the given token represents an
// end element (closing tag) of the given StartElement.
func (t *Token) IsEndElementOf(se *Token) bool {
	// Comparing converted []byte with == compiles without allocation.
	return t.IsEndElement &&
		string(t.Name.Full) == string(se.Name.Full)
}

// Copy copies src Token into t, returning t. Attrs should be
// consumed immediately since it's only being shallow copied.
func (t *Token) Copy(src Token) *Token {
	// Reuse existing capacity ([:0]) so pooled tokens don't reallocate.
	t.Name.Prefix = append(t.Name.Prefix[:0], src.Name.Prefix...)
	t.Name.Local = append(t.Name.Local[:0], src.Name.Local...)
	t.Name.Full = append(t.Name.Full[:0], src.Name.Full...)
	t.Attrs = append(t.Attrs[:0], src.Attrs...) // shallow copy
	t.Data = append(t.Data[:0], src.Data...)
	t.SelfClosing = src.SelfClosing
	t.IsEndElement = src.IsEndElement
	return t
}

// Attr represents an XML attribute.
type Attr struct {
	Name  Name
	Value []byte
}

// Name represents an XML name "prefix:local";
// we don't manage the bookkeeping of namespaces.
type Name struct {
	Prefix []byte
	Local  []byte
	Full   []byte // Full is the combination "prefix:local"
}
71 |
--------------------------------------------------------------------------------
/internal/xlsx/schema/sheet.go:
--------------------------------------------------------------------------------
1 | package schema
2 |
3 | import (
4 | "fmt"
5 | "strconv"
6 |
7 | "github.com/muktihari/xmltokenizer"
8 | )
9 |
// SheetData represents a worksheet's sheetData element, holding its row children.
type SheetData struct {
	Rows []Row `xml:"row,omitempty"` // decoded rows, in document order
}
13 |
14 | func (s *SheetData) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
15 | for {
16 | token, err := tok.Token()
17 | if err != nil {
18 | return fmt.Errorf("sheetData: %w", err)
19 | }
20 |
21 | if token.IsEndElementOf(se) {
22 | break
23 | }
24 | if token.IsEndElement {
25 | continue
26 | }
27 |
28 | switch string(token.Name.Local) {
29 | case "row":
30 | var row Row
31 | se := xmltokenizer.GetToken().Copy(token)
32 | err = row.UnmarshalToken(tok, se)
33 | xmltokenizer.PutToken(se)
34 | if err != nil {
35 | return fmt.Errorf("row: %w", err)
36 | }
37 | s.Rows = append(s.Rows, row)
38 | }
39 | }
40 | return nil
41 | }
42 |
// Row represents a worksheet row element.
type Row struct {
	Index int    `xml:"r,attr,omitempty"` // parsed from the "r" attribute
	Cells []Cell `xml:"c"`                // decoded "c" cell children
}
47 |
48 | func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
49 | var err error
50 | for i := range se.Attrs {
51 | attr := &se.Attrs[i]
52 | switch string(attr.Name.Local) {
53 | case "r":
54 | r.Index, err = strconv.Atoi(string(attr.Value))
55 | if err != nil {
56 | return err
57 | }
58 | }
59 | }
60 |
61 | for {
62 | token, err := tok.Token()
63 | if err != nil {
64 | return fmt.Errorf("row: %w", err)
65 | }
66 |
67 | if token.IsEndElementOf(se) {
68 | break
69 | }
70 | if token.IsEndElement {
71 | continue
72 | }
73 |
74 | switch string(token.Name.Local) {
75 | case "c":
76 | var cell Cell
77 | se := xmltokenizer.GetToken().Copy(token)
78 | err = cell.UnmarshalToken(tok, se)
79 | xmltokenizer.PutToken(se)
80 | if err != nil {
81 | return fmt.Errorf("c: %w", err)
82 | }
83 | r.Cells = append(r.Cells, cell)
84 | }
85 | }
86 |
87 | return nil
88 | }
89 |
// Cell represents a worksheet cell element.
type Cell struct {
	Reference    string `xml:"r,attr"` // E.g. A1
	Style        int    `xml:"s,attr"` // parsed from the "s" attribute
	Type         string `xml:"t,attr,omitempty"`
	Value        string `xml:"v,omitempty"` // character data of the "v" child
	InlineString string `xml:"is>t"`        // character data of a "t" child (inline string)
}
97 |
98 | func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
99 | var err error
100 | for i := range se.Attrs {
101 | attr := &se.Attrs[i]
102 | switch string(attr.Name.Local) {
103 | case "r":
104 | c.Reference = string(attr.Value)
105 | case "s":
106 | c.Style, err = strconv.Atoi(string(attr.Value))
107 | if err != nil {
108 | return fmt.Errorf("s: %w", err)
109 | }
110 | case "t":
111 | c.Type = string(attr.Value)
112 | }
113 | }
114 |
115 | // Must check since `c` may contains self-closing tag:
116 | //
117 | if se.SelfClosing {
118 | return nil
119 | }
120 |
121 | for {
122 | token, err := tok.Token()
123 | if err != nil {
124 | return fmt.Errorf("cell: %w", err)
125 | }
126 |
127 | if token.IsEndElementOf(se) {
128 | break
129 | }
130 | if token.IsEndElement {
131 | continue
132 | }
133 |
134 | switch string(token.Name.Local) {
135 | case "v":
136 | c.Value = string(token.Data)
137 | case "t":
138 | c.InlineString = string(token.Data)
139 | }
140 | }
141 |
142 | return nil
143 | }
144 |
--------------------------------------------------------------------------------
/benchmark_test.go:
--------------------------------------------------------------------------------
1 | package xmltokenizer_test
2 |
3 | import (
4 | "bytes"
5 | "encoding/xml"
6 | "fmt"
7 | "io"
8 | "io/fs"
9 | "os"
10 | "path/filepath"
11 | "strings"
12 | "testing"
13 |
14 | "github.com/muktihari/xmltokenizer"
15 | "github.com/muktihari/xmltokenizer/internal/gpx"
16 | "github.com/muktihari/xmltokenizer/internal/xlsx"
17 | )
18 |
19 | func BenchmarkToken(b *testing.B) {
20 | filepath.Walk("testdata", func(path string, info fs.FileInfo, _ error) error {
21 | if info.IsDir() {
22 | return nil
23 | }
24 | name := strings.TrimPrefix(path, "testdata/")
25 | data, err := os.ReadFile(path)
26 | if err != nil {
27 | b.Logf("%v: %v", path, err)
28 | return nil
29 | }
30 |
31 | b.Run(fmt.Sprintf("stdlib.xml:%q", name), func(b *testing.B) {
32 | var err error
33 | for i := 0; i < b.N; i++ {
34 | if err = unmarshalWithStdlibXML(bytes.NewReader(data)); err != nil {
35 | b.Skipf("could not unmarshal: %v", err)
36 | }
37 | }
38 | })
39 | b.Run(fmt.Sprintf("xmltokenizer:%q", name), func(b *testing.B) {
40 | var err error
41 | for i := 0; i < b.N; i++ {
42 | if err = unmarshalWithXMLTokenizer(bytes.NewReader(data)); err != nil {
43 | b.Skipf("could not unmarshal: %v", err)
44 | }
45 | }
46 | })
47 | return nil
48 | })
49 | }
50 |
51 | func unmarshalWithXMLTokenizer(r io.Reader) error {
52 | tok := xmltokenizer.New(r)
53 | for {
54 | token, err := tok.Token()
55 | if err == io.EOF {
56 | break
57 | }
58 | if err != nil {
59 | return err
60 | }
61 | _ = token
62 | }
63 | return nil
64 | }
65 |
66 | func unmarshalWithStdlibXML(r io.Reader) error {
67 | dec := xml.NewDecoder(r)
68 | for {
69 | token, err := dec.Token()
70 | if err == io.EOF {
71 | break
72 | }
73 | if err != nil {
74 | return err
75 | }
76 | _ = token
77 | }
78 | return nil
79 | }
80 |
81 | func BenchmarkUnmarshalGPX(b *testing.B) {
82 | filepath.Walk("testdata", func(path string, info fs.FileInfo, _ error) error {
83 | if info.IsDir() {
84 | return nil
85 | }
86 | if strings.ToLower(filepath.Ext(path)) != ".gpx" {
87 | return nil
88 | }
89 |
90 | name := strings.TrimPrefix(path, "testdata/")
91 |
92 | data, err := os.ReadFile(path)
93 | if err != nil {
94 | panic(err)
95 | }
96 |
97 | b.Run(fmt.Sprintf("stdlib.xml:%q", name), func(b *testing.B) {
98 | for i := 0; i < b.N; i++ {
99 | _, _ = gpx.UnmarshalWithStdlibXML(bytes.NewReader(data))
100 | }
101 | })
102 | b.Run(fmt.Sprintf("xmltokenizer:%q", name), func(b *testing.B) {
103 | for i := 0; i < b.N; i++ {
104 | _, _ = gpx.UnmarshalWithXMLTokenizer(bytes.NewReader(data))
105 | }
106 | })
107 |
108 | return nil
109 | })
110 | }
111 |
112 | func BenchmarkUnmarshalXLSX(b *testing.B) {
113 | path := filepath.Join("testdata", "xlsx_sheet1.xml")
114 | name := strings.TrimPrefix(path, "testdata/")
115 |
116 | data, err := os.ReadFile(path)
117 | if err != nil {
118 | panic(err)
119 | }
120 |
121 | b.Run(fmt.Sprintf("stdlib.xml:%q", name), func(b *testing.B) {
122 | for i := 0; i < b.N; i++ {
123 | _, _ = xlsx.UnmarshalWithStdlibXML(bytes.NewReader(data))
124 | }
125 | })
126 | b.Run(fmt.Sprintf("xmltokenizer:%q", name), func(b *testing.B) {
127 | for i := 0; i < b.N; i++ {
128 | _, _ = xlsx.UnmarshalWithXMLTokenizer(bytes.NewReader(data))
129 | }
130 | })
131 | }
132 |
--------------------------------------------------------------------------------
/internal/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "strconv"
8 |
9 | "github.com/muktihari/xmltokenizer"
10 | )
11 |
12 | const sample = `
13 |
14 |
15 | 0
16 |
17 |
18 | 4
19 |
20 |
21 |
`
22 |
23 | func main() {
24 | f := bytes.NewReader([]byte(sample))
25 |
26 | tok := xmltokenizer.New(f)
27 | var row Row
28 | loop:
29 | for {
30 | token, err := tok.Token() // Token is only valid until next tok.Token() invocation (short-lived object).
31 | if err == io.EOF {
32 | break
33 | }
34 | if err != nil {
35 | panic(err)
36 | }
37 | switch string(token.Name.Local) { // This do not allocate 🥳👍
38 | case "row":
39 | // Reuse Token object in the sync.Pool since we only use it temporarily.
40 | se := xmltokenizer.GetToken().Copy(token) // se: StartElement, we should copy it since token is a short-lived object.
41 | err = row.UnmarshalToken(tok, se)
42 | xmltokenizer.PutToken(se) // Put back to sync.Pool.
43 | if err != nil {
44 | panic(err)
45 | }
46 | break loop
47 | }
48 | }
49 | fmt.Printf("row: %+v\n", row)
50 | // Output:
51 | // row: {Index:1 Cells:[{Reference:A1 Value:0} {Reference:B1 Value:4} {Reference:C1 Value:}]}
52 | }
53 |
// Row models a spreadsheet row: its numeric index and its cells.
type Row struct {
	Index int    `xml:"r,attr,omitempty"` // parsed from the "r" attribute
	Cells []Cell `xml:"c"`                // decoded "c" cell children
}
58 |
59 | func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
60 | var err error
61 | for i := range se.Attrs {
62 | attr := &se.Attrs[i]
63 | switch string(attr.Name.Local) {
64 | case "r":
65 | r.Index, err = strconv.Atoi(string(attr.Value))
66 | if err != nil {
67 | return err
68 | }
69 | }
70 | }
71 |
72 | for {
73 | token, err := tok.Token()
74 | if err != nil {
75 | return err
76 | }
77 | if token.IsEndElementOf(se) { // Reach desired EndElement
78 | return nil
79 | }
80 | if token.IsEndElement { // Ignore child's EndElements
81 | continue
82 | }
83 | switch string(token.Name.Local) {
84 | case "c":
85 | var cell Cell
86 | // Reuse Token object in the sync.Pool since we only use it temporarily.
87 | se := xmltokenizer.GetToken().Copy(token)
88 | err = cell.UnmarshalToken(tok, se)
89 | xmltokenizer.PutToken(se) // Put back to sync.Pool.
90 | if err != nil {
91 | return err
92 | }
93 | r.Cells = append(r.Cells, cell)
94 | }
95 | }
96 | }
97 |
// Cell models a single spreadsheet cell: its reference and its value.
type Cell struct {
	Reference string `xml:"r,attr"`      // parsed from the "r" attribute, e.g. A1
	Value     string `xml:"v,omitempty"` // character data of the "v" child
}
102 |
103 | func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
104 | for i := range se.Attrs {
105 | attr := &se.Attrs[i]
106 | switch string(attr.Name.Local) {
107 | case "r":
108 | c.Reference = string(attr.Value)
109 | }
110 | }
111 |
112 | // Must check since `c` may contains self-closing tag:
113 | //
114 | if se.SelfClosing {
115 | return nil
116 | }
117 |
118 | for {
119 | token, err := tok.Token()
120 | if err != nil {
121 | return err
122 | }
123 | if token.IsEndElementOf(se) { // Reach desired EndElement
124 | return nil
125 | }
126 | if token.IsEndElement { // Ignore child's EndElements
127 | continue
128 | }
129 | switch string(token.Name.Local) {
130 | case "v":
131 | c.Value = string(token.Data)
132 | }
133 | }
134 | }
135 |
--------------------------------------------------------------------------------
/internal/gpx/schema/extensions.go:
--------------------------------------------------------------------------------
1 | package schema
2 |
3 | import (
4 | "encoding/xml"
5 | "fmt"
6 | "math"
7 | "strconv"
8 |
9 | "github.com/muktihari/xmltokenizer"
10 | )
11 |
// TrackpointExtension is a GPX extension for health-related data.
// Each field's maximum value (NaN for Distance) marks "not present";
// see reset.
type TrackpointExtension struct {
	Cadence     uint8
	Distance    float64
	HeartRate   uint8
	Temperature int8
	Power       uint16
}
20 |
21 | func (t *TrackpointExtension) reset() {
22 | t.Cadence = math.MaxUint8
23 | t.Distance = math.NaN()
24 | t.HeartRate = math.MaxUint8
25 | t.Temperature = math.MaxInt8
26 | t.Power = math.MaxUint16
27 | }
28 |
29 | func (t *TrackpointExtension) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
30 | t.reset()
31 |
32 | for {
33 | token, err := tok.Token()
34 | if err != nil {
35 | return fmt.Errorf("trackpointExtension: %w", err)
36 | }
37 |
38 | if token.IsEndElementOf(se) {
39 | return nil
40 | }
41 | if token.IsEndElement {
42 | continue
43 | }
44 |
45 | switch string(token.Name.Local) {
46 | case "cad", "cadence":
47 | val, err := strconv.ParseUint(string(token.Data), 10, 8)
48 | if err != nil {
49 | return err
50 | }
51 | t.Cadence = uint8(val)
52 | case "distance":
53 | val, err := strconv.ParseFloat(string(token.Data), 64)
54 | if err != nil {
55 | return err
56 | }
57 | t.Distance = val
58 | case "hr", "heartrate":
59 | val, err := strconv.ParseUint(string(token.Data), 10, 8)
60 | if err != nil {
61 | return err
62 | }
63 | t.HeartRate = uint8(val)
64 | case "atemp", "temp", "temperature":
65 | val, err := strconv.ParseInt(string(token.Data), 10, 8)
66 | if err != nil {
67 | return err
68 | }
69 | t.Temperature = int8(val)
70 | case "power":
71 | val, err := strconv.ParseUint(string(token.Data), 10, 16)
72 | if err != nil {
73 | return err
74 | }
75 | t.Power = uint16(val)
76 | }
77 | }
78 | }
79 |
// UnmarshalXML decodes trackpoint-extension children with the standard
// decoder until se's end element. It relies on each child element's
// character data immediately following its start element (getCharData
// fails otherwise), so statement order here is significant.
func (t *TrackpointExtension) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	t.reset() // mark all fields "not present" before decoding

	for {
		token, err := dec.Token()
		if err != nil {
			return fmt.Errorf("trackpointExtension: %w", err)
		}

		switch elem := token.(type) {
		case xml.StartElement:
			// Pull the element's text content right away; see getCharData.
			charData, err := getCharData(dec)
			if err != nil {
				return err
			}
			switch elem.Name.Local {
			case "cad", "cadence":
				val, err := strconv.ParseUint(string(charData), 10, 8)
				if err != nil {
					return err
				}
				t.Cadence = uint8(val)
			case "distance":
				val, err := strconv.ParseFloat(string(charData), 64)
				if err != nil {
					return err
				}
				t.Distance = val
			case "hr", "heartrate":
				val, err := strconv.ParseUint(string(charData), 10, 8)
				if err != nil {
					return err
				}
				t.HeartRate = uint8(val)
			case "atemp", "temp", "temperature":
				val, err := strconv.ParseInt(string(charData), 10, 8)
				if err != nil {
					return err
				}
				t.Temperature = int8(val)
			case "power":
				val, err := strconv.ParseUint(string(charData), 10, 16)
				if err != nil {
					return err
				}
				t.Power = uint16(val)
			}
		case xml.EndElement:
			if elem == se.End() {
				return nil
			}
		}
	}
}
134 |
--------------------------------------------------------------------------------
/token_test.go:
--------------------------------------------------------------------------------
1 | package xmltokenizer_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/google/go-cmp/cmp"
7 | "github.com/muktihari/xmltokenizer"
8 | )
9 |
10 | func TestGetToken(t *testing.T) {
11 | alloc := testing.AllocsPerRun(10, func() {
12 | token := xmltokenizer.GetToken()
13 | xmltokenizer.PutToken(token)
14 | })
15 | if alloc != 0 {
16 | t.Fatalf("expected alloc: 0, got: %g", alloc)
17 | }
18 | }
19 |
// TestIsEndElement exercises the IsEndElement flag across token shapes:
// an end element, a start element, and a processing instruction.
func TestIsEndElement(t *testing.T) {
	tt := []struct {
		name     string
		token    xmltokenizer.Token
		expected bool
	}{
		{
			name: "an end element",
			token: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("worksheet"),
				},
				IsEndElement: true,
			},
			expected: true,
		},
		{
			name: "a start element",
			token: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("worksheet"),
				},
			},
			expected: false,
		},
		{
			name: "a procinst",
			token: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("?xml"),
				},
			},
			expected: false,
		},
	}

	for _, tc := range tt {
		t.Run(tc.name, func(t *testing.T) {
			if r := tc.token.IsEndElement; r != tc.expected {
				t.Fatalf("expected: %t, got: %t", tc.expected, r)
			}
		})
	}
}
64 |
// TestIsEndElementOf checks t1.IsEndElementOf(&t2): true only when t1 is
// an end element whose full name matches t2's full name.
func TestIsEndElementOf(t *testing.T) {
	tt := []struct {
		name     string
		t1, t2   xmltokenizer.Token
		expected bool
	}{
		{
			name: "correct end element",
			t1: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("worksheet"),
				},
				IsEndElement: true,
			},
			t2: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("worksheet"),
				},
			},
			expected: true,
		},
		{
			// Names differ, so this must not match even though t1 looks
			// like a closing tag textually.
			name: "incorrect end element",
			t1: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("/gpx"),
				},
			},
			t2: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("worksheet"),
				},
			},
			expected: false,
		},
		{
			// Same name but IsEndElement is unset on t1.
			name: "not even an end element",
			t2: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("worksheet"),
				},
			},
			t1: xmltokenizer.Token{
				Name: xmltokenizer.Name{
					Full: []byte("worksheet"),
				},
			},
			expected: false,
		},
	}

	for _, tc := range tt {
		t.Run(tc.name, func(t *testing.T) {
			if r := tc.t1.IsEndElementOf(&tc.t2); r != tc.expected {
				t.Fatalf("expected: %t, got: %t", tc.expected, r)
			}
		})
	}
}
124 |
// TestCopy verifies that Token.Copy deep-copies Name and Data (mutating
// the copy leaves the source intact) while Attrs remain a shallow copy
// that shares backing arrays with the source.
func TestCopy(t *testing.T) {
	t1 := xmltokenizer.Token{
		Name: xmltokenizer.Name{
			Prefix: []byte("gpxtpx"),
			Local:  []byte("hr"),
			Full:   []byte("gpxtpx:hr"),
		},
		Attrs: []xmltokenizer.Attr{{
			Name: xmltokenizer.Name{
				Prefix: nil,
				Local:  []byte("units"),
				Full:   []byte("units"),
			},
			Value: []byte("bpm"),
		}},
		Data: []byte("70"),
	}

	var t2 xmltokenizer.Token
	t2.Copy(t1)

	// The copy must initially compare equal to the source.
	if diff := cmp.Diff(t2, t1); diff != "" {
		t.Fatal(diff)
	}

	// Mutate the copy's Name and Data in place; a deep copy means the
	// source must now differ.
	t2.Name.Full = append(t2.Name.Full[:0], "asd"...)
	t2.Data = append(t2.Data[:0], "60"...)
	if diff := cmp.Diff(t2, t1); diff == "" {
		t.Fatalf("expected different, got same")
	}

	// Test shallow copy, it should change the original
	t2.Attrs[0].Name.Full[0] = 'i'
	if diff := cmp.Diff(t2.Attrs, t1.Attrs); diff != "" {
		t.Fatal(diff)
	}
}
162 |
--------------------------------------------------------------------------------
/testdata/xlsx_sheet1.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
13 |
14 |
15 |
16 |
17 |
18 |
20 |
21 |
22 |
24 |
25 | 0
26 |
27 |
28 | 1
29 |
30 |
31 | 2
32 |
33 |
34 |
36 |
37 | 3
38 |
39 |
40 | 4
41 |
42 |
43 | 5
44 |
45 |
46 |
47 |
48 |
50 |
51 | 6
52 |
53 |
54 | 7
55 |
56 |
57 | 8
58 |
59 |
60 |
61 |
62 |
64 |
66 |
68 |
70 |
72 |
74 |
77 |
78 |
80 |
82 |
86 |
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/tokenizer_internal_test.go:
--------------------------------------------------------------------------------
1 | package xmltokenizer
2 |
3 | import (
4 | "errors"
5 | "io"
6 | "os"
7 | "path/filepath"
8 | "testing"
9 |
10 | "github.com/google/go-cmp/cmp"
11 | )
12 |
// TestOptions verifies option handling in New: the package defaults, the
// fallback to defaults for non-positive values, and the rule that the
// auto-grow max limit is raised to at least the read buffer size.
func TestOptions(t *testing.T) {
	tt := []struct {
		name            string
		options         []Option
		expectedOptions options
	}{
		{
			name:            "defaultOptions",
			expectedOptions: defaultOptions(),
		},
		{
			// Non-positive sizes must fall back to package defaults.
			name: "less than 0",
			options: []Option{
				WithReadBufferSize(-1),
				WithAttrBufferSize(-1),
				WithAutoGrowBufferMaxLimitSize(-1),
			},
			expectedOptions: options{
				readBufferSize:             defaultReadBufferSize,
				autoGrowBufferMaxLimitSize: autoGrowBufferMaxLimitSize,
				attrsBufferSize:            defaultAttrsBufferSize,
			},
		},
		{
			// The grow limit is bumped up to the read buffer size.
			name: "readBufferSize > maxLimitGrowBufferSize",
			options: []Option{
				WithReadBufferSize(4 << 10),
				WithAutoGrowBufferMaxLimitSize(1 << 10),
			},
			expectedOptions: options{
				readBufferSize:             4 << 10,
				autoGrowBufferMaxLimitSize: 4 << 10,
				attrsBufferSize:            defaultAttrsBufferSize,
			},
		},
	}

	for _, tc := range tt {
		t.Run(tc.name, func(t *testing.T) {
			tok := New(nil, tc.options...)
			if diff := cmp.Diff(tok.options, tc.expectedOptions,
				cmp.AllowUnexported(options{}),
			); diff != "" {
				t.Fatal(diff)
			}
		})
	}
}
61 |
62 | func TestAutoGrowBuffer(t *testing.T) {
63 | tt := []struct {
64 | name string
65 | filename string
66 | opts []Option
67 | err error
68 | }{
69 | {
70 | name: "grow buffer with alloc",
71 | filename: "long_comment_token.xml",
72 | opts: []Option{
73 | WithReadBufferSize(5),
74 | },
75 | err: nil,
76 | },
77 | {
78 | name: "grow buffer exceed max limit",
79 | filename: "long_comment_token.xml",
80 | opts: []Option{
81 | WithReadBufferSize(5),
82 | WithAutoGrowBufferMaxLimitSize(5),
83 | },
84 | err: errAutoGrowBufferExceedMaxLimit,
85 | },
86 | }
87 |
88 | for _, tc := range tt {
89 | t.Run(tc.name, func(t *testing.T) {
90 | f, err := os.Open(filepath.Join("testdata", tc.filename))
91 | if err != nil {
92 | panic(err)
93 | }
94 | defer f.Close()
95 |
96 | tok := New(f, tc.opts...)
97 | for {
98 | _, err = tok.Token()
99 | if err == io.EOF {
100 | err = nil
101 | break
102 | }
103 | if err != nil {
104 | break
105 | }
106 | }
107 |
108 | if !errors.Is(err, tc.err) {
109 | t.Fatalf("expected error: %v, got: %v", tc.err, err)
110 | }
111 | })
112 | }
113 | }
114 |
// fnReader adapts a plain function to the io.Reader interface, letting
// tests supply ad-hoc read behavior inline.
type fnReader func(b []byte) (n int, err error)

// Read delegates straight to the wrapped function.
func (f fnReader) Read(p []byte) (n int, err error) { return f(p) }
118 |
119 | func TestReset(t *testing.T) {
120 | r := fnReader(func(b []byte) (n int, err error) { return len(b), nil })
121 | tok := New(r)
122 | tok.Token() // Trigger make buffer init, cause grow buffer by alloc up to max limit: 1MB
123 |
124 | tok.Reset(r,
125 | WithReadBufferSize(1024),
126 | WithAutoGrowBufferMaxLimitSize(4),
127 | )
128 |
129 | if expected := 1024; len(tok.buf) != expected {
130 | t.Fatalf("expected len(t.buf): %d, got: %d", expected, len(tok.buf))
131 | }
132 | if expected := 1000 << 10; cap(tok.buf) != expected {
133 | t.Fatalf("expected cap(t.buf): %d, got: %d", expected, cap(tok.buf))
134 | }
135 |
136 | if tok.cur != 0 {
137 | t.Fatalf("expected cur: %d, got: cur: %d",
138 | 0, tok.cur)
139 | }
140 |
141 | newBufferSize := 2000 << 10
142 | tok.Reset(r,
143 | WithReadBufferSize(newBufferSize),
144 | WithAutoGrowBufferMaxLimitSize(4),
145 | )
146 |
147 | tok.Token() // Trigger manageBuffer
148 |
149 | if expected := newBufferSize; len(tok.buf) != expected {
150 | t.Fatalf("expected len(t.buf): %d, got: %d", expected, len(tok.buf))
151 | }
152 | if expected := newBufferSize + defaultReadBufferSize; cap(tok.buf) != expected {
153 | t.Fatalf("expected len(t.buf): %d, got: %d", expected, len(tok.buf))
154 | }
155 | }
156 |
--------------------------------------------------------------------------------
/docs/USAGE.md:
--------------------------------------------------------------------------------
1 | # Usage
2 |
The usage of this library is similar to the standard library's xml manual implementation of the `xml.Unmarshaler` interface, with slightly different code.
4 |
5 | Let's say we have this xml schema, a simplified version of `xlsx's sheet1.xml`.
6 |
7 | ```xml
8 |
9 |
10 |
11 | 0
12 |
13 |
14 | 4
15 |
16 |
17 |
18 | ```
19 |
20 | We can write the Go implementation like following:
21 |
22 | ```go
23 | package main
24 |
25 | import (
26 | "bytes"
27 | "fmt"
28 | "io"
29 | "strconv"
30 |
31 | "github.com/muktihari/xmltokenizer"
32 | )
33 |
34 | const sample = `
35 |
36 |
37 | 0
38 |
39 |
40 | 4
41 |
42 |
43 |
`
44 |
45 | func main() {
46 | f := bytes.NewReader([]byte(sample))
47 |
48 | tok := xmltokenizer.New(f)
49 | var row Row
50 | loop:
51 | for {
52 | token, err := tok.Token() // Token is only valid until next tok.Token() invocation (short-lived object).
53 | if err == io.EOF {
54 | break
55 | }
56 | if err != nil {
57 | panic(err)
58 | }
switch string(token.Name.Local) { // This does not allocate 🥳👍
60 | case "row":
61 | // Reuse Token object in the sync.Pool since we only use it temporarily.
62 | se := xmltokenizer.GetToken().Copy(token) // se: StartElement, we should copy it since token is a short-lived object.
63 | err = row.UnmarshalToken(tok, se)
64 | xmltokenizer.PutToken(se) // Put back to sync.Pool.
65 | if err != nil {
66 | panic(err)
67 | }
68 | break loop
69 | }
70 | }
71 | fmt.Printf("row: %+v\n", row)
72 | // Output:
73 | // row: {Index:1 Cells:[{Reference:A1 Value:0} {Reference:B1 Value:4} {Reference:C1 Value:}]}
74 | }
75 |
76 | type Row struct {
77 | Index int `xml:"r,attr,omitempty"`
78 | Cells []Cell `xml:"c"`
79 | }
80 |
81 | func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
82 | var err error
83 | for i := range se.Attrs {
84 | attr := &se.Attrs[i]
85 | switch string(attr.Name.Local) {
86 | case "r":
87 | r.Index, err = strconv.Atoi(string(attr.Value))
88 | if err != nil {
89 | return err
90 | }
91 | }
92 | }
93 |
94 | for {
95 | token, err := tok.Token()
96 | if err != nil {
97 | return err
98 | }
99 | if token.IsEndElementOf(se) { // Reach desired EndElement
100 | return nil
101 | }
102 | if token.IsEndElement { // Ignore child's EndElements
103 | continue
104 | }
105 | switch string(token.Name.Local) {
106 | case "c":
107 | var cell Cell
108 | // Reuse Token object in the sync.Pool since we only use it temporarily.
109 | se := xmltokenizer.GetToken().Copy(token)
110 | err = cell.UnmarshalToken(tok, se)
111 | xmltokenizer.PutToken(se) // Put back to sync.Pool.
112 | if err != nil {
113 | return err
114 | }
115 | r.Cells = append(r.Cells, cell)
116 | }
117 | }
118 | }
119 |
120 | type Cell struct {
121 | Reference string `xml:"r,attr"`
122 | Value string `xml:"v,omitempty"`
123 | }
124 |
125 | func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
126 | for i := range se.Attrs {
127 | attr := &se.Attrs[i]
128 | switch string(attr.Name.Local) {
129 | case "r":
130 | c.Reference = string(attr.Value)
131 | }
132 | }
133 |
// Must check since `c` may contain a self-closing tag:
135 | //
136 | if se.SelfClosing {
137 | return nil
138 | }
139 |
140 | for {
141 | token, err := tok.Token()
142 | if err != nil {
143 | return err
144 | }
145 | if token.IsEndElementOf(se) { // Reach desired EndElement
146 | return nil
147 | }
148 | if token.IsEndElement { // Ignore child's EndElements
149 | continue
150 | }
151 | switch string(token.Name.Local) {
152 | case "v":
153 | c.Value = string(token.Data)
154 | }
155 | }
156 | }
157 |
158 | ```
159 |
160 | You can find more examples in [internal](../internal/README.md) package.
161 |
--------------------------------------------------------------------------------
/testdata/long_comment_token.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/internal/gpx/schema/metadata.go:
--------------------------------------------------------------------------------
1 | package schema
2 |
3 | import (
4 | "encoding/xml"
5 | "fmt"
6 | "time"
7 |
8 | "github.com/muktihari/xmltokenizer"
9 | )
10 |
// Metadata is GPX's Metadata schema (simplified).
type Metadata struct {
	Name   string    `xml:"name,omitempty"`   // Content name.
	Desc   string    `xml:"desc,omitempty"`   // Free-form description.
	Author *Author   `xml:"author,omitempty"` // Optional author block.
	Link   *Link     `xml:"link,omitempty"`   // Optional related link.
	Time   time.Time `xml:"time,omitempty"`   // Parsed from RFC 3339 text.
}
19 |
// UnmarshalToken decodes metadata child elements from tok until the
// end element matching se is reached.
func (m *Metadata) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	for {
		token, err := tok.Token()
		if err != nil {
			return fmt.Errorf("metadata: %w", err)
		}

		if token.IsEndElementOf(se) {
			return nil // Reached the matching end element.
		}
		if token.IsEndElement {
			continue // Skip end elements of already-consumed children.
		}

		switch string(token.Name.Local) {
		case "name":
			m.Name = string(token.Data)
		case "desc":
			m.Desc = string(token.Data)
		case "author":
			m.Author = new(Author)
			// Copy the short-lived token via the pool before recursing.
			se := xmltokenizer.GetToken().Copy(token)
			err = m.Author.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("author: %w", err)
			}
		case "link":
			m.Link = new(Link)
			se := xmltokenizer.GetToken().Copy(token)
			err = m.Link.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("link: %w", err)
			}
		case "time":
			m.Time, err = time.Parse(time.RFC3339, string(token.Data))
			if err != nil {
				return fmt.Errorf("time: %w", err)
			}
		}
	}
}
63 |
// UnmarshalXML decodes the same metadata elements using the standard
// library's xml.Decoder — presumably kept as a comparison counterpart to
// UnmarshalToken (see benchmark usage in the repo); confirm if removing.
func (m *Metadata) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	for {
		token, err := dec.Token()
		if err != nil {
			return fmt.Errorf("metadata: %w", err)
		}

		switch elem := token.(type) {
		case xml.StartElement:
			// Nested structs recurse before the generic char-data read.
			switch elem.Name.Local {
			case "author":
				m.Author = new(Author)
				if err := m.Author.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("author: %w", err)
				}
				continue
			case "link":
				m.Link = new(Link)
				if err := m.Link.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("link: %w", err)
				}
				continue
			}
			charData, err := getCharData(dec)
			if err != nil {
				return err
			}
			switch elem.Name.Local {
			case "name":
				m.Name = string(charData)
			case "desc":
				m.Desc = string(charData)
			case "time":
				m.Time, err = time.Parse(time.RFC3339, string(charData))
				if err != nil {
					return fmt.Errorf("time: %w", err)
				}
			}
		case xml.EndElement:
			if elem == se.End() {
				return nil // Matching end element reached.
			}
		}
	}
}
109 |
// Author is Author schema (simplified).
type Author struct {
	Name string `xml:"name"` // Author display name.
	Link *Link  `xml:"link"` // Optional author link.
}
115 |
// UnmarshalToken decodes author child elements from tok until the end
// element matching se is reached.
func (a *Author) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	for {
		token, err := tok.Token()
		if err != nil {
			return fmt.Errorf("author: %w", err)
		}

		if token.IsEndElementOf(se) {
			return nil // Matching end element reached.
		}
		if token.IsEndElement {
			continue // Skip end elements of already-consumed children.
		}

		switch string(token.Name.Local) {
		case "name":
			a.Name = string(token.Data)
		case "link":
			a.Link = new(Link)
			// Copy the short-lived token via the pool before recursing.
			se := xmltokenizer.GetToken().Copy(token)
			err := a.Link.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("link: %w", err)
			}
		}
	}
}
144 |
// UnmarshalXML decodes author elements using the standard library's
// xml.Decoder counterpart of UnmarshalToken.
func (a *Author) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	for {
		token, err := dec.Token()
		if err != nil {
			return fmt.Errorf("author: %w", err)
		}

		switch elem := token.(type) {
		case xml.StartElement:
			switch elem.Name.Local {
			case "link":
				a.Link = new(Link)
				if err := a.Link.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("link: %w", err)
				}
			case "name":
				charData, err := getCharData(dec)
				if err != nil {
					return fmt.Errorf("name: %w", err)
				}
				a.Name = string(charData)
			}
		case xml.EndElement:
			if elem == se.End() {
				return nil // Matching end element reached.
			}
		}
	}
}
174 |
// Link is Link schema.
type Link struct {
	XMLName xml.Name `xml:"link"`
	Href    string   `xml:"href,attr"` // Target URL, taken from the href attribute.

	Text string `xml:"text,omitempty"` // Human-readable link text.
	Type string `xml:"type,omitempty"` // MIME type hint — TODO confirm against GPX spec.
}
183 |
// UnmarshalToken decodes the href attribute from se and the child
// elements from tok until the end element matching se is reached.
// NOTE(review): receiver name `a` looks copied from Author; consider `l`.
func (a *Link) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	// Attributes live on the start element itself.
	for i := range se.Attrs {
		attr := &se.Attrs[i]
		switch string(attr.Name.Local) {
		case "href":
			a.Href = string(attr.Value)
		}
	}

	for {
		token, err := tok.Token()
		if err != nil {
			return fmt.Errorf("link: %w", err)
		}

		if token.IsEndElementOf(se) {
			return nil // Matching end element reached.
		}
		if token.IsEndElement {
			continue // Skip end elements of already-consumed children.
		}

		switch string(token.Name.Local) {
		case "text":
			a.Text = string(token.Data)
		case "type":
			a.Type = string(token.Data)
		}
	}
}
214 |
// UnmarshalXML decodes link attributes and children using the standard
// library's xml.Decoder counterpart of UnmarshalToken.
func (a *Link) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	for i := range se.Attr {
		attr := &se.Attr[i]
		switch attr.Name.Local {
		case "href":
			a.Href = attr.Value
		}
	}

	for {
		token, err := dec.Token()
		if err != nil {
			return fmt.Errorf("link: %w", err)
		}

		switch elem := token.(type) {
		case xml.StartElement:
			// Every child carries character data only.
			charData, err := getCharData(dec)
			if err != nil {
				return fmt.Errorf("%s: %w", elem.Name.Local, err)
			}
			switch elem.Name.Local {
			case "text":
				a.Text = string(charData)
			case "type":
				a.Type = string(charData)
			}
		case xml.EndElement:
			if elem == se.End() {
				return nil // Matching end element reached.
			}
		}
	}
}
249 |
--------------------------------------------------------------------------------
/internal/gpx/schema/track.go:
--------------------------------------------------------------------------------
1 | package schema
2 |
3 | import (
4 | "encoding/xml"
5 | "fmt"
6 | "math"
7 | "strconv"
8 | "time"
9 |
10 | "github.com/muktihari/xmltokenizer"
11 | )
12 |
// Track is GPX's Track (trk) schema (simplified).
type Track struct {
	Name          string         `xml:"name,omitempty"`   // Track name.
	Type          string         `xml:"type,omitempty"`   // Activity type — TODO confirm semantics.
	TrackSegments []TrackSegment `xml:"trkseg,omitempty"` // Ordered track segments.
}
18 |
// UnmarshalToken decodes track child elements from tok until the end
// element matching se is reached.
func (t *Track) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	for {
		token, err := tok.Token()
		if err != nil {
			return fmt.Errorf("track: %w", err)
		}

		if token.IsEndElementOf(se) {
			return nil // Matching end element reached.
		}
		if token.IsEndElement {
			continue // Skip end elements of already-consumed children.
		}

		switch string(token.Name.Local) {
		case "name":
			t.Name = string(token.Data)
		case "type":
			t.Type = string(token.Data)
		case "trkseg":
			var trkseg TrackSegment
			// Copy the short-lived token via the pool before recursing.
			se := xmltokenizer.GetToken().Copy(token)
			err = trkseg.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("trkseg: %w", err)
			}
			t.TrackSegments = append(t.TrackSegments, trkseg)
		}
	}
}
50 |
// UnmarshalXML decodes track elements using the standard library's
// xml.Decoder counterpart of UnmarshalToken.
func (t *Track) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	for {
		token, err := dec.Token()
		if err != nil {
			return fmt.Errorf("track: %w", err)
		}

		switch elem := token.(type) {
		case xml.StartElement:
			// Nested segments recurse before the generic char-data read.
			switch elem.Name.Local {
			case "trkseg":
				var trkseg TrackSegment
				if err := trkseg.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("trkseg: %w", err)
				}
				t.TrackSegments = append(t.TrackSegments, trkseg)
				continue
			}
			charData, err := getCharData(dec)
			if err != nil {
				return fmt.Errorf("%s: %w", elem.Name.Local, err)
			}
			switch elem.Name.Local {
			case "name":
				t.Name = string(charData)
			case "type":
				t.Type = string(charData)
			}
		case xml.EndElement:
			if elem == se.End() {
				return nil // Matching end element reached.
			}
		}
	}
}
86 |
// TrackSegment is GPX's trkseg schema: an ordered run of track points.
type TrackSegment struct {
	Trackpoints []Waypoint `xml:"trkpt,omitempty"`
}
90 |
// UnmarshalToken decodes trkpt children from tok until the end element
// matching se is reached.
func (t *TrackSegment) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	for {
		token, err := tok.Token()
		if err != nil {
			// NOTE(review): unlike siblings, this error is not wrapped
			// with a "trkseg:" context — confirm whether intentional.
			return err
		}

		if token.IsEndElementOf(se) {
			return nil // Matching end element reached.
		}
		if token.IsEndElement {
			continue // Skip end elements of already-consumed children.
		}

		switch string(token.Name.Local) {
		case "trkpt":
			var trkpt Waypoint
			// Copy the short-lived token via the pool before recursing.
			se := xmltokenizer.GetToken().Copy(token)
			err = trkpt.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("trkpt: %w", err)
			}
			t.Trackpoints = append(t.Trackpoints, trkpt)
		}
	}
}
118 |
// UnmarshalXML decodes trkpt children using the standard library's
// xml.Decoder counterpart of UnmarshalToken.
func (t *TrackSegment) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	for {
		token, err := dec.Token()
		if err != nil {
			return err
		}

		switch elem := token.(type) {
		case xml.StartElement:
			switch elem.Name.Local {
			case "trkpt":
				var trkpt Waypoint
				if err := trkpt.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("trkpt: %w", err)
				}
				t.Trackpoints = append(t.Trackpoints, trkpt)
			}
		case xml.EndElement:
			if elem == se.End() {
				return nil // Matching end element reached.
			}
		}
	}
}
143 |
// Waypoint is GPX's wpt/trkpt schema (simplified). Lat/Lon come from
// attributes; the remaining fields come from child elements.
type Waypoint struct {
	Lat                 float64             `xml:"lat,attr,omitempty"` // Latitude in decimal degrees — TODO confirm units.
	Lon                 float64             `xml:"lon,attr,omitempty"` // Longitude in decimal degrees — TODO confirm units.
	Ele                 float64             `xml:"ele,omitempty"`      // Elevation.
	Time                time.Time           `xml:"time,omitempty"`     // Parsed from RFC 3339 text.
	TrackpointExtension TrackpointExtension `xml:"extensions>TrackPointExtension,omitempty"`
}
151 |
// reset re-initializes the waypoint before decoding. Float fields are set
// to NaN — presumably so absent elements stay distinguishable from a
// legitimate zero value; confirm with downstream consumers.
func (w *Waypoint) reset() {
	w.Lat = math.NaN()
	w.Lon = math.NaN()
	w.Ele = math.NaN()
	w.Time = time.Time{}
	w.TrackpointExtension.reset()
}
159 |
// UnmarshalToken decodes lat/lon attributes from se and the child
// elements from tok until the end element matching se is reached.
func (w *Waypoint) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	w.reset() // Start from NaN/zero sentinels.

	var err error
	for i := range se.Attrs {
		attr := &se.Attrs[i]
		switch string(attr.Name.Local) {
		case "lat":
			w.Lat, err = strconv.ParseFloat(string(attr.Value), 64)
			if err != nil {
				return fmt.Errorf("lat: %w", err)
			}
		case "lon":
			w.Lon, err = strconv.ParseFloat(string(attr.Value), 64)
			if err != nil {
				return fmt.Errorf("lon: %w", err)
			}
		}
	}

	for {
		token, err := tok.Token()
		if err != nil {
			return fmt.Errorf("waypoint: %w", err)
		}

		if token.IsEndElementOf(se) {
			return nil // Matching end element reached.
		}
		if token.IsEndElement {
			continue // Skip end elements of already-consumed children.
		}

		switch string(token.Name.Local) {
		case "ele":
			w.Ele, err = strconv.ParseFloat(string(token.Data), 64)
			if err != nil {
				return fmt.Errorf("ele: %w", err)
			}
		case "time":
			w.Time, err = time.Parse(time.RFC3339, string(token.Data))
			if err != nil {
				return fmt.Errorf("time: %w", err)
			}
		case "extensions":
			// Copy the short-lived token via the pool before recursing.
			se := xmltokenizer.GetToken().Copy(token)
			err = w.TrackpointExtension.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("extensions: %w", err)
			}
		}
	}
}
214 |
// UnmarshalXML decodes waypoint attributes and children using the
// standard library's xml.Decoder counterpart of UnmarshalToken.
func (w *Waypoint) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	w.reset() // Start from NaN/zero sentinels.

	var err error
	for i := range se.Attr {
		attr := &se.Attr[i]
		switch attr.Name.Local {
		case "lat":
			w.Lat, err = strconv.ParseFloat(attr.Value, 64)
			if err != nil {
				return fmt.Errorf("lat: %w", err)
			}
		case "lon":
			w.Lon, err = strconv.ParseFloat(attr.Value, 64)
			if err != nil {
				return fmt.Errorf("lon: %w", err)
			}
		}
	}

	for {
		token, err := dec.Token()
		if err != nil {
			return fmt.Errorf("waypoint: %w", err)
		}

		switch elem := token.(type) {
		case xml.StartElement:
			// Extensions recurse before the generic char-data read.
			switch elem.Name.Local {
			case "extensions":
				if err := w.TrackpointExtension.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("extensions: %w", err)
				}
				continue
			}
			charData, err := getCharData(dec)
			if err != nil {
				return fmt.Errorf("%s: %w", elem.Name.Local, err)
			}
			switch elem.Name.Local {
			case "ele":
				w.Ele, err = strconv.ParseFloat(string(charData), 64)
				if err != nil {
					return fmt.Errorf("ele: %w", err)
				}
			case "time":
				w.Time, err = time.Parse(time.RFC3339, string(charData))
				if err != nil {
					return fmt.Errorf("time: %w", err)
				}
			}
		case xml.EndElement:
			if elem == se.End() {
				return nil // Matching end element reached.
			}
		}
	}
}
273 |
--------------------------------------------------------------------------------
/tokenizer.go:
--------------------------------------------------------------------------------
1 | package xmltokenizer
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "io"
7 | )
8 |
// errorString is a string-based error type that allows sentinel errors
// to be declared as untyped constants.
type errorString string

// Error implements the error interface.
func (e errorString) Error() string { return string(e) }
12 |
// Sentinel errors.
const (
	errAutoGrowBufferExceedMaxLimit = errorString("auto grow buffer exceed max limit")
)

// Default sizing parameters for the tokenizer.
const (
	defaultReadBufferSize      = 4 << 10    // 4 KB read buffer.
	autoGrowBufferMaxLimitSize = 1000 << 10 // ~1 MB auto-grow ceiling.
	defaultAttrsBufferSize     = 16         // Initial capacity of the shared Attrs buffer.
)
22 |
23 | // Tokenizer is a XML tokenizer.
24 | type Tokenizer struct {
25 | r io.Reader // reader provided by the client
26 | n int64 // the n read bytes counter
27 | options options // tokenizer's options
28 | buf []byte // buffer that will grow as needed, large enough to hold a token (default max limit: 1MB)
29 | cur int // cursor byte position
30 | err error // last encountered error
31 | token Token // shared token
32 | }
33 |
// options holds the tunable parameters applied via Option functions.
type options struct {
	readBufferSize             int // Bytes requested per fill from the io.Reader.
	autoGrowBufferMaxLimitSize int // Ceiling for automatic buffer growth.
	attrsBufferSize            int // Initial capacity for the shared Attrs slice.
}
39 |
40 | func defaultOptions() options {
41 | return options{
42 | readBufferSize: defaultReadBufferSize,
43 | autoGrowBufferMaxLimitSize: autoGrowBufferMaxLimitSize,
44 | attrsBufferSize: defaultAttrsBufferSize,
45 | }
46 | }
47 |
48 | // Option is Tokenizer option.
49 | type Option func(o *options)
50 |
// WithReadBufferSize directs XML Tokenizer to this buffer size
// to read from the io.Reader. Default: 4096.
// Non-positive sizes fall back to the default.
func WithReadBufferSize(size int) Option {
	if size <= 0 {
		size = defaultReadBufferSize
	}
	return func(o *options) { o.readBufferSize = size }
}
59 |
// WithAutoGrowBufferMaxLimitSize directs XML Tokenizer to limit
// auto grow buffer to not grow exceed this limit. Default: 1 MB.
// Non-positive sizes fall back to the default.
func WithAutoGrowBufferMaxLimitSize(size int) Option {
	if size <= 0 {
		size = autoGrowBufferMaxLimitSize
	}
	return func(o *options) { o.autoGrowBufferMaxLimitSize = size }
}
68 |
// WithAttrBufferSize directs XML Tokenizer to use this Attrs
// buffer capacity as its initial size. Default: 16
// (defaultAttrsBufferSize; the comment previously said 8, which did not
// match the constant). Non-positive sizes fall back to the default.
func WithAttrBufferSize(size int) Option {
	if size <= 0 {
		size = defaultAttrsBufferSize
	}
	return func(o *options) { o.attrsBufferSize = size }
}
77 |
78 | // New creates new XML tokenizer.
79 | func New(r io.Reader, opts ...Option) *Tokenizer {
80 | t := new(Tokenizer)
81 | t.Reset(r, opts...)
82 | return t
83 | }
84 |
// Reset resets the Tokenizer, maintaining storage for
// future tokenization to reduce memory alloc.
func (t *Tokenizer) Reset(r io.Reader, opts ...Option) {
	t.r, t.err = r, nil
	t.n, t.cur = 0, 0

	// Options are re-derived from defaults on every Reset.
	t.options = defaultOptions()
	for i := range opts {
		opts[i](&t.options)
	}

	if cap(t.token.Attrs) < t.options.attrsBufferSize {
		t.token.Attrs = make([]Attr, 0, t.options.attrsBufferSize)
	}
	// The grow limit can never be smaller than the read buffer itself.
	if t.options.readBufferSize > t.options.autoGrowBufferMaxLimitSize {
		t.options.autoGrowBufferMaxLimitSize = t.options.readBufferSize
	}

	switch size := t.options.readBufferSize; {
	case cap(t.buf) >= size+defaultReadBufferSize:
		// Reuse existing storage when it is already large enough.
		t.buf = t.buf[:size:cap(t.buf)]
	default:
		// Create buffer with additional cap since we need to memmove remaining bytes
		t.buf = make([]byte, size, size+defaultReadBufferSize)
	}
}
111 |
// Token returns either a valid token or an error.
// The returned token is only valid before next
// Token or RawToken method invocation.
func (t *Tokenizer) Token() (token Token, err error) {
	if t.err != nil {
		return token, t.err
	}

	b, err := t.RawToken()
	if err != nil {
		if !errors.Is(err, io.EOF) {
			// Annotate non-EOF errors with the absolute byte position.
			err = fmt.Errorf("byte pos %d: %w", t.n, err)
		}
		if len(b) == 0 || errors.Is(err, io.ErrUnexpectedEOF) {
			return
		}
		// Raw bytes for one final token remain: stash the error for the
		// next call and parse what we have now.
		t.err = err
	}

	t.clearToken()

	b = t.consumeNonTagIdentifier(b)
	if len(b) > 0 {
		b = t.consumeTagName(b)
		b = t.consumeAttrs(b)
		t.consumeCharData(b)
	}

	token = t.token
	// Normalize empty slices to nil so callers can compare against nil.
	if len(token.Attrs) == 0 {
		token.Attrs = nil
	}
	if len(token.Data) == 0 {
		token.Data = nil
	}

	return token, nil
}
150 |
// RawToken returns the token in its raw bytes. At the end,
// it may return the last token bytes and an error.
153 | // The returned token bytes is only valid before next
154 | // Token or RawToken method invocation.
155 | func (t *Tokenizer) RawToken() (b []byte, err error) {
156 | if t.err != nil {
157 | return nil, t.err
158 | }
159 |
160 | var pivot, pos = t.cur, t.cur
161 | var openclose int // zero means open '<' and close '>' is matched.
162 | for {
163 | if pos >= len(t.buf) {
164 | pivot, pos = t.memmoveRemainingBytes(pivot)
165 | if err = t.manageBuffer(); err != nil {
166 | if openclose != 0 && errors.Is(err, io.EOF) {
167 | err = io.ErrUnexpectedEOF
168 | }
169 | t.err = err
170 | return t.buf[pivot:pos], err
171 | }
172 | }
173 | switch t.buf[pos] {
174 | case '<':
175 | if openclose == 0 {
176 | pivot = pos
177 | }
178 | openclose++
179 | case '>':
180 | if openclose--; openclose != 0 {
181 | break
182 | }
183 |
184 | switch t.buf[pivot+1] {
185 | case '?', '!': // Maybe a ProcInst ", this method will include it in the previous token.
204 | // It returns the new pivot and new position.
205 | func (t *Tokenizer) parseCharData(pivot, pos int) (newPivot, newPos int) {
206 | for i := pos + 1; ; i++ {
207 | if i >= len(t.buf) {
208 | pivot, i = t.memmoveRemainingBytes(pivot)
209 | pos = i - 1
210 | if t.err = t.manageBuffer(); t.err != nil {
211 | break
212 | }
213 | }
214 | if t.buf[i] != '<' {
215 | continue
216 | }
217 |
218 | pos = i - 1
219 | // Might be in the form of
220 | const prefix, suffix = ""
221 | var k int = 1
222 | for j := i + 1; ; j++ {
223 | if j >= len(t.buf) {
224 | prevLast := len(t.buf)
225 | pivot, j = t.memmoveRemainingBytes(pivot)
226 | pos = pos - (prevLast - len(t.buf))
227 | if t.err = t.manageBuffer(); t.err != nil {
228 | if errors.Is(t.err, io.EOF) {
229 | t.err = io.ErrUnexpectedEOF
230 | }
231 | break
232 | }
233 | }
234 | if k < len(prefix) {
235 | if t.buf[j] != prefix[k] {
236 | break
237 | }
238 | k++
239 | continue
240 | }
241 | if t.buf[j] == '>' && string(t.buf[j-2:j+1]) == suffix {
242 | pos = j
243 | break
244 | }
245 | }
246 | break
247 | }
248 | return pivot, pos
249 | }
250 |
// memmoveRemainingBytes moves the not-yet-consumed bytes starting at
// pivot to the front of t.buf so new reads can be appended after them.
// It returns the updated cursor position and the new buffer length.
func (t *Tokenizer) memmoveRemainingBytes(pivot int) (cur, last int) {
	if pivot == 0 {
		// Bytes are already at the front; nothing to move.
		return t.cur, len(t.buf)
	}
	n := copy(t.buf, t.buf[pivot:])
	t.buf = t.buf[:n:cap(t.buf)]
	t.cur = 0
	return t.cur, len(t.buf)
}
260 |
// manageBuffer grows t.buf by readBufferSize — by reslicing when spare
// capacity exists, otherwise by allocating anew (bounded by
// autoGrowBufferMaxLimitSize) — and then fills the new space with at
// least one byte from the underlying reader.
func (t *Tokenizer) manageBuffer() error {
	growSize := len(t.buf) + t.options.readBufferSize
	start, end := len(t.buf), growSize
	switch {
	case growSize <= cap(t.buf): // Grow by reslice
		t.buf = t.buf[:growSize:cap(t.buf)]
	default: // Grow by make new alloc
		if growSize > t.options.autoGrowBufferMaxLimitSize {
			return fmt.Errorf("could not grow buffer to %d, max limit is set to %d: %w",
				growSize, t.options.autoGrowBufferMaxLimitSize, errAutoGrowBufferExceedMaxLimit)
		}
		buf := make([]byte, growSize)
		n := copy(buf, t.buf)
		t.buf = buf
		// After a fresh alloc, fill all the way to the new capacity.
		start, end = n, cap(t.buf)
	}

	// Read at least one byte so the caller always makes progress; the
	// buffer is then trimmed to the bytes actually read.
	n, err := io.ReadAtLeast(t.r, t.buf[start:end], 1)
	t.buf = t.buf[: start+n : cap(t.buf)]
	t.n += int64(n)

	return err
}
284 |
285 | func (t *Tokenizer) clearToken() {
286 | t.token.Name.Prefix = nil
287 | t.token.Name.Local = nil
288 | t.token.Name.Full = nil
289 | t.token.Attrs = t.token.Attrs[:0]
290 | t.token.Data = nil
291 | t.token.SelfClosing = false
292 | t.token.IsEndElement = false
293 | }
294 |
295 | // consumeNonTagIdentifier consumes identifier starts with "" or "', ' ', '\t', '\r', '\n': // e.g. ,
320 | if b[i] == '>' && b[i-1] == '/' { // In case we encounter
321 | i--
322 | }
323 | t.token.Name.Local = trim(b[pos:i])
324 | t.token.Name.Full = trim(b[fullpos:i])
325 | return b[i:]
326 | }
327 | }
328 | return b
329 | }
330 |
// consumeAttrs parses the attribute list of a start tag from b, appending
// each well-formed attribute to t.token.Attrs, and returns the bytes that
// follow the tag's closing '>'. It returns nil when a quoted value is not
// terminated within b, and b unchanged when no '>' is found.
//
// It also sets t.token.SelfClosing when a '/' appears outside a quoted value.
func (t *Tokenizer) consumeAttrs(b []byte) []byte {
	var prefix, local, full []byte
	var pos, fullpos int
	for i := 0; i < len(b); i++ {
		switch b[i] {
		case ':':
			// Namespace separator: everything since pos is the prefix.
			prefix = trim(b[pos:i])
			pos = i + 1
		case '=':
			// End of the attribute name; full spans prefix+local.
			local = trim(b[pos:i])
			full = trim(b[fullpos:i])
			pos = i + 1
		case '"':
			// Skip the quoted value: scan forward to the closing quote.
			for {
				i++
				if i+1 == len(b) {
					// Value not terminated within b: signal caller for more data.
					return nil
				}
				if b[i] == '"' {
					break
				}
			}
			if len(full) == 0 { // Ignore malformed attr
				continue
			}
			// NOTE(review): pos+1 assumes the opening quote immediately
			// follows '='; whitespace between '=' and '"' would leak the
			// quote into Value — confirm inputs never contain that form.
			t.token.Attrs = append(t.token.Attrs, Attr{
				Name: Name{Prefix: prefix, Local: local, Full: full},
				Value: trim(b[pos+1 : i]),
			})
			prefix, local, full = nil, nil, nil
			pos = i + 1
			fullpos = i + 1
		case '/':
			// Only reachable outside quotes: marks a self-closing tag.
			t.token.SelfClosing = true
		case '>':
			return b[i+1:]
		}
	}
	return b
}
371 |
372 | func (t *Tokenizer) consumeCharData(b []byte) {
373 | const prefix, suffix = ""
374 | b = trimPrefix(b)
375 | if len(b) >= len(prefix) && string(b[:len(prefix)]) == prefix {
376 | b = b[len(prefix):]
377 | }
378 | if end := len(b) - len(suffix); end >= 0 && string(b[end:]) == suffix {
379 | b = b[:end]
380 | }
381 | t.token.Data = trim(b)
382 | }
383 |
384 | func trim(b []byte) []byte {
385 | b = trimPrefix(b)
386 | b = trimSuffix(b)
387 | return b
388 | }
389 |
// trimPrefix strips leading whitespace from b: spaces, tabs, bare LFs, and
// CRLF pairs. A lone '\r' (not followed by '\n') is scanned past without
// being counted toward the cut point, matching the previous behavior.
func trimPrefix(b []byte) []byte {
	cut := 0
	for i := 0; i < len(b); i++ {
		c := b[i]
		if c == '\r' {
			// Only a full CRLF pair advances the cut point.
			if i+1 < len(b) && b[i+1] == '\n' {
				cut += 2
				i++
			}
			continue
		}
		if c == '\n' || c == ' ' || c == '\t' {
			cut++
			continue
		}
		break
	}
	return b[cut:]
}
407 |
// trimSuffix strips trailing whitespace from b: spaces, tabs, and line
// breaks ('\n' alone or as part of a CRLF pair).
//
// Fixes two defects in the previous version:
//   - the guard `i-1 > 0` missed a CRLF pair at the very start of b,
//     so "\r\n" trimmed to "\r" instead of "";
//   - after counting a CRLF pair the loop did not step over the '\r',
//     so trimming stopped early on inputs like "a\n\r\n" (trimPrefix
//     performs the symmetric skip with i++).
func trimSuffix(b []byte) []byte {
	end := len(b)
	for i := len(b) - 1; i >= 0; i-- {
		switch b[i] {
		case '\n':
			end--
			if i > 0 && b[i-1] == '\r' { // swallow the '\r' of a CRLF pair
				end--
				i--
			}
		case ' ', '\t':
			end--
		default:
			return b[:end]
		}
	}
	return b[:end]
}
425 |
--------------------------------------------------------------------------------
/tokenizer_test.go:
--------------------------------------------------------------------------------
1 | package xmltokenizer_test
2 |
3 | import (
4 | "bytes"
5 | "errors"
6 | "fmt"
7 | "io"
8 | "io/fs"
9 | "math"
10 | "os"
11 | "path/filepath"
12 | "strings"
13 | "testing"
14 |
15 | "github.com/google/go-cmp/cmp"
16 | "github.com/muktihari/xmltokenizer"
17 | "github.com/muktihari/xmltokenizer/internal/gpx"
18 | "github.com/muktihari/xmltokenizer/internal/xlsx"
19 | "github.com/muktihari/xmltokenizer/internal/xlsx/schema"
20 | )
21 |
// tokenHeader is the expected token for the XML declaration header shared
// by the testdata files. (NOTE(review): the declaration literal appears
// stripped in this rendering — verify against the repository file.)
var tokenHeader = xmltokenizer.Token{Data: []byte(``), SelfClosing: true}
23 |
24 | func TestTokenWithInmemXML(t *testing.T) {
25 | tt := []struct {
26 | name string
27 | xml string
28 | expecteds []xmltokenizer.Token
29 | err error
30 | }{
31 | {
32 | name: "dtd without entity",
33 | xml: `
34 |
35 |
37 |
39 | World <>'" 白鵬翔
40 | &何; &is-it;
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 | `, // Note: retrieved from stdlib xml test.
49 | expecteds: []xmltokenizer.Token{
50 | {
51 | Data: []byte(``),
52 | SelfClosing: true,
53 | },
54 | {
55 | Data: []byte(""),
57 | SelfClosing: true,
58 | },
59 | {
60 | Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")},
61 | Attrs: []xmltokenizer.Attr{
62 | {Name: xmltokenizer.Name{Prefix: []byte("xmlns"), Local: []byte("foo"), Full: []byte("xmlns:foo")}, Value: []byte("ns1")},
63 | {Name: xmltokenizer.Name{Local: []byte("xmlns"), Full: []byte("xmlns")}, Value: []byte("ns2")},
64 | {Name: xmltokenizer.Name{Prefix: []byte("xmlns"), Local: []byte("tag"), Full: []byte("xmlns:tag")}, Value: []byte("ns3")},
65 | },
66 | },
67 | {
68 | Name: xmltokenizer.Name{Local: []byte("hello"), Full: []byte("hello")},
69 | Attrs: []xmltokenizer.Attr{
70 | {Name: xmltokenizer.Name{Local: []byte("lang"), Full: []byte("lang")}, Value: []byte("en")},
71 | },
72 | Data: []byte("World <>'" 白鵬翔"),
73 | },
74 | {
75 | Name: xmltokenizer.Name{Local: []byte("hello"), Full: []byte("hello")},
76 | IsEndElement: true,
77 | },
78 | {
79 | Name: xmltokenizer.Name{Local: []byte("query"), Full: []byte("query")},
80 | Data: []byte("&何; &is-it;"),
81 | },
82 | {
83 | Name: xmltokenizer.Name{Local: []byte("query"), Full: []byte("query")},
84 | IsEndElement: true,
85 | },
86 | {
87 | Name: xmltokenizer.Name{Local: []byte("goodbye"), Full: []byte("goodbye")},
88 | SelfClosing: true,
89 | },
90 | {
91 | Name: xmltokenizer.Name{Local: []byte("outer"), Full: []byte("outer")},
92 | Attrs: []xmltokenizer.Attr{
93 | {Name: xmltokenizer.Name{Prefix: []byte("foo"), Local: []byte("attr"), Full: []byte("foo:attr")}, Value: []byte("value")},
94 | {Name: xmltokenizer.Name{Prefix: []byte("xmlns"), Local: []byte("tag"), Full: []byte("xmlns:tag")}, Value: []byte("ns4")},
95 | },
96 | },
97 | {
98 | Name: xmltokenizer.Name{Local: []byte("inner"), Full: []byte("inner")},
99 | SelfClosing: true,
100 | },
101 | {
102 | Name: xmltokenizer.Name{Local: []byte("outer"), Full: []byte("outer")},
103 | IsEndElement: true,
104 | },
105 | {
106 | Name: xmltokenizer.Name{Prefix: []byte("tag"), Local: []byte("name"), Full: []byte("tag:name")},
107 | Data: []byte("Some text here."),
108 | },
109 | {
110 | Name: xmltokenizer.Name{Prefix: []byte("tag"), Local: []byte("name"), Full: []byte("tag:name")},
111 | IsEndElement: true,
112 | },
113 | {
114 | Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")},
115 | IsEndElement: true,
116 | },
117 | {
118 | Data: []byte(""),
119 | SelfClosing: true,
120 | },
121 | },
122 | },
123 | {
124 | name: "unexpected EOF truncated XML after ``),
129 | SelfClosing: true,
130 | },
131 | },
132 | err: io.ErrUnexpectedEOF,
133 | },
134 | {
135 | name: "unexpected quote before attr name",
136 | xml: "",
137 | expecteds: []xmltokenizer.Token{
138 | {
139 | Data: []byte(``),
140 | SelfClosing: true,
141 | },
142 | {Name: xmltokenizer.Name{Local: []byte("a"), Full: []byte("a")}},
143 | {Name: xmltokenizer.Name{Local: []byte("a"), Full: []byte("a")}, IsEndElement: true},
144 | },
145 | },
146 | {
147 | name: "unexpected equals in attr name",
148 | xml: "",
149 | expecteds: []xmltokenizer.Token{
150 | {
151 | Data: []byte(``),
152 | SelfClosing: true,
153 | IsEndElement: false,
154 | },
155 | {Name: xmltokenizer.Name{Local: []byte("Image"), Full: []byte("Image")},
156 | Attrs: []xmltokenizer.Attr{
157 | {
158 | Name: xmltokenizer.Name{Local: []uint8("URL"), Full: []uint8("URL")},
159 | Value: []uint8("https://test.com/my-url-ending-in-="),
160 | },
161 | {
162 | Name: xmltokenizer.Name{Local: []uint8("URL2"), Full: []uint8("URL2")},
163 | Value: []uint8("https://ok.com"),
164 | },
165 | },
166 | SelfClosing: true,
167 | },
168 | },
169 | },
170 | {
171 | name: "tab after node name",
172 | xml: ``,
173 | expecteds: []xmltokenizer.Token{
174 | {
175 | Name: xmltokenizer.Name{
176 | Local: []uint8("sample"),
177 | Full: []uint8("sample"),
178 | },
179 | Attrs: []xmltokenizer.Attr{
180 | {
181 | Name: xmltokenizer.Name{
182 | Local: []uint8("foo"),
183 | Full: []uint8("foo")},
184 | Value: []uint8("bar"),
185 | },
186 | },
187 | SelfClosing: true,
188 | },
189 | },
190 | },
191 | {
192 | name: "tab after attribute value",
193 | xml: ``,
194 | expecteds: []xmltokenizer.Token{
195 | {
196 | Name: xmltokenizer.Name{
197 | Local: []uint8("sample"),
198 | Full: []uint8("sample"),
199 | },
200 | Attrs: []xmltokenizer.Attr{
201 | {
202 | Name: xmltokenizer.Name{
203 | Local: []uint8("foo"),
204 | Full: []uint8("foo")},
205 | Value: []uint8("bar"),
206 | },
207 | },
208 | SelfClosing: true,
209 | },
210 | },
211 | },
212 | {
213 | name: "tab between attributes",
214 | xml: ``,
215 | expecteds: []xmltokenizer.Token{
216 | {
217 | Name: xmltokenizer.Name{
218 | Local: []uint8("sample"),
219 | Full: []uint8("sample"),
220 | },
221 | Attrs: []xmltokenizer.Attr{
222 | {
223 | Name: xmltokenizer.Name{
224 | Local: []uint8("foo"),
225 | Full: []uint8("foo")},
226 | Value: []uint8("bar"),
227 | },
228 | {
229 | Name: xmltokenizer.Name{
230 | Local: []uint8("baz"),
231 | Full: []uint8("baz")},
232 | Value: []uint8("quux"),
233 | },
234 | },
235 | SelfClosing: true,
236 | },
237 | },
238 | },
239 | {
240 | name: "slash inside attribute value",
241 | xml: ``,
242 | expecteds: []xmltokenizer.Token{
243 | {
244 | Name: xmltokenizer.Name{Local: []byte("sample"), Full: []byte("sample")},
245 | Attrs: []xmltokenizer.Attr{
246 | {
247 | Name: xmltokenizer.Name{Local: []uint8("path"), Full: []uint8("path")},
248 | Value: []uint8("foo/bar/baz"),
249 | },
250 | },
251 | },
252 | },
253 | },
254 | }
255 |
256 | for i, tc := range tt {
257 | t.Run(fmt.Sprintf("[%d]: %s", i, tc.name), func(t *testing.T) {
258 | tok := xmltokenizer.New(
259 | bytes.NewReader([]byte(tc.xml)),
260 | xmltokenizer.WithReadBufferSize(1), // Read per char so we can cover more code paths
261 | )
262 |
263 | for i := 0; ; i++ {
264 | token, err := tok.Token()
265 | if err == io.EOF {
266 | if i != len(tc.expecteds) {
267 | t.Fatalf("expected %d tokens, got %d", len(tc.expecteds), i)
268 | }
269 | break
270 | }
271 | if err != nil {
272 | if !errors.Is(err, tc.err) {
273 | t.Fatalf("expected error: %v, got: %v", tc.err, err)
274 | }
275 | return
276 | }
277 | if diff := cmp.Diff(token, tc.expecteds[i]); diff != "" {
278 | t.Fatalf("%d: %s", i, diff)
279 | }
280 | }
281 | })
282 | }
283 | }
284 |
285 | func TestTokenWithSmallXMLFiles(t *testing.T) {
286 | tt := []struct {
287 | filename string
288 | expecteds []xmltokenizer.Token
289 | err error
290 | }{
291 | {filename: "cdata.xml", expecteds: []xmltokenizer.Token{
292 | tokenHeader,
293 | {Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}},
294 | {
295 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
296 | Data: []byte("text"),
297 | },
298 | {
299 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
300 | IsEndElement: true,
301 | },
302 | {
303 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
304 | Data: []byte("text"),
305 | },
306 | {
307 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
308 | IsEndElement: true,
309 | },
310 | {
311 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
312 | Data: []byte("text"),
313 | },
314 | {
315 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
316 | IsEndElement: true,
317 | },
318 | {
319 | Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")},
320 | IsEndElement: true,
321 | },
322 | }},
323 | {filename: "cdata_clrf.xml", expecteds: []xmltokenizer.Token{
324 | tokenHeader,
325 | {Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}},
326 | {
327 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
328 | Data: []byte("text"),
329 | },
330 | {
331 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
332 | IsEndElement: true,
333 | },
334 | {
335 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
336 | Data: []byte("text"),
337 | },
338 | {
339 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
340 | IsEndElement: true,
341 | },
342 | {
343 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
344 | Data: []byte("text"),
345 | },
346 | {
347 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
348 | IsEndElement: true,
349 | },
350 | {
351 | Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")},
352 | IsEndElement: true,
353 | },
354 | }},
355 | {filename: filepath.Join("corrupted", "cdata_truncated.xml"), expecteds: []xmltokenizer.Token{
356 | tokenHeader,
357 | {Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}},
358 | {
359 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")},
360 | },
361 | },
362 | err: io.ErrUnexpectedEOF,
363 | },
364 | {filename: "self_closing.xml", expecteds: []xmltokenizer.Token{
365 | tokenHeader,
366 | {Name: xmltokenizer.Name{Local: []byte("a"), Full: []byte("a")}, SelfClosing: true},
367 | {Name: xmltokenizer.Name{Local: []byte("b"), Full: []byte("b")}, SelfClosing: true},
368 | }},
369 | {filename: "copyright_header.xml", expecteds: []xmltokenizer.Token{
370 | {Data: []byte(""), SelfClosing: true},
371 | tokenHeader,
372 | }},
373 | {filename: "dtd.xml", expecteds: []xmltokenizer.Token{
374 | tokenHeader,
375 | {
376 | Data: []byte("\n" +
378 | " \n" +
379 | " \n" +
380 | "]>"),
381 | SelfClosing: true,
382 | },
383 | {Name: xmltokenizer.Name{Local: []byte("note"), Full: []byte("note")}},
384 | {Name: xmltokenizer.Name{Local: []byte("to"), Full: []byte("to")}, Data: []byte("Tove")},
385 | {Name: xmltokenizer.Name{Local: []byte("to"), Full: []byte("to")}, IsEndElement: true},
386 | {Name: xmltokenizer.Name{Local: []byte("from"), Full: []byte("from")}, Data: []byte("Jani")},
387 | {Name: xmltokenizer.Name{Local: []byte("from"), Full: []byte("from")}, IsEndElement: true},
388 | {Name: xmltokenizer.Name{Local: []byte("heading"), Full: []byte("heading")}, Data: []byte("Reminder")},
389 | {Name: xmltokenizer.Name{Local: []byte("heading"), Full: []byte("heading")}, IsEndElement: true},
390 | {Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, Data: []byte("Don't forget me this weekend!")},
391 | {Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, IsEndElement: true},
392 | {Name: xmltokenizer.Name{Local: []byte("footer"), Full: []byte("footer")}, Data: []byte("&writer; ©right;")},
393 | {Name: xmltokenizer.Name{Local: []byte("footer"), Full: []byte("footer")}, IsEndElement: true},
394 | {Name: xmltokenizer.Name{Local: []byte("note"), Full: []byte("note")}, IsEndElement: true},
395 | }},
396 | }
397 |
398 | for i, tc := range tt {
399 | t.Run(fmt.Sprintf("[%d], %s", i, tc.filename), func(t *testing.T) {
400 | path := filepath.Join("testdata", tc.filename)
401 | f, err := os.Open(path)
402 | if err != nil {
403 | panic(err)
404 | }
405 | defer f.Close()
406 |
407 | tok := xmltokenizer.New(f, xmltokenizer.WithReadBufferSize(1))
408 | for i := 0; ; i++ {
409 | token, err := tok.Token()
410 | if err == io.EOF {
411 | break
412 | }
413 | if err != nil {
414 | if !errors.Is(err, tc.err) {
415 | t.Fatalf("expected error: %v, got: %v", tc.err, err)
416 | }
417 | return
418 | }
419 |
420 | if diff := cmp.Diff(token, tc.expecteds[i]); diff != "" {
421 | t.Fatal(diff)
422 | }
423 | }
424 | })
425 | }
426 | }
427 |
428 | func TestTokenOnGPXFiles(t *testing.T) {
429 | filepath.Walk("testdata", func(path string, info fs.FileInfo, _ error) error {
430 | t.Run(path, func(t *testing.T) {
431 | if info.IsDir() {
432 | return
433 | }
434 | if strings.ToLower(filepath.Ext(path)) != ".gpx" {
435 | return
436 | }
437 |
438 | data, err := os.ReadFile(path)
439 | if err != nil {
440 | t.Skip(err)
441 | }
442 |
443 | gpx1, err := gpx.UnmarshalWithXMLTokenizer(bytes.NewReader(data))
444 | if err != nil {
445 | t.Fatalf("xmltokenizer: %v", err)
446 | }
447 |
448 | gpx2, err := gpx.UnmarshalWithStdlibXML(bytes.NewReader(data))
449 | if err != nil {
450 | t.Fatalf("xml: %v", err)
451 | }
452 |
453 | if diff := cmp.Diff(gpx1, gpx2,
454 | cmp.Transformer("float64", func(x float64) uint64 {
455 | return math.Float64bits(x)
456 | }),
457 | ); diff != "" {
458 | t.Fatal(diff)
459 | }
460 | })
461 |
462 | return nil
463 | })
464 | }
465 |
466 | func TestTokenOnXLSXFiles(t *testing.T) {
467 | path := filepath.Join("testdata", "xlsx_sheet1.xml")
468 |
469 | data, err := os.ReadFile(path)
470 | if err != nil {
471 | t.Skip(err)
472 | }
473 |
474 | sheet1, err := xlsx.UnmarshalWithXMLTokenizer(bytes.NewReader(data))
475 | if err != nil {
476 | t.Fatalf("xmltokenizer: %v", err)
477 | }
478 | sheet2, err := xlsx.UnmarshalWithStdlibXML(bytes.NewReader(data))
479 | if err != nil {
480 | t.Fatalf("xml: %v", err)
481 | }
482 |
483 | if diff := cmp.Diff(sheet1, sheet2); diff != "" {
484 | t.Fatal(diff)
485 | }
486 | }
487 |
488 | func TestAutoGrowBufferCorrectness(t *testing.T) {
489 | path := filepath.Join("testdata", "xlsx_sheet1.xml")
490 | f, err := os.Open(path)
491 | if err != nil {
492 | panic(err)
493 | }
494 | defer f.Close()
495 |
496 | tok := xmltokenizer.New(f,
497 | xmltokenizer.WithReadBufferSize(1),
498 | )
499 |
500 | var token xmltokenizer.Token
501 | var sheetData1 schema.SheetData
502 | loop:
503 | for {
504 | token, err = tok.Token()
505 | if err == io.EOF {
506 | break
507 | }
508 | if err != nil {
509 | t.Fatal(err)
510 | }
511 |
512 | switch string(token.Name.Local) {
513 | case "sheetData":
514 | se := xmltokenizer.GetToken().Copy(token)
515 | err = sheetData1.UnmarshalToken(tok, se)
516 | xmltokenizer.PutToken(se)
517 | if err != nil {
518 | t.Fatal(err)
519 | }
520 | break loop
521 | }
522 | }
523 |
524 | f2, err := os.Open(path)
525 | if err != nil {
526 | panic(err)
527 | }
528 | defer f2.Close()
529 |
530 | sheetData2, err := xlsx.UnmarshalWithStdlibXML(f2)
531 | if err != nil {
532 | t.Fatal(err)
533 | }
534 |
535 | if diff := cmp.Diff(sheetData1, sheetData2); diff != "" {
536 | t.Fatal(err)
537 | }
538 | }
539 |
540 | func TestRawTokenWithInmemXML(t *testing.T) {
541 | tt := []struct {
542 | name string
543 | xml string
544 | expecteds []string
545 | err error
546 | }{
547 | {
548 | name: "simple xml happy flow",
549 | xml: `
550 |
552 |
554 | World <>'" 白鵬翔
555 | &何; &is-it;
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 | `, // Note: retrieved from stdlib xml test.
564 | expecteds: []string{
565 | "",
566 | "",
568 | "",
570 | "World <>'" 白鵬翔",
571 | "",
572 | "&何; &is-it;",
573 | "",
574 | "",
575 | "",
576 | "",
577 | "",
578 | "\n ",
579 | "",
580 | "",
581 | "",
582 | },
583 | },
584 | {
585 | name: "unexpected EOF truncated XML after `",
589 | "