├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ └── ci.yml ├── testdata ├── self_closing.xml ├── copyright_header.xml ├── corrupted │ └── cdata_truncated.xml ├── cdata.xml ├── cdata_clrf.xml ├── dtd.xml ├── xlsx_sheet1.xml └── long_comment_token.xml ├── go.mod ├── internal ├── README.md ├── gpx │ ├── schema │ │ ├── xml.go │ │ ├── gpx.go │ │ ├── extensions.go │ │ ├── metadata.go │ │ └── track.go │ └── unmarshal.go ├── xlsx │ ├── unmarshal.go │ └── schema │ │ └── sheet.go └── main.go ├── go.sum ├── codecov.yml ├── CONTRIBUTING.md ├── LICENCE ├── README.md ├── token.go ├── benchmark_test.go ├── token_test.go ├── tokenizer_internal_test.go ├── docs └── USAGE.md ├── tokenizer.go └── tokenizer_test.go /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [muktihari] 2 | -------------------------------------------------------------------------------- /testdata/self_closing.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/muktihari/xmltokenizer 2 | 3 | go 1.21 4 | 5 | require github.com/google/go-cmp v0.7.0 6 | -------------------------------------------------------------------------------- /testdata/copyright_header.xml: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /testdata/corrupted/cdata_truncated.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 2 | 3 | 4 | 5 | 6 | 7 | text]]> 8 | 9 | 10 | text 12 | ]]> 13 | 14 | -------------------------------------------------------------------------------- /testdata/cdata_clrf.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | text]]> 8 | 9 | 10 | text 12 | ]]> 13 | 14 | 15 | -------------------------------------------------------------------------------- /internal/gpx/schema/xml.go: -------------------------------------------------------------------------------- 1 | package schema 2 | 3 | import ( 4 | "encoding/xml" 5 | "fmt" 6 | ) 7 | 8 | func getCharData(dec *xml.Decoder) (xml.CharData, error) { 9 | token, err := dec.Token() 10 | if err != nil { 11 | return nil, err 12 | } 13 | v, ok := token.(xml.CharData) 14 | if !ok { 15 | return nil, fmt.Errorf("not a chardata") 16 | } 17 | return v, nil 18 | } 19 | -------------------------------------------------------------------------------- /testdata/dtd.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | ]> 7 | 8 | 9 | Tove 10 | Jani 11 | Reminder 12 | Don't forget me this weekend! 13 |
&writer; &copyright;
14 |
-------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | 8 | updates: 9 | - package-ecosystem: gomod 10 | directory: / 11 | schedule: 12 | interval: monthly 13 | 14 | - package-ecosystem: github-actions 15 | directory: / 16 | schedule: 17 | interval: monthly 18 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Hi, thank you for showing interest in this project. I would be grateful to receive feedback and any form of help. 4 | 5 | If you have trivial fix or improvement, go ahead and create a [pull request][prs] and mention me to review the changes. 6 | 7 | If you plan to do major changes, here are few guidelines to follow: 8 | 9 | 1. Check the [open issues][issues] and [pull requests][prs] for existing discussions. 10 | 1. Open an [issue][issues] first, to discuss new feature or enhancement. 11 | 1. Write the code with tests, and make sure it passes locally and on CI. 12 | 1. Open a pull request, and reference the relevant issue(s). 13 | 1. After receiving feedback, squash your commits and wrap them in an informative message. 14 | 15 | Have fun! 
16 | 17 | [issues]: https://github.com/muktihari/xmltokenizer/issues 18 | [prs]: https://github.com/muktihari/xmltokenizer/pulls 19 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a golang project 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | branches: ["master"] 9 | 10 | pull_request: 11 | branches: ["master"] 12 | 13 | permissions: {} 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 20 | 21 | - name: Set up Go 22 | uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0 23 | with: 24 | go-version: "stable" 25 | 26 | - name: Test 27 | run: go test -v -cover -coverprofile=coverage.coverprofile ./... 
28 | 29 | - name: Upload coverage reports to Codecov 30 | uses: codecov/codecov-action@0565863a31f2c772f9f0395002a31e3f06189574 # v5.4.0 31 | env: 32 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 33 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Hikmatulloh Hari Mukti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /internal/gpx/unmarshal.go: -------------------------------------------------------------------------------- 1 | package gpx 2 | 3 | import ( 4 | "encoding/xml" 5 | "io" 6 | 7 | "github.com/muktihari/xmltokenizer" 8 | "github.com/muktihari/xmltokenizer/internal/gpx/schema" 9 | ) 10 | 11 | func UnmarshalWithXMLTokenizer(f io.Reader) (schema.GPX, error) { 12 | tok := xmltokenizer.New(f) 13 | var gpx schema.GPX 14 | loop: 15 | for { 16 | token, err := tok.Token() 17 | if err == io.EOF { 18 | break 19 | } 20 | if err != nil { 21 | return gpx, err 22 | } 23 | 24 | switch string(token.Name.Local) { 25 | case "gpx": 26 | se := xmltokenizer.GetToken().Copy(token) 27 | err = gpx.UnmarshalToken(tok, se) 28 | xmltokenizer.PutToken(se) 29 | if err != nil { 30 | return gpx, err 31 | } 32 | break loop 33 | } 34 | } 35 | 36 | return gpx, nil 37 | } 38 | 39 | func UnmarshalWithStdlibXML(f io.Reader) (schema.GPX, error) { 40 | dec := xml.NewDecoder(f) 41 | var gpx schema.GPX 42 | loop: 43 | for { 44 | token, err := dec.Token() 45 | if err == io.EOF { 46 | break 47 | } 48 | if err != nil { 49 | return gpx, err 50 | } 51 | 52 | se, ok := token.(xml.StartElement) 53 | if !ok { 54 | continue 55 | } 56 | switch se.Name.Local { 57 | case "gpx": 58 | if err = gpx.UnmarshalXML(dec, se); err != nil { 59 | return gpx, err 60 | } 61 | break loop 62 | } 63 | } 64 | 65 | return gpx, nil 66 | } 67 | -------------------------------------------------------------------------------- /internal/xlsx/unmarshal.go: -------------------------------------------------------------------------------- 1 | package xlsx 2 | 3 | import ( 4 | "encoding/xml" 5 | "io" 6 | 7 | "github.com/muktihari/xmltokenizer" 8 | "github.com/muktihari/xmltokenizer/internal/xlsx/schema" 9 | ) 10 | 11 | func UnmarshalWithXMLTokenizer(r io.Reader) (schema.SheetData, error) { 12 | tok := xmltokenizer.New(r) 13 | var sheetData schema.SheetData 14 | 
loop: 15 | for { 16 | token, err := tok.Token() 17 | if err == io.EOF { 18 | break 19 | } 20 | if err != nil { 21 | return sheetData, err 22 | } 23 | 24 | switch string(token.Name.Local) { 25 | case "sheetData": 26 | se := xmltokenizer.GetToken().Copy(token) 27 | err = sheetData.UnmarshalToken(tok, se) 28 | xmltokenizer.PutToken(se) 29 | if err != nil { 30 | return sheetData, err 31 | } 32 | break loop 33 | } 34 | } 35 | 36 | return sheetData, nil 37 | } 38 | 39 | func UnmarshalWithStdlibXML(r io.Reader) (schema.SheetData, error) { 40 | dec := xml.NewDecoder(r) 41 | var sheetData schema.SheetData 42 | for { 43 | token, err := dec.Token() 44 | if err == io.EOF { 45 | break 46 | } 47 | if err != nil { 48 | return sheetData, err 49 | } 50 | 51 | switch elem := token.(type) { 52 | case xml.StartElement: 53 | if elem.Name.Local == "sheetData" { 54 | if err = dec.DecodeElement(&sheetData, &elem); err != nil { 55 | return sheetData, err 56 | } 57 | } 58 | } 59 | } 60 | return sheetData, nil 61 | } 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XML Tokenizer 2 | 3 | ![GitHub Workflow Status](https://github.com/muktihari/xmltokenizer/workflows/CI/badge.svg) 4 | [![Go Reference](https://pkg.go.dev/badge/github.com/muktihari/xmltokenizer.svg)](https://pkg.go.dev/github.com/muktihari/xmltokenizer) 5 | [![CodeCov](https://codecov.io/gh/muktihari/xmltokenizer/branch/master/graph/badge.svg)](https://codecov.io/gh/muktihari/xmltokenizer) 6 | [![Go Report Card](https://goreportcard.com/badge/github.com/muktihari/xmltokenizer)](https://goreportcard.com/report/github.com/muktihari/xmltokenizer) 7 | 8 | XML Tokenizer is a low-memory high performance non-namespace parser library for parsing simple XML 1.0. 
This is an alternative option to the standard library's xml when speed is your main concern and you are willing to sacrifice certain features, such as handling the namespace, in favor of speed ([discussion](https://www.reddit.com/r/golang/comments/1drdji3/xml_tokenizer_thats_4x_faster_than_stdlibs_xml/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button)). This may not cover all XML files, but it can cover typical XML files. 9 | 10 | # Motivation 11 | 12 | Go provides a standard library for [XML](https://pkg.go.dev/encoding/xml) parsing, however, I've found it to be slow for my use case. I work with a lot of GPX files in my personal project to retrieve my workouts data; GPX is an XML-based file format. When parsing my 14MB GPX file containing 208km ride using the standard library's xml, it takes roughly 600ms which is super slow and it needs 2.8mil alloc!. I need an alternative library for parsing XML that's faster than standard library's `xml`, suitable for typical XML parsing tasks and no code should be made unsafe. 13 | 14 | # Usage 15 | 16 | Please see [USAGE.md](./docs/USAGE.md). 17 | 18 | # Benchmark 19 | 20 | ```js 21 | goos: darwin; goarch: amd64; pkg: xmltokenizer 22 | cpu: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz 23 | Benchmark/stdlib.xml:"ride_sembalun.gpx"-4 2 605913816 ns/op 110562568 B/op 2806823 allocs/op 24 | Benchmark/xmltokenizer:"ride_sembalun.gpx"-4 8 141616068 ns/op 17143609 B/op 85 allocs/op 25 | ``` 26 | 27 | Approx. 4 times faster! 28 | -------------------------------------------------------------------------------- /internal/gpx/schema/gpx.go: -------------------------------------------------------------------------------- 1 | package schema 2 | 3 | import ( 4 | "encoding/xml" 5 | "fmt" 6 | 7 | "github.com/muktihari/xmltokenizer" 8 | ) 9 | 10 | // GPX is GPX schema (simplified). 
11 | type GPX struct { 12 | Creator string `xml:"creator,attr"` 13 | Version string `xml:"version,attr"` 14 | Metadata Metadata `xml:"metadata,omitempty"` 15 | Tracks []Track `xml:"trk,omitempty"` 16 | } 17 | 18 | func (g *GPX) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 19 | for i := range se.Attrs { 20 | attr := &se.Attrs[i] 21 | switch string(attr.Name.Local) { 22 | case "creator": 23 | g.Creator = string(attr.Value) 24 | case "version": 25 | g.Version = string(attr.Value) 26 | } 27 | } 28 | 29 | for { 30 | token, err := tok.Token() 31 | if err != nil { 32 | return fmt.Errorf("gpx: %w", err) 33 | } 34 | 35 | if token.IsEndElementOf(se) { 36 | return nil 37 | } 38 | if token.IsEndElement { 39 | continue 40 | } 41 | 42 | switch string(token.Name.Local) { 43 | case "metadata": 44 | se := xmltokenizer.GetToken().Copy(token) 45 | err = g.Metadata.UnmarshalToken(tok, se) 46 | xmltokenizer.PutToken(se) 47 | if err != nil { 48 | return fmt.Errorf("metadata: %w", err) 49 | } 50 | case "trk": 51 | var track Track 52 | se := xmltokenizer.GetToken().Copy(token) 53 | err = track.UnmarshalToken(tok, se) 54 | xmltokenizer.PutToken(se) 55 | if err != nil { 56 | return fmt.Errorf("track: %w", err) 57 | } 58 | g.Tracks = append(g.Tracks, track) 59 | } 60 | } 61 | } 62 | 63 | func (g *GPX) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error { 64 | for i := range se.Attr { 65 | attr := &se.Attr[i] 66 | switch attr.Name.Local { 67 | case "creator": 68 | g.Creator = attr.Value 69 | case "version": 70 | g.Version = attr.Value 71 | } 72 | } 73 | 74 | for { 75 | token, err := dec.Token() 76 | if err != nil { 77 | return fmt.Errorf("gpx: %w", err) 78 | } 79 | 80 | switch elem := token.(type) { 81 | case xml.StartElement: 82 | switch elem.Name.Local { 83 | case "metadata": 84 | if err := g.Metadata.UnmarshalXML(dec, elem); err != nil { 85 | return fmt.Errorf("metadata: %w", err) 86 | } 87 | case "trk": 88 | var track Track 89 | if err := 
track.UnmarshalXML(dec, elem); err != nil { 90 | return fmt.Errorf("track: %w", err) 91 | } 92 | g.Tracks = append(g.Tracks, track) 93 | } 94 | 95 | case xml.EndElement: 96 | if elem == se.End() { 97 | return nil 98 | } 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /token.go: -------------------------------------------------------------------------------- 1 | package xmltokenizer 2 | 3 | import "sync" 4 | 5 | var pool = sync.Pool{New: func() any { return new(Token) }} 6 | 7 | // GetToken gets token from the pool, don't forget to put it back. 8 | func GetToken() *Token { return pool.Get().(*Token) } 9 | 10 | // PutToken puts token back to the pool. 11 | func PutToken(t *Token) { pool.Put(t) } 12 | 13 | // Token represent a single token, one of these following: 14 | // - 15 | // - 16 | // - CharData 17 | // - 18 | // - 19 | // - 20 | // - 21 | // - 23 | // 24 | // ]> 25 | // 26 | // Token includes CharData or CDATA in Data field when it appears right after the start element. 27 | type Token struct { 28 | Name Name // Name is an XML name, empty when a tag starts with " 0. 30 | Data []byte // Data could be a CharData or a CDATA, or maybe a RawToken if a tag starts with "" e.g. . Also true when a tag starts with " or . 33 | } 34 | 35 | // IsEndElementOf checks whether the given token represent a 36 | // n end element (closing tag) of given StartElement. 37 | func (t *Token) IsEndElementOf(se *Token) bool { 38 | if t.IsEndElement && 39 | string(t.Name.Full) == string(se.Name.Full) { 40 | return true 41 | } 42 | return false 43 | } 44 | 45 | // Copy copies src Token into t, returning t. Attrs should be 46 | // consumed immediately since it's only being shallow copied. 47 | func (t *Token) Copy(src Token) *Token { 48 | t.Name.Prefix = append(t.Name.Prefix[:0], src.Name.Prefix...) 49 | t.Name.Local = append(t.Name.Local[:0], src.Name.Local...) 50 | t.Name.Full = append(t.Name.Full[:0], src.Name.Full...) 
51 | t.Attrs = append(t.Attrs[:0], src.Attrs...) // shallow copy 52 | t.Data = append(t.Data[:0], src.Data...) 53 | t.SelfClosing = src.SelfClosing 54 | t.IsEndElement = src.IsEndElement 55 | return t 56 | } 57 | 58 | // Attr represents an XML attribute. 59 | type Attr struct { 60 | Name Name 61 | Value []byte 62 | } 63 | 64 | // Name represents an XML name , 65 | // we don't manage the bookkeeping of namespaces. 66 | type Name struct { 67 | Prefix []byte 68 | Local []byte 69 | Full []byte // Full is combination of "prefix:local" 70 | } 71 | -------------------------------------------------------------------------------- /internal/xlsx/schema/sheet.go: -------------------------------------------------------------------------------- 1 | package schema 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | 7 | "github.com/muktihari/xmltokenizer" 8 | ) 9 | 10 | type SheetData struct { 11 | Rows []Row `xml:"row,omitempty"` 12 | } 13 | 14 | func (s *SheetData) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 15 | for { 16 | token, err := tok.Token() 17 | if err != nil { 18 | return fmt.Errorf("sheetData: %w", err) 19 | } 20 | 21 | if token.IsEndElementOf(se) { 22 | break 23 | } 24 | if token.IsEndElement { 25 | continue 26 | } 27 | 28 | switch string(token.Name.Local) { 29 | case "row": 30 | var row Row 31 | se := xmltokenizer.GetToken().Copy(token) 32 | err = row.UnmarshalToken(tok, se) 33 | xmltokenizer.PutToken(se) 34 | if err != nil { 35 | return fmt.Errorf("row: %w", err) 36 | } 37 | s.Rows = append(s.Rows, row) 38 | } 39 | } 40 | return nil 41 | } 42 | 43 | type Row struct { 44 | Index int `xml:"r,attr,omitempty"` 45 | Cells []Cell `xml:"c"` 46 | } 47 | 48 | func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 49 | var err error 50 | for i := range se.Attrs { 51 | attr := &se.Attrs[i] 52 | switch string(attr.Name.Local) { 53 | case "r": 54 | r.Index, err = strconv.Atoi(string(attr.Value)) 55 | if err != nil { 
56 | return err 57 | } 58 | } 59 | } 60 | 61 | for { 62 | token, err := tok.Token() 63 | if err != nil { 64 | return fmt.Errorf("row: %w", err) 65 | } 66 | 67 | if token.IsEndElementOf(se) { 68 | break 69 | } 70 | if token.IsEndElement { 71 | continue 72 | } 73 | 74 | switch string(token.Name.Local) { 75 | case "c": 76 | var cell Cell 77 | se := xmltokenizer.GetToken().Copy(token) 78 | err = cell.UnmarshalToken(tok, se) 79 | xmltokenizer.PutToken(se) 80 | if err != nil { 81 | return fmt.Errorf("c: %w", err) 82 | } 83 | r.Cells = append(r.Cells, cell) 84 | } 85 | } 86 | 87 | return nil 88 | } 89 | 90 | type Cell struct { 91 | Reference string `xml:"r,attr"` // E.g. A1 92 | Style int `xml:"s,attr"` 93 | Type string `xml:"t,attr,omitempty"` 94 | Value string `xml:"v,omitempty"` 95 | InlineString string `xml:"is>t"` 96 | } 97 | 98 | func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 99 | var err error 100 | for i := range se.Attrs { 101 | attr := &se.Attrs[i] 102 | switch string(attr.Name.Local) { 103 | case "r": 104 | c.Reference = string(attr.Value) 105 | case "s": 106 | c.Style, err = strconv.Atoi(string(attr.Value)) 107 | if err != nil { 108 | return fmt.Errorf("s: %w", err) 109 | } 110 | case "t": 111 | c.Type = string(attr.Value) 112 | } 113 | } 114 | 115 | // Must check since `c` may contains self-closing tag: 116 | // 117 | if se.SelfClosing { 118 | return nil 119 | } 120 | 121 | for { 122 | token, err := tok.Token() 123 | if err != nil { 124 | return fmt.Errorf("cell: %w", err) 125 | } 126 | 127 | if token.IsEndElementOf(se) { 128 | break 129 | } 130 | if token.IsEndElement { 131 | continue 132 | } 133 | 134 | switch string(token.Name.Local) { 135 | case "v": 136 | c.Value = string(token.Data) 137 | case "t": 138 | c.InlineString = string(token.Data) 139 | } 140 | } 141 | 142 | return nil 143 | } 144 | -------------------------------------------------------------------------------- /benchmark_test.go: 
-------------------------------------------------------------------------------- 1 | package xmltokenizer_test 2 | 3 | import ( 4 | "bytes" 5 | "encoding/xml" 6 | "fmt" 7 | "io" 8 | "io/fs" 9 | "os" 10 | "path/filepath" 11 | "strings" 12 | "testing" 13 | 14 | "github.com/muktihari/xmltokenizer" 15 | "github.com/muktihari/xmltokenizer/internal/gpx" 16 | "github.com/muktihari/xmltokenizer/internal/xlsx" 17 | ) 18 | 19 | func BenchmarkToken(b *testing.B) { 20 | filepath.Walk("testdata", func(path string, info fs.FileInfo, _ error) error { 21 | if info.IsDir() { 22 | return nil 23 | } 24 | name := strings.TrimPrefix(path, "testdata/") 25 | data, err := os.ReadFile(path) 26 | if err != nil { 27 | b.Logf("%v: %v", path, err) 28 | return nil 29 | } 30 | 31 | b.Run(fmt.Sprintf("stdlib.xml:%q", name), func(b *testing.B) { 32 | var err error 33 | for i := 0; i < b.N; i++ { 34 | if err = unmarshalWithStdlibXML(bytes.NewReader(data)); err != nil { 35 | b.Skipf("could not unmarshal: %v", err) 36 | } 37 | } 38 | }) 39 | b.Run(fmt.Sprintf("xmltokenizer:%q", name), func(b *testing.B) { 40 | var err error 41 | for i := 0; i < b.N; i++ { 42 | if err = unmarshalWithXMLTokenizer(bytes.NewReader(data)); err != nil { 43 | b.Skipf("could not unmarshal: %v", err) 44 | } 45 | } 46 | }) 47 | return nil 48 | }) 49 | } 50 | 51 | func unmarshalWithXMLTokenizer(r io.Reader) error { 52 | tok := xmltokenizer.New(r) 53 | for { 54 | token, err := tok.Token() 55 | if err == io.EOF { 56 | break 57 | } 58 | if err != nil { 59 | return err 60 | } 61 | _ = token 62 | } 63 | return nil 64 | } 65 | 66 | func unmarshalWithStdlibXML(r io.Reader) error { 67 | dec := xml.NewDecoder(r) 68 | for { 69 | token, err := dec.Token() 70 | if err == io.EOF { 71 | break 72 | } 73 | if err != nil { 74 | return err 75 | } 76 | _ = token 77 | } 78 | return nil 79 | } 80 | 81 | func BenchmarkUnmarshalGPX(b *testing.B) { 82 | filepath.Walk("testdata", func(path string, info fs.FileInfo, _ error) error { 83 | if info.IsDir() 
{ 84 | return nil 85 | } 86 | if strings.ToLower(filepath.Ext(path)) != ".gpx" { 87 | return nil 88 | } 89 | 90 | name := strings.TrimPrefix(path, "testdata/") 91 | 92 | data, err := os.ReadFile(path) 93 | if err != nil { 94 | panic(err) 95 | } 96 | 97 | b.Run(fmt.Sprintf("stdlib.xml:%q", name), func(b *testing.B) { 98 | for i := 0; i < b.N; i++ { 99 | _, _ = gpx.UnmarshalWithStdlibXML(bytes.NewReader(data)) 100 | } 101 | }) 102 | b.Run(fmt.Sprintf("xmltokenizer:%q", name), func(b *testing.B) { 103 | for i := 0; i < b.N; i++ { 104 | _, _ = gpx.UnmarshalWithXMLTokenizer(bytes.NewReader(data)) 105 | } 106 | }) 107 | 108 | return nil 109 | }) 110 | } 111 | 112 | func BenchmarkUnmarshalXLSX(b *testing.B) { 113 | path := filepath.Join("testdata", "xlsx_sheet1.xml") 114 | name := strings.TrimPrefix(path, "testdata/") 115 | 116 | data, err := os.ReadFile(path) 117 | if err != nil { 118 | panic(err) 119 | } 120 | 121 | b.Run(fmt.Sprintf("stdlib.xml:%q", name), func(b *testing.B) { 122 | for i := 0; i < b.N; i++ { 123 | _, _ = xlsx.UnmarshalWithStdlibXML(bytes.NewReader(data)) 124 | } 125 | }) 126 | b.Run(fmt.Sprintf("xmltokenizer:%q", name), func(b *testing.B) { 127 | for i := 0; i < b.N; i++ { 128 | _, _ = xlsx.UnmarshalWithXMLTokenizer(bytes.NewReader(data)) 129 | } 130 | }) 131 | } 132 | -------------------------------------------------------------------------------- /internal/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strconv" 8 | 9 | "github.com/muktihari/xmltokenizer" 10 | ) 11 | 12 | const sample = ` 13 | 14 | 15 | 0 16 | 17 | 18 | 4 19 | 20 | 21 | ` 22 | 23 | func main() { 24 | f := bytes.NewReader([]byte(sample)) 25 | 26 | tok := xmltokenizer.New(f) 27 | var row Row 28 | loop: 29 | for { 30 | token, err := tok.Token() // Token is only valid until next tok.Token() invocation (short-lived object). 
31 | if err == io.EOF { 32 | break 33 | } 34 | if err != nil { 35 | panic(err) 36 | } 37 | switch string(token.Name.Local) { // This do not allocate 🥳👍 38 | case "row": 39 | // Reuse Token object in the sync.Pool since we only use it temporarily. 40 | se := xmltokenizer.GetToken().Copy(token) // se: StartElement, we should copy it since token is a short-lived object. 41 | err = row.UnmarshalToken(tok, se) 42 | xmltokenizer.PutToken(se) // Put back to sync.Pool. 43 | if err != nil { 44 | panic(err) 45 | } 46 | break loop 47 | } 48 | } 49 | fmt.Printf("row: %+v\n", row) 50 | // Output: 51 | // row: {Index:1 Cells:[{Reference:A1 Value:0} {Reference:B1 Value:4} {Reference:C1 Value:}]} 52 | } 53 | 54 | type Row struct { 55 | Index int `xml:"r,attr,omitempty"` 56 | Cells []Cell `xml:"c"` 57 | } 58 | 59 | func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 60 | var err error 61 | for i := range se.Attrs { 62 | attr := &se.Attrs[i] 63 | switch string(attr.Name.Local) { 64 | case "r": 65 | r.Index, err = strconv.Atoi(string(attr.Value)) 66 | if err != nil { 67 | return err 68 | } 69 | } 70 | } 71 | 72 | for { 73 | token, err := tok.Token() 74 | if err != nil { 75 | return err 76 | } 77 | if token.IsEndElementOf(se) { // Reach desired EndElement 78 | return nil 79 | } 80 | if token.IsEndElement { // Ignore child's EndElements 81 | continue 82 | } 83 | switch string(token.Name.Local) { 84 | case "c": 85 | var cell Cell 86 | // Reuse Token object in the sync.Pool since we only use it temporarily. 87 | se := xmltokenizer.GetToken().Copy(token) 88 | err = cell.UnmarshalToken(tok, se) 89 | xmltokenizer.PutToken(se) // Put back to sync.Pool. 
90 | if err != nil { 91 | return err 92 | } 93 | r.Cells = append(r.Cells, cell) 94 | } 95 | } 96 | } 97 | 98 | type Cell struct { 99 | Reference string `xml:"r,attr"` 100 | Value string `xml:"v,omitempty"` 101 | } 102 | 103 | func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 104 | for i := range se.Attrs { 105 | attr := &se.Attrs[i] 106 | switch string(attr.Name.Local) { 107 | case "r": 108 | c.Reference = string(attr.Value) 109 | } 110 | } 111 | 112 | // Must check since `c` may contains self-closing tag: 113 | // 114 | if se.SelfClosing { 115 | return nil 116 | } 117 | 118 | for { 119 | token, err := tok.Token() 120 | if err != nil { 121 | return err 122 | } 123 | if token.IsEndElementOf(se) { // Reach desired EndElement 124 | return nil 125 | } 126 | if token.IsEndElement { // Ignore child's EndElements 127 | continue 128 | } 129 | switch string(token.Name.Local) { 130 | case "v": 131 | c.Value = string(token.Data) 132 | } 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /internal/gpx/schema/extensions.go: -------------------------------------------------------------------------------- 1 | package schema 2 | 3 | import ( 4 | "encoding/xml" 5 | "fmt" 6 | "math" 7 | "strconv" 8 | 9 | "github.com/muktihari/xmltokenizer" 10 | ) 11 | 12 | // TrackpointExtension is a GPX extension for health-related data. 
13 | type TrackpointExtension struct { 14 | Cadence uint8 15 | Distance float64 16 | HeartRate uint8 17 | Temperature int8 18 | Power uint16 19 | } 20 | 21 | func (t *TrackpointExtension) reset() { 22 | t.Cadence = math.MaxUint8 23 | t.Distance = math.NaN() 24 | t.HeartRate = math.MaxUint8 25 | t.Temperature = math.MaxInt8 26 | t.Power = math.MaxUint16 27 | } 28 | 29 | func (t *TrackpointExtension) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 30 | t.reset() 31 | 32 | for { 33 | token, err := tok.Token() 34 | if err != nil { 35 | return fmt.Errorf("trackpointExtension: %w", err) 36 | } 37 | 38 | if token.IsEndElementOf(se) { 39 | return nil 40 | } 41 | if token.IsEndElement { 42 | continue 43 | } 44 | 45 | switch string(token.Name.Local) { 46 | case "cad", "cadence": 47 | val, err := strconv.ParseUint(string(token.Data), 10, 8) 48 | if err != nil { 49 | return err 50 | } 51 | t.Cadence = uint8(val) 52 | case "distance": 53 | val, err := strconv.ParseFloat(string(token.Data), 64) 54 | if err != nil { 55 | return err 56 | } 57 | t.Distance = val 58 | case "hr", "heartrate": 59 | val, err := strconv.ParseUint(string(token.Data), 10, 8) 60 | if err != nil { 61 | return err 62 | } 63 | t.HeartRate = uint8(val) 64 | case "atemp", "temp", "temperature": 65 | val, err := strconv.ParseInt(string(token.Data), 10, 8) 66 | if err != nil { 67 | return err 68 | } 69 | t.Temperature = int8(val) 70 | case "power": 71 | val, err := strconv.ParseUint(string(token.Data), 10, 16) 72 | if err != nil { 73 | return err 74 | } 75 | t.Power = uint16(val) 76 | } 77 | } 78 | } 79 | 80 | func (t *TrackpointExtension) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error { 81 | t.reset() 82 | 83 | for { 84 | token, err := dec.Token() 85 | if err != nil { 86 | return fmt.Errorf("trackpointExtension: %w", err) 87 | } 88 | 89 | switch elem := token.(type) { 90 | case xml.StartElement: 91 | charData, err := getCharData(dec) 92 | if err != nil { 93 | return err 94 
| } 95 | switch elem.Name.Local { 96 | case "cad", "cadence": 97 | val, err := strconv.ParseUint(string(charData), 10, 8) 98 | if err != nil { 99 | return err 100 | } 101 | t.Cadence = uint8(val) 102 | case "distance": 103 | val, err := strconv.ParseFloat(string(charData), 64) 104 | if err != nil { 105 | return err 106 | } 107 | t.Distance = val 108 | case "hr", "heartrate": 109 | val, err := strconv.ParseUint(string(charData), 10, 8) 110 | if err != nil { 111 | return err 112 | } 113 | t.HeartRate = uint8(val) 114 | case "atemp", "temp", "temperature": 115 | val, err := strconv.ParseInt(string(charData), 10, 8) 116 | if err != nil { 117 | return err 118 | } 119 | t.Temperature = int8(val) 120 | case "power": 121 | val, err := strconv.ParseUint(string(charData), 10, 16) 122 | if err != nil { 123 | return err 124 | } 125 | t.Power = uint16(val) 126 | } 127 | case xml.EndElement: 128 | if elem == se.End() { 129 | return nil 130 | } 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /token_test.go: -------------------------------------------------------------------------------- 1 | package xmltokenizer_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/google/go-cmp/cmp" 7 | "github.com/muktihari/xmltokenizer" 8 | ) 9 | 10 | func TestGetToken(t *testing.T) { 11 | alloc := testing.AllocsPerRun(10, func() { 12 | token := xmltokenizer.GetToken() 13 | xmltokenizer.PutToken(token) 14 | }) 15 | if alloc != 0 { 16 | t.Fatalf("expected alloc: 0, got: %g", alloc) 17 | } 18 | } 19 | 20 | func TestIsEndElement(t *testing.T) { 21 | tt := []struct { 22 | name string 23 | token xmltokenizer.Token 24 | expected bool 25 | }{ 26 | { 27 | name: "an end element", 28 | token: xmltokenizer.Token{ 29 | Name: xmltokenizer.Name{ 30 | Full: []byte("worksheet"), 31 | }, 32 | IsEndElement: true, 33 | }, 34 | expected: true, 35 | }, 36 | { 37 | name: "a start element", 38 | token: xmltokenizer.Token{ 39 | Name: xmltokenizer.Name{ 
40 | Full: []byte("worksheet"), 41 | }, 42 | }, 43 | expected: false, 44 | }, 45 | { 46 | name: "a procinst", 47 | token: xmltokenizer.Token{ 48 | Name: xmltokenizer.Name{ 49 | Full: []byte("?xml"), 50 | }, 51 | }, 52 | expected: false, 53 | }, 54 | } 55 | 56 | for _, tc := range tt { 57 | t.Run(tc.name, func(t *testing.T) { 58 | if r := tc.token.IsEndElement; r != tc.expected { 59 | t.Fatalf("expected: %t, got: %t", tc.expected, r) 60 | } 61 | }) 62 | } 63 | } 64 | 65 | func TestIsEndElementOf(t *testing.T) { 66 | tt := []struct { 67 | name string 68 | t1, t2 xmltokenizer.Token 69 | expected bool 70 | }{ 71 | { 72 | name: "correct end element", 73 | t1: xmltokenizer.Token{ 74 | Name: xmltokenizer.Name{ 75 | Full: []byte("worksheet"), 76 | }, 77 | IsEndElement: true, 78 | }, 79 | t2: xmltokenizer.Token{ 80 | Name: xmltokenizer.Name{ 81 | Full: []byte("worksheet"), 82 | }, 83 | }, 84 | expected: true, 85 | }, 86 | { 87 | name: "incorrect end element", 88 | t1: xmltokenizer.Token{ 89 | Name: xmltokenizer.Name{ 90 | Full: []byte("/gpx"), 91 | }, 92 | }, 93 | t2: xmltokenizer.Token{ 94 | Name: xmltokenizer.Name{ 95 | Full: []byte("worksheet"), 96 | }, 97 | }, 98 | expected: false, 99 | }, 100 | { 101 | name: "not even an end element", 102 | t2: xmltokenizer.Token{ 103 | Name: xmltokenizer.Name{ 104 | Full: []byte("worksheet"), 105 | }, 106 | }, 107 | t1: xmltokenizer.Token{ 108 | Name: xmltokenizer.Name{ 109 | Full: []byte("worksheet"), 110 | }, 111 | }, 112 | expected: false, 113 | }, 114 | } 115 | 116 | for _, tc := range tt { 117 | t.Run(tc.name, func(t *testing.T) { 118 | if r := tc.t1.IsEndElementOf(&tc.t2); r != tc.expected { 119 | t.Fatalf("expected: %t, got: %t", tc.expected, r) 120 | } 121 | }) 122 | } 123 | } 124 | 125 | func TestCopy(t *testing.T) { 126 | t1 := xmltokenizer.Token{ 127 | Name: xmltokenizer.Name{ 128 | Prefix: []byte("gpxtpx"), 129 | Local: []byte("hr"), 130 | Full: []byte("gpxtpx:hr"), 131 | }, 132 | Attrs: []xmltokenizer.Attr{{ 133 | Name: 
xmltokenizer.Name{ 134 | Prefix: nil, 135 | Local: []byte("units"), 136 | Full: []byte("units"), 137 | }, 138 | Value: []byte("bpm"), 139 | }}, 140 | Data: []byte("70"), 141 | } 142 | 143 | var t2 xmltokenizer.Token 144 | t2.Copy(t1) 145 | 146 | if diff := cmp.Diff(t2, t1); diff != "" { 147 | t.Fatal(diff) 148 | } 149 | 150 | t2.Name.Full = append(t2.Name.Full[:0], "asd"...) 151 | t2.Data = append(t2.Data[:0], "60"...) 152 | if diff := cmp.Diff(t2, t1); diff == "" { 153 | t.Fatalf("expected different, got same") 154 | } 155 | 156 | // Test shallow copy, it should change the original 157 | t2.Attrs[0].Name.Full[0] = 'i' 158 | if diff := cmp.Diff(t2.Attrs, t1.Attrs); diff != "" { 159 | t.Fatal(diff) 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /testdata/xlsx_sheet1.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 13 | 14 | 15 | 16 | 17 | 18 | 20 | 21 | 22 | 34 | 48 | 62 | 64 | 66 | 68 | 70 | 72 | 74 | 77 | 78 | 80 | 82 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /tokenizer_internal_test.go: -------------------------------------------------------------------------------- 1 | package xmltokenizer 2 | 3 | import ( 4 | "errors" 5 | "io" 6 | "os" 7 | "path/filepath" 8 | "testing" 9 | 10 | "github.com/google/go-cmp/cmp" 11 | ) 12 | 13 | func TestOptions(t *testing.T) { 14 | tt := []struct { 15 | name string 16 | options []Option 17 | expectedOptions options 18 | }{ 19 | { 20 | name: "defaultOptions", 21 | expectedOptions: defaultOptions(), 22 | }, 23 | { 24 | name: "less than 0", 25 | options: []Option{ 26 | WithReadBufferSize(-1), 27 | WithAttrBufferSize(-1), 28 | WithAutoGrowBufferMaxLimitSize(-1), 29 | }, 30 | expectedOptions: options{ 31 | readBufferSize: defaultReadBufferSize, 32 | autoGrowBufferMaxLimitSize: autoGrowBufferMaxLimitSize, 33 | attrsBufferSize: 
defaultAttrsBufferSize, 34 | }, 35 | }, 36 | { 37 | name: "readBufferSize > maxLimitGrowBufferSize", 38 | options: []Option{ 39 | WithReadBufferSize(4 << 10), 40 | WithAutoGrowBufferMaxLimitSize(1 << 10), 41 | }, 42 | expectedOptions: options{ 43 | readBufferSize: 4 << 10, 44 | autoGrowBufferMaxLimitSize: 4 << 10, 45 | attrsBufferSize: defaultAttrsBufferSize, 46 | }, 47 | }, 48 | } 49 | 50 | for _, tc := range tt { 51 | t.Run(tc.name, func(t *testing.T) { 52 | tok := New(nil, tc.options...) 53 | if diff := cmp.Diff(tok.options, tc.expectedOptions, 54 | cmp.AllowUnexported(options{}), 55 | ); diff != "" { 56 | t.Fatal(diff) 57 | } 58 | }) 59 | } 60 | } 61 | 62 | func TestAutoGrowBuffer(t *testing.T) { 63 | tt := []struct { 64 | name string 65 | filename string 66 | opts []Option 67 | err error 68 | }{ 69 | { 70 | name: "grow buffer with alloc", 71 | filename: "long_comment_token.xml", 72 | opts: []Option{ 73 | WithReadBufferSize(5), 74 | }, 75 | err: nil, 76 | }, 77 | { 78 | name: "grow buffer exceed max limit", 79 | filename: "long_comment_token.xml", 80 | opts: []Option{ 81 | WithReadBufferSize(5), 82 | WithAutoGrowBufferMaxLimitSize(5), 83 | }, 84 | err: errAutoGrowBufferExceedMaxLimit, 85 | }, 86 | } 87 | 88 | for _, tc := range tt { 89 | t.Run(tc.name, func(t *testing.T) { 90 | f, err := os.Open(filepath.Join("testdata", tc.filename)) 91 | if err != nil { 92 | panic(err) 93 | } 94 | defer f.Close() 95 | 96 | tok := New(f, tc.opts...) 
97 | for { 98 | _, err = tok.Token() 99 | if err == io.EOF { 100 | err = nil 101 | break 102 | } 103 | if err != nil { 104 | break 105 | } 106 | } 107 | 108 | if !errors.Is(err, tc.err) { 109 | t.Fatalf("expected error: %v, got: %v", tc.err, err) 110 | } 111 | }) 112 | } 113 | } 114 | 115 | type fnReader func(b []byte) (n int, err error) 116 | 117 | func (f fnReader) Read(b []byte) (n int, err error) { return f(b) } 118 | 119 | func TestReset(t *testing.T) { 120 | r := fnReader(func(b []byte) (n int, err error) { return len(b), nil }) 121 | tok := New(r) 122 | tok.Token() // Trigger make buffer init, cause grow buffer by alloc up to max limit: 1MB 123 | 124 | tok.Reset(r, 125 | WithReadBufferSize(1024), 126 | WithAutoGrowBufferMaxLimitSize(4), 127 | ) 128 | 129 | if expected := 1024; len(tok.buf) != expected { 130 | t.Fatalf("expected len(t.buf): %d, got: %d", expected, len(tok.buf)) 131 | } 132 | if expected := 1000 << 10; cap(tok.buf) != expected { 133 | t.Fatalf("expected cap(t.buf): %d, got: %d", expected, cap(tok.buf)) 134 | } 135 | 136 | if tok.cur != 0 { 137 | t.Fatalf("expected cur: %d, got: cur: %d", 138 | 0, tok.cur) 139 | } 140 | 141 | newBufferSize := 2000 << 10 142 | tok.Reset(r, 143 | WithReadBufferSize(newBufferSize), 144 | WithAutoGrowBufferMaxLimitSize(4), 145 | ) 146 | 147 | tok.Token() // Trigger manageBuffer 148 | 149 | if expected := newBufferSize; len(tok.buf) != expected { 150 | t.Fatalf("expected len(t.buf): %d, got: %d", expected, len(tok.buf)) 151 | } 152 | if expected := newBufferSize + defaultReadBufferSize; cap(tok.buf) != expected { 153 | t.Fatalf("expected len(t.buf): %d, got: %d", expected, len(tok.buf)) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /docs/USAGE.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | The usage of this library is similar to the standard library's xml manual implementation of `xml.Unmarshaler` 
interface, with a slightly different code. 4 | 5 | Let's say we have this xml schema, a simplified version of `xlsx's sheet1.xml`. 6 | 7 | ```xml 8 | 9 | 10 | 11 | 0 12 | 13 | 14 | 4 15 | 16 | 17 | 18 | ``` 19 | 20 | We can write the Go implementation like following: 21 | 22 | ```go 23 | package main 24 | 25 | import ( 26 | "bytes" 27 | "fmt" 28 | "io" 29 | "strconv" 30 | 31 | "github.com/muktihari/xmltokenizer" 32 | ) 33 | 34 | const sample = ` 35 | 36 | 37 | 0 38 | 39 | 40 | 4 41 | 42 | 43 | ` 44 | 45 | func main() { 46 | f := bytes.NewReader([]byte(sample)) 47 | 48 | tok := xmltokenizer.New(f) 49 | var row Row 50 | loop: 51 | for { 52 | token, err := tok.Token() // Token is only valid until next tok.Token() invocation (short-lived object). 53 | if err == io.EOF { 54 | break 55 | } 56 | if err != nil { 57 | panic(err) 58 | } 59 | switch string(token.Name.Local) { // This do not allocate 🥳👍 60 | case "row": 61 | // Reuse Token object in the sync.Pool since we only use it temporarily. 62 | se := xmltokenizer.GetToken().Copy(token) // se: StartElement, we should copy it since token is a short-lived object. 63 | err = row.UnmarshalToken(tok, se) 64 | xmltokenizer.PutToken(se) // Put back to sync.Pool. 
65 | if err != nil { 66 | panic(err) 67 | } 68 | break loop 69 | } 70 | } 71 | fmt.Printf("row: %+v\n", row) 72 | // Output: 73 | // row: {Index:1 Cells:[{Reference:A1 Value:0} {Reference:B1 Value:4} {Reference:C1 Value:}]} 74 | } 75 | 76 | type Row struct { 77 | Index int `xml:"r,attr,omitempty"` 78 | Cells []Cell `xml:"c"` 79 | } 80 | 81 | func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 82 | var err error 83 | for i := range se.Attrs { 84 | attr := &se.Attrs[i] 85 | switch string(attr.Name.Local) { 86 | case "r": 87 | r.Index, err = strconv.Atoi(string(attr.Value)) 88 | if err != nil { 89 | return err 90 | } 91 | } 92 | } 93 | 94 | for { 95 | token, err := tok.Token() 96 | if err != nil { 97 | return err 98 | } 99 | if token.IsEndElementOf(se) { // Reach desired EndElement 100 | return nil 101 | } 102 | if token.IsEndElement { // Ignore child's EndElements 103 | continue 104 | } 105 | switch string(token.Name.Local) { 106 | case "c": 107 | var cell Cell 108 | // Reuse Token object in the sync.Pool since we only use it temporarily. 109 | se := xmltokenizer.GetToken().Copy(token) 110 | err = cell.UnmarshalToken(tok, se) 111 | xmltokenizer.PutToken(se) // Put back to sync.Pool. 
112 | if err != nil { 113 | return err 114 | } 115 | r.Cells = append(r.Cells, cell) 116 | } 117 | } 118 | } 119 | 120 | type Cell struct { 121 | Reference string `xml:"r,attr"` 122 | Value string `xml:"v,omitempty"` 123 | } 124 | 125 | func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 126 | for i := range se.Attrs { 127 | attr := &se.Attrs[i] 128 | switch string(attr.Name.Local) { 129 | case "r": 130 | c.Reference = string(attr.Value) 131 | } 132 | } 133 | 134 | // Must check since `c` may contains self-closing tag: 135 | // 136 | if se.SelfClosing { 137 | return nil 138 | } 139 | 140 | for { 141 | token, err := tok.Token() 142 | if err != nil { 143 | return err 144 | } 145 | if token.IsEndElementOf(se) { // Reach desired EndElement 146 | return nil 147 | } 148 | if token.IsEndElement { // Ignore child's EndElements 149 | continue 150 | } 151 | switch string(token.Name.Local) { 152 | case "v": 153 | c.Value = string(token.Data) 154 | } 155 | } 156 | } 157 | 158 | ``` 159 | 160 | You can find more examples in [internal](../internal/README.md) package. 161 | -------------------------------------------------------------------------------- /testdata/long_comment_token.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /internal/gpx/schema/metadata.go: -------------------------------------------------------------------------------- 1 | package schema 2 | 3 | import ( 4 | "encoding/xml" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/muktihari/xmltokenizer" 9 | ) 10 | 11 | // Metadata is GPX's Metadata schema (simplified). 
type Metadata struct {
	Name   string    `xml:"name,omitempty"`
	Desc   string    `xml:"desc,omitempty"`
	Author *Author   `xml:"author,omitempty"`
	Link   *Link     `xml:"link,omitempty"`
	Time   time.Time `xml:"time,omitempty"`
}

// UnmarshalToken decodes the metadata element's attributes and children
// from tok, starting just after its start element se, until the matching
// end element of se is reached. It allocates Author/Link only when the
// corresponding child elements are present.
func (m *Metadata) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	for {
		token, err := tok.Token()
		if err != nil {
			return fmt.Errorf("metadata: %w", err)
		}

		if token.IsEndElementOf(se) {
			return nil
		}
		if token.IsEndElement { // Ignore child elements' end tags.
			continue
		}

		switch string(token.Name.Local) {
		case "name":
			m.Name = string(token.Data)
		case "desc":
			m.Desc = string(token.Data)
		case "author":
			m.Author = new(Author)
			// Copy the token: it is only valid until the next tok.Token() call.
			se := xmltokenizer.GetToken().Copy(token)
			err = m.Author.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("author: %w", err)
			}
		case "link":
			m.Link = new(Link)
			// Copy the token: it is only valid until the next tok.Token() call.
			se := xmltokenizer.GetToken().Copy(token)
			err = m.Link.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("link: %w", err)
			}
		case "time":
			// Timestamps are expected in RFC 3339 format.
			m.Time, err = time.Parse(time.RFC3339, string(token.Data))
			if err != nil {
				return fmt.Errorf("time: %w", err)
			}
		}
	}
}

// UnmarshalXML decodes the metadata element using the standard library's
// xml.Decoder; it is the encoding/xml counterpart of UnmarshalToken.
func (m *Metadata) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	for {
		token, err := dec.Token()
		if err != nil {
			return fmt.Errorf("metadata: %w", err)
		}

		switch elem := token.(type) {
		case xml.StartElement:
			switch elem.Name.Local {
			case "author":
				m.Author = new(Author)
				if err := m.Author.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("author: %w", err)
				}
				continue
			case "link":
				m.Link = new(Link)
				if err := m.Link.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("link: %w", err)
				}
				continue
86 | } 87 | charData, err := getCharData(dec) 88 | if err != nil { 89 | return err 90 | } 91 | switch elem.Name.Local { 92 | case "name": 93 | m.Name = string(charData) 94 | case "desc": 95 | m.Desc = string(charData) 96 | case "time": 97 | m.Time, err = time.Parse(time.RFC3339, string(charData)) 98 | if err != nil { 99 | return fmt.Errorf("time: %w", err) 100 | } 101 | } 102 | case xml.EndElement: 103 | if elem == se.End() { 104 | return nil 105 | } 106 | } 107 | } 108 | } 109 | 110 | // Author is Author schema (simplified). 111 | type Author struct { 112 | Name string `xml:"name"` 113 | Link *Link `xml:"link"` 114 | } 115 | 116 | func (a *Author) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 117 | for { 118 | token, err := tok.Token() 119 | if err != nil { 120 | return fmt.Errorf("author: %w", err) 121 | } 122 | 123 | if token.IsEndElementOf(se) { 124 | return nil 125 | } 126 | if token.IsEndElement { 127 | continue 128 | } 129 | 130 | switch string(token.Name.Local) { 131 | case "name": 132 | a.Name = string(token.Data) 133 | case "link": 134 | a.Link = new(Link) 135 | se := xmltokenizer.GetToken().Copy(token) 136 | err := a.Link.UnmarshalToken(tok, se) 137 | xmltokenizer.PutToken(se) 138 | if err != nil { 139 | return fmt.Errorf("link: %w", err) 140 | } 141 | } 142 | } 143 | } 144 | 145 | func (a *Author) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error { 146 | for { 147 | token, err := dec.Token() 148 | if err != nil { 149 | return fmt.Errorf("author: %w", err) 150 | } 151 | 152 | switch elem := token.(type) { 153 | case xml.StartElement: 154 | switch elem.Name.Local { 155 | case "link": 156 | a.Link = new(Link) 157 | if err := a.Link.UnmarshalXML(dec, elem); err != nil { 158 | return fmt.Errorf("link: %w", err) 159 | } 160 | case "name": 161 | charData, err := getCharData(dec) 162 | if err != nil { 163 | return fmt.Errorf("name: %w", err) 164 | } 165 | a.Name = string(charData) 166 | } 167 | case xml.EndElement: 168 | if 
elem == se.End() { 169 | return nil 170 | } 171 | } 172 | } 173 | } 174 | 175 | // Link is Link schema. 176 | type Link struct { 177 | XMLName xml.Name `xml:"link"` 178 | Href string `xml:"href,attr"` 179 | 180 | Text string `xml:"text,omitempty"` 181 | Type string `xml:"type,omitempty"` 182 | } 183 | 184 | func (a *Link) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 185 | for i := range se.Attrs { 186 | attr := &se.Attrs[i] 187 | switch string(attr.Name.Local) { 188 | case "href": 189 | a.Href = string(attr.Value) 190 | } 191 | } 192 | 193 | for { 194 | token, err := tok.Token() 195 | if err != nil { 196 | return fmt.Errorf("link: %w", err) 197 | } 198 | 199 | if token.IsEndElementOf(se) { 200 | return nil 201 | } 202 | if token.IsEndElement { 203 | continue 204 | } 205 | 206 | switch string(token.Name.Local) { 207 | case "text": 208 | a.Text = string(token.Data) 209 | case "type": 210 | a.Type = string(token.Data) 211 | } 212 | } 213 | } 214 | 215 | func (a *Link) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error { 216 | for i := range se.Attr { 217 | attr := &se.Attr[i] 218 | switch attr.Name.Local { 219 | case "href": 220 | a.Href = attr.Value 221 | } 222 | } 223 | 224 | for { 225 | token, err := dec.Token() 226 | if err != nil { 227 | return fmt.Errorf("link: %w", err) 228 | } 229 | 230 | switch elem := token.(type) { 231 | case xml.StartElement: 232 | charData, err := getCharData(dec) 233 | if err != nil { 234 | return fmt.Errorf("%s: %w", elem.Name.Local, err) 235 | } 236 | switch elem.Name.Local { 237 | case "text": 238 | a.Text = string(charData) 239 | case "type": 240 | a.Type = string(charData) 241 | } 242 | case xml.EndElement: 243 | if elem == se.End() { 244 | return nil 245 | } 246 | } 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /internal/gpx/schema/track.go: -------------------------------------------------------------------------------- 1 | package 
schema 2 | 3 | import ( 4 | "encoding/xml" 5 | "fmt" 6 | "math" 7 | "strconv" 8 | "time" 9 | 10 | "github.com/muktihari/xmltokenizer" 11 | ) 12 | 13 | type Track struct { 14 | Name string `xml:"name,omitempty"` 15 | Type string `xml:"type,omitempty"` 16 | TrackSegments []TrackSegment `xml:"trkseg,omitempty"` 17 | } 18 | 19 | func (t *Track) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error { 20 | for { 21 | token, err := tok.Token() 22 | if err != nil { 23 | return fmt.Errorf("track: %w", err) 24 | } 25 | 26 | if token.IsEndElementOf(se) { 27 | return nil 28 | } 29 | if token.IsEndElement { 30 | continue 31 | } 32 | 33 | switch string(token.Name.Local) { 34 | case "name": 35 | t.Name = string(token.Data) 36 | case "type": 37 | t.Type = string(token.Data) 38 | case "trkseg": 39 | var trkseg TrackSegment 40 | se := xmltokenizer.GetToken().Copy(token) 41 | err = trkseg.UnmarshalToken(tok, se) 42 | xmltokenizer.PutToken(se) 43 | if err != nil { 44 | return fmt.Errorf("trkseg: %w", err) 45 | } 46 | t.TrackSegments = append(t.TrackSegments, trkseg) 47 | } 48 | } 49 | } 50 | 51 | func (t *Track) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error { 52 | for { 53 | token, err := dec.Token() 54 | if err != nil { 55 | return fmt.Errorf("track: %w", err) 56 | } 57 | 58 | switch elem := token.(type) { 59 | case xml.StartElement: 60 | switch elem.Name.Local { 61 | case "trkseg": 62 | var trkseg TrackSegment 63 | if err := trkseg.UnmarshalXML(dec, elem); err != nil { 64 | return fmt.Errorf("trkseg: %w", err) 65 | } 66 | t.TrackSegments = append(t.TrackSegments, trkseg) 67 | continue 68 | } 69 | charData, err := getCharData(dec) 70 | if err != nil { 71 | return fmt.Errorf("%s: %w", elem.Name.Local, err) 72 | } 73 | switch elem.Name.Local { 74 | case "name": 75 | t.Name = string(charData) 76 | case "type": 77 | t.Type = string(charData) 78 | } 79 | case xml.EndElement: 80 | if elem == se.End() { 81 | return nil 82 | } 83 | } 84 | } 85 | } 86 | 87 | 
// TrackSegment is GPX's trkseg schema (simplified): a list of track points.
type TrackSegment struct {
	Trackpoints []Waypoint `xml:"trkpt,omitempty"`
}

// UnmarshalToken decodes the trkseg element's children from tok, starting
// just after its start element se, until the matching end element of se.
func (t *TrackSegment) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	for {
		token, err := tok.Token()
		if err != nil {
			return err
		}

		if token.IsEndElementOf(se) {
			return nil
		}
		if token.IsEndElement { // Ignore child elements' end tags.
			continue
		}

		switch string(token.Name.Local) {
		case "trkpt":
			var trkpt Waypoint
			// Copy the token: it is only valid until the next tok.Token() call.
			se := xmltokenizer.GetToken().Copy(token)
			err = trkpt.UnmarshalToken(tok, se)
			xmltokenizer.PutToken(se)
			if err != nil {
				return fmt.Errorf("trkpt: %w", err)
			}
			t.Trackpoints = append(t.Trackpoints, trkpt)
		}
	}
}

// UnmarshalXML decodes the trkseg element using the standard library's
// xml.Decoder; it is the encoding/xml counterpart of UnmarshalToken.
func (t *TrackSegment) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error {
	for {
		token, err := dec.Token()
		if err != nil {
			return err
		}

		switch elem := token.(type) {
		case xml.StartElement:
			switch elem.Name.Local {
			case "trkpt":
				var trkpt Waypoint
				if err := trkpt.UnmarshalXML(dec, elem); err != nil {
					return fmt.Errorf("trkpt: %w", err)
				}
				t.Trackpoints = append(t.Trackpoints, trkpt)
			}
		case xml.EndElement:
			if elem == se.End() {
				return nil
			}
		}
	}
}

// Waypoint is a GPX track point (trkpt) schema (simplified).
type Waypoint struct {
	Lat                 float64             `xml:"lat,attr,omitempty"`
	Lon                 float64             `xml:"lon,attr,omitempty"`
	Ele                 float64             `xml:"ele,omitempty"`
	Time                time.Time           `xml:"time,omitempty"`
	TrackpointExtension TrackpointExtension `xml:"extensions>TrackPointExtension,omitempty"`
}

// reset clears w before decoding; NaN marks numeric fields that are
// absent from the document (so zero values stay distinguishable).
func (w *Waypoint) reset() {
	w.Lat = math.NaN()
	w.Lon = math.NaN()
	w.Ele = math.NaN()
	w.Time = time.Time{}
	w.TrackpointExtension.reset()
}

// UnmarshalToken decodes the trkpt element (lat/lon attributes and child
// elements) from tok until the matching end element of se.
func (w *Waypoint) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	w.reset()

var err error 164 | for i := range se.Attrs { 165 | attr := &se.Attrs[i] 166 | switch string(attr.Name.Local) { 167 | case "lat": 168 | w.Lat, err = strconv.ParseFloat(string(attr.Value), 64) 169 | if err != nil { 170 | return fmt.Errorf("lat: %w", err) 171 | } 172 | case "lon": 173 | w.Lon, err = strconv.ParseFloat(string(attr.Value), 64) 174 | if err != nil { 175 | return fmt.Errorf("lon: %w", err) 176 | } 177 | } 178 | } 179 | 180 | for { 181 | token, err := tok.Token() 182 | if err != nil { 183 | return fmt.Errorf("waypoint: %w", err) 184 | } 185 | 186 | if token.IsEndElementOf(se) { 187 | return nil 188 | } 189 | if token.IsEndElement { 190 | continue 191 | } 192 | 193 | switch string(token.Name.Local) { 194 | case "ele": 195 | w.Ele, err = strconv.ParseFloat(string(token.Data), 64) 196 | if err != nil { 197 | return fmt.Errorf("ele: %w", err) 198 | } 199 | case "time": 200 | w.Time, err = time.Parse(time.RFC3339, string(token.Data)) 201 | if err != nil { 202 | return fmt.Errorf("time: %w", err) 203 | } 204 | case "extensions": 205 | se := xmltokenizer.GetToken().Copy(token) 206 | err = w.TrackpointExtension.UnmarshalToken(tok, se) 207 | xmltokenizer.PutToken(se) 208 | if err != nil { 209 | return fmt.Errorf("extensions: %w", err) 210 | } 211 | } 212 | } 213 | } 214 | 215 | func (w *Waypoint) UnmarshalXML(dec *xml.Decoder, se xml.StartElement) error { 216 | w.reset() 217 | 218 | var err error 219 | for i := range se.Attr { 220 | attr := &se.Attr[i] 221 | switch attr.Name.Local { 222 | case "lat": 223 | w.Lat, err = strconv.ParseFloat(attr.Value, 64) 224 | if err != nil { 225 | return fmt.Errorf("lat: %w", err) 226 | } 227 | case "lon": 228 | w.Lon, err = strconv.ParseFloat(attr.Value, 64) 229 | if err != nil { 230 | return fmt.Errorf("lon: %w", err) 231 | } 232 | } 233 | } 234 | 235 | for { 236 | token, err := dec.Token() 237 | if err != nil { 238 | return fmt.Errorf("waypoint: %w", err) 239 | } 240 | 241 | switch elem := token.(type) { 242 | case 
xml.StartElement: 243 | switch elem.Name.Local { 244 | case "extensions": 245 | if err := w.TrackpointExtension.UnmarshalXML(dec, elem); err != nil { 246 | return fmt.Errorf("extensions: %w", err) 247 | } 248 | continue 249 | } 250 | charData, err := getCharData(dec) 251 | if err != nil { 252 | return fmt.Errorf("%s: %w", elem.Name.Local, err) 253 | } 254 | switch elem.Name.Local { 255 | case "ele": 256 | w.Ele, err = strconv.ParseFloat(string(charData), 64) 257 | if err != nil { 258 | return fmt.Errorf("ele: %w", err) 259 | } 260 | case "time": 261 | w.Time, err = time.Parse(time.RFC3339, string(charData)) 262 | if err != nil { 263 | return fmt.Errorf("time: %w", err) 264 | } 265 | } 266 | case xml.EndElement: 267 | if elem == se.End() { 268 | return nil 269 | } 270 | } 271 | } 272 | } 273 | -------------------------------------------------------------------------------- /tokenizer.go: -------------------------------------------------------------------------------- 1 | package xmltokenizer 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io" 7 | ) 8 | 9 | type errorString string 10 | 11 | func (e errorString) Error() string { return string(e) } 12 | 13 | const ( 14 | errAutoGrowBufferExceedMaxLimit = errorString("auto grow buffer exceed max limit") 15 | ) 16 | 17 | const ( 18 | defaultReadBufferSize = 4 << 10 19 | autoGrowBufferMaxLimitSize = 1000 << 10 20 | defaultAttrsBufferSize = 16 21 | ) 22 | 23 | // Tokenizer is a XML tokenizer. 
type Tokenizer struct {
	r       io.Reader // reader provided by the client
	n       int64     // the n read bytes counter
	options options   // tokenizer's options
	buf     []byte    // buffer that will grow as needed, large enough to hold a token (default max limit: 1MB)
	cur     int       // cursor byte position
	err     error     // last encountered error
	token   Token     // shared token
}

// options holds the Tokenizer settings populated by Option functions.
type options struct {
	readBufferSize             int
	autoGrowBufferMaxLimitSize int
	attrsBufferSize            int
}

// defaultOptions returns the settings used when the caller provides no
// Option: 4 KB read buffer, 1 MB grow limit, 16 preallocated attrs.
func defaultOptions() options {
	return options{
		readBufferSize:             defaultReadBufferSize,
		autoGrowBufferMaxLimitSize: autoGrowBufferMaxLimitSize,
		attrsBufferSize:            defaultAttrsBufferSize,
	}
}

// Option is a Tokenizer option, applied by New or Reset.
type Option func(o *options)

// WithReadBufferSize directs XML Tokenizer to this buffer size
// to read from the io.Reader. Default: 4096.
// Non-positive sizes fall back to the default.
func WithReadBufferSize(size int) Option {
	if size <= 0 {
		size = defaultReadBufferSize
	}
	return func(o *options) { o.readBufferSize = size }
}

// WithAutoGrowBufferMaxLimitSize directs XML Tokenizer to cap the
// auto-grow buffer so it does not grow beyond this limit. Default: 1 MB.
// If the read buffer size is larger than this limit, Reset raises the
// limit to match the read buffer size.
func WithAutoGrowBufferMaxLimitSize(size int) Option {
	if size <= 0 {
		size = autoGrowBufferMaxLimitSize
	}
	return func(o *options) { o.autoGrowBufferMaxLimitSize = size }
}

// WithAttrBufferSize directs XML Tokenizer to use this Attrs
// buffer capacity as its initial size. Default: 16.
func WithAttrBufferSize(size int) Option {
	if size <= 0 {
		size = defaultAttrsBufferSize
	}
	return func(o *options) { o.attrsBufferSize = size }
}

// New creates a new XML tokenizer reading from r.
func New(r io.Reader, opts ...Option) *Tokenizer {
	t := new(Tokenizer)
	t.Reset(r, opts...)
	return t
}

// Reset resets the Tokenizer, maintaining storage for
// future tokenization to reduce memory alloc. Options are
// re-applied from defaults on every call.
func (t *Tokenizer) Reset(r io.Reader, opts ...Option) {
	t.r, t.err = r, nil
	t.n, t.cur = 0, 0

	t.options = defaultOptions()
	for i := range opts {
		opts[i](&t.options)
	}

	if cap(t.token.Attrs) < t.options.attrsBufferSize {
		t.token.Attrs = make([]Attr, 0, t.options.attrsBufferSize)
	}
	// The grow limit can never be smaller than the read buffer itself.
	if t.options.readBufferSize > t.options.autoGrowBufferMaxLimitSize {
		t.options.autoGrowBufferMaxLimitSize = t.options.readBufferSize
	}

	switch size := t.options.readBufferSize; {
	case cap(t.buf) >= size+defaultReadBufferSize: // Reuse existing storage.
		t.buf = t.buf[:size:cap(t.buf)]
	default:
		// Create buffer with additional cap since we need to memmove remaining bytes
		t.buf = make([]byte, size, size+defaultReadBufferSize)
	}
}

// Token returns either a valid token or an error.
// The returned token is only valid before next
// Token or RawToken method invocation.
func (t *Tokenizer) Token() (token Token, err error) {
	if t.err != nil {
		return token, t.err
	}

	b, err := t.RawToken()
	if err != nil {
		if !errors.Is(err, io.EOF) {
			err = fmt.Errorf("byte pos %d: %w", t.n, err)
		}
		if len(b) == 0 || errors.Is(err, io.ErrUnexpectedEOF) {
			return
		}
		// RawToken still produced bytes: parse and return this last
		// token now; the stored error surfaces on the next call.
		t.err = err
	}

	t.clearToken()

	b = t.consumeNonTagIdentifier(b)
	if len(b) > 0 {
		b = t.consumeTagName(b)
		b = t.consumeAttrs(b)
		t.consumeCharData(b)
	}

	token = t.token
	// Normalize empty slices to nil so callers can compare against nil.
	if len(token.Attrs) == 0 {
		token.Attrs = nil
	}
	if len(token.Data) == 0 {
		token.Data = nil
	}

	return token, nil
}

// RawToken returns token in its raw bytes. At the end,
// it may return the last token bytes and an error.
153 | // The returned token bytes is only valid before next 154 | // Token or RawToken method invocation. 155 | func (t *Tokenizer) RawToken() (b []byte, err error) { 156 | if t.err != nil { 157 | return nil, t.err 158 | } 159 | 160 | var pivot, pos = t.cur, t.cur 161 | var openclose int // zero means open '<' and close '>' is matched. 162 | for { 163 | if pos >= len(t.buf) { 164 | pivot, pos = t.memmoveRemainingBytes(pivot) 165 | if err = t.manageBuffer(); err != nil { 166 | if openclose != 0 && errors.Is(err, io.EOF) { 167 | err = io.ErrUnexpectedEOF 168 | } 169 | t.err = err 170 | return t.buf[pivot:pos], err 171 | } 172 | } 173 | switch t.buf[pos] { 174 | case '<': 175 | if openclose == 0 { 176 | pivot = pos 177 | } 178 | openclose++ 179 | case '>': 180 | if openclose--; openclose != 0 { 181 | break 182 | } 183 | 184 | switch t.buf[pivot+1] { 185 | case '?', '!': // Maybe a ProcInst ", this method will include it in the previous token. 204 | // It returns the new pivot and new position. 
205 | func (t *Tokenizer) parseCharData(pivot, pos int) (newPivot, newPos int) { 206 | for i := pos + 1; ; i++ { 207 | if i >= len(t.buf) { 208 | pivot, i = t.memmoveRemainingBytes(pivot) 209 | pos = i - 1 210 | if t.err = t.manageBuffer(); t.err != nil { 211 | break 212 | } 213 | } 214 | if t.buf[i] != '<' { 215 | continue 216 | } 217 | 218 | pos = i - 1 219 | // Might be in the form of 220 | const prefix, suffix = "" 221 | var k int = 1 222 | for j := i + 1; ; j++ { 223 | if j >= len(t.buf) { 224 | prevLast := len(t.buf) 225 | pivot, j = t.memmoveRemainingBytes(pivot) 226 | pos = pos - (prevLast - len(t.buf)) 227 | if t.err = t.manageBuffer(); t.err != nil { 228 | if errors.Is(t.err, io.EOF) { 229 | t.err = io.ErrUnexpectedEOF 230 | } 231 | break 232 | } 233 | } 234 | if k < len(prefix) { 235 | if t.buf[j] != prefix[k] { 236 | break 237 | } 238 | k++ 239 | continue 240 | } 241 | if t.buf[j] == '>' && string(t.buf[j-2:j+1]) == suffix { 242 | pos = j 243 | break 244 | } 245 | } 246 | break 247 | } 248 | return pivot, pos 249 | } 250 | 251 | func (t *Tokenizer) memmoveRemainingBytes(pivot int) (cur, last int) { 252 | if pivot == 0 { 253 | return t.cur, len(t.buf) 254 | } 255 | n := copy(t.buf, t.buf[pivot:]) 256 | t.buf = t.buf[:n:cap(t.buf)] 257 | t.cur = 0 258 | return t.cur, len(t.buf) 259 | } 260 | 261 | func (t *Tokenizer) manageBuffer() error { 262 | growSize := len(t.buf) + t.options.readBufferSize 263 | start, end := len(t.buf), growSize 264 | switch { 265 | case growSize <= cap(t.buf): // Grow by reslice 266 | t.buf = t.buf[:growSize:cap(t.buf)] 267 | default: // Grow by make new alloc 268 | if growSize > t.options.autoGrowBufferMaxLimitSize { 269 | return fmt.Errorf("could not grow buffer to %d, max limit is set to %d: %w", 270 | growSize, t.options.autoGrowBufferMaxLimitSize, errAutoGrowBufferExceedMaxLimit) 271 | } 272 | buf := make([]byte, growSize) 273 | n := copy(buf, t.buf) 274 | t.buf = buf 275 | start, end = n, cap(t.buf) 276 | } 277 | 278 | n, err 
:= io.ReadAtLeast(t.r, t.buf[start:end], 1) 279 | t.buf = t.buf[: start+n : cap(t.buf)] 280 | t.n += int64(n) 281 | 282 | return err 283 | } 284 | 285 | func (t *Tokenizer) clearToken() { 286 | t.token.Name.Prefix = nil 287 | t.token.Name.Local = nil 288 | t.token.Name.Full = nil 289 | t.token.Attrs = t.token.Attrs[:0] 290 | t.token.Data = nil 291 | t.token.SelfClosing = false 292 | t.token.IsEndElement = false 293 | } 294 | 295 | // consumeNonTagIdentifier consumes identifier starts with "', ' ', '\t', '\r', '\n': // e.g. , 320 | if b[i] == '>' && b[i-1] == '/' { // In case we encounter 321 | i-- 322 | } 323 | t.token.Name.Local = trim(b[pos:i]) 324 | t.token.Name.Full = trim(b[fullpos:i]) 325 | return b[i:] 326 | } 327 | } 328 | return b 329 | } 330 | 331 | func (t *Tokenizer) consumeAttrs(b []byte) []byte { 332 | var prefix, local, full []byte 333 | var pos, fullpos int 334 | for i := 0; i < len(b); i++ { 335 | switch b[i] { 336 | case ':': 337 | prefix = trim(b[pos:i]) 338 | pos = i + 1 339 | case '=': 340 | local = trim(b[pos:i]) 341 | full = trim(b[fullpos:i]) 342 | pos = i + 1 343 | case '"': 344 | for { 345 | i++ 346 | if i+1 == len(b) { 347 | return nil 348 | } 349 | if b[i] == '"' { 350 | break 351 | } 352 | } 353 | if len(full) == 0 { // Ignore malformed attr 354 | continue 355 | } 356 | t.token.Attrs = append(t.token.Attrs, Attr{ 357 | Name: Name{Prefix: prefix, Local: local, Full: full}, 358 | Value: trim(b[pos+1 : i]), 359 | }) 360 | prefix, local, full = nil, nil, nil 361 | pos = i + 1 362 | fullpos = i + 1 363 | case '/': 364 | t.token.SelfClosing = true 365 | case '>': 366 | return b[i+1:] 367 | } 368 | } 369 | return b 370 | } 371 | 372 | func (t *Tokenizer) consumeCharData(b []byte) { 373 | const prefix, suffix = "" 374 | b = trimPrefix(b) 375 | if len(b) >= len(prefix) && string(b[:len(prefix)]) == prefix { 376 | b = b[len(prefix):] 377 | } 378 | if end := len(b) - len(suffix); end >= 0 && string(b[end:]) == suffix { 379 | b = b[:end] 380 | } 
381 | t.token.Data = trim(b) 382 | } 383 | 384 | func trim(b []byte) []byte { 385 | b = trimPrefix(b) 386 | b = trimSuffix(b) 387 | return b 388 | } 389 | 390 | func trimPrefix(b []byte) []byte { 391 | var start int 392 | for i := 0; i < len(b); i++ { 393 | switch b[i] { 394 | case '\r': 395 | if i+1 < len(b) && b[i+1] == '\n' { 396 | start += 2 397 | i++ 398 | } 399 | case '\n', ' ', '\t': 400 | start++ 401 | default: 402 | return b[start:] 403 | } 404 | } 405 | return b[start:] 406 | } 407 | 408 | func trimSuffix(b []byte) []byte { 409 | var end int = len(b) 410 | for i := len(b) - 1; i >= 0; i-- { 411 | switch b[i] { 412 | case '\n': 413 | end-- 414 | if i-1 > 0 && b[i-1] == '\r' { 415 | end-- 416 | } 417 | case ' ', '\t': 418 | end-- 419 | default: 420 | return b[:end] 421 | } 422 | } 423 | return b[:end] 424 | } 425 | -------------------------------------------------------------------------------- /tokenizer_test.go: -------------------------------------------------------------------------------- 1 | package xmltokenizer_test 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "io/fs" 9 | "math" 10 | "os" 11 | "path/filepath" 12 | "strings" 13 | "testing" 14 | 15 | "github.com/google/go-cmp/cmp" 16 | "github.com/muktihari/xmltokenizer" 17 | "github.com/muktihari/xmltokenizer/internal/gpx" 18 | "github.com/muktihari/xmltokenizer/internal/xlsx" 19 | "github.com/muktihari/xmltokenizer/internal/xlsx/schema" 20 | ) 21 | 22 | var tokenHeader = xmltokenizer.Token{Data: []byte(``), SelfClosing: true} 23 | 24 | func TestTokenWithInmemXML(t *testing.T) { 25 | tt := []struct { 26 | name string 27 | xml string 28 | expecteds []xmltokenizer.Token 29 | err error 30 | }{ 31 | { 32 | name: "dtd without entity", 33 | xml: ` 34 | 35 | 37 | 39 | World <>'" 白鵬翔 40 | &何; &is-it; 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | `, // Note: retrieved from stdlib xml test. 
49 | expecteds: []xmltokenizer.Token{ 50 | { 51 | Data: []byte(``), 52 | SelfClosing: true, 53 | }, 54 | { 55 | Data: []byte(""), 57 | SelfClosing: true, 58 | }, 59 | { 60 | Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, 61 | Attrs: []xmltokenizer.Attr{ 62 | {Name: xmltokenizer.Name{Prefix: []byte("xmlns"), Local: []byte("foo"), Full: []byte("xmlns:foo")}, Value: []byte("ns1")}, 63 | {Name: xmltokenizer.Name{Local: []byte("xmlns"), Full: []byte("xmlns")}, Value: []byte("ns2")}, 64 | {Name: xmltokenizer.Name{Prefix: []byte("xmlns"), Local: []byte("tag"), Full: []byte("xmlns:tag")}, Value: []byte("ns3")}, 65 | }, 66 | }, 67 | { 68 | Name: xmltokenizer.Name{Local: []byte("hello"), Full: []byte("hello")}, 69 | Attrs: []xmltokenizer.Attr{ 70 | {Name: xmltokenizer.Name{Local: []byte("lang"), Full: []byte("lang")}, Value: []byte("en")}, 71 | }, 72 | Data: []byte("World <>'" 白鵬翔"), 73 | }, 74 | { 75 | Name: xmltokenizer.Name{Local: []byte("hello"), Full: []byte("hello")}, 76 | IsEndElement: true, 77 | }, 78 | { 79 | Name: xmltokenizer.Name{Local: []byte("query"), Full: []byte("query")}, 80 | Data: []byte("&何; &is-it;"), 81 | }, 82 | { 83 | Name: xmltokenizer.Name{Local: []byte("query"), Full: []byte("query")}, 84 | IsEndElement: true, 85 | }, 86 | { 87 | Name: xmltokenizer.Name{Local: []byte("goodbye"), Full: []byte("goodbye")}, 88 | SelfClosing: true, 89 | }, 90 | { 91 | Name: xmltokenizer.Name{Local: []byte("outer"), Full: []byte("outer")}, 92 | Attrs: []xmltokenizer.Attr{ 93 | {Name: xmltokenizer.Name{Prefix: []byte("foo"), Local: []byte("attr"), Full: []byte("foo:attr")}, Value: []byte("value")}, 94 | {Name: xmltokenizer.Name{Prefix: []byte("xmlns"), Local: []byte("tag"), Full: []byte("xmlns:tag")}, Value: []byte("ns4")}, 95 | }, 96 | }, 97 | { 98 | Name: xmltokenizer.Name{Local: []byte("inner"), Full: []byte("inner")}, 99 | SelfClosing: true, 100 | }, 101 | { 102 | Name: xmltokenizer.Name{Local: []byte("outer"), Full: []byte("outer")}, 103 | 
IsEndElement: true, 104 | }, 105 | { 106 | Name: xmltokenizer.Name{Prefix: []byte("tag"), Local: []byte("name"), Full: []byte("tag:name")}, 107 | Data: []byte("Some text here."), 108 | }, 109 | { 110 | Name: xmltokenizer.Name{Prefix: []byte("tag"), Local: []byte("name"), Full: []byte("tag:name")}, 111 | IsEndElement: true, 112 | }, 113 | { 114 | Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, 115 | IsEndElement: true, 116 | }, 117 | { 118 | Data: []byte(""), 119 | SelfClosing: true, 120 | }, 121 | }, 122 | }, 123 | { 124 | name: "unexpected EOF truncated XML after ``), 129 | SelfClosing: true, 130 | }, 131 | }, 132 | err: io.ErrUnexpectedEOF, 133 | }, 134 | { 135 | name: "unexpected quote before attr name", 136 | xml: "
", 137 | expecteds: []xmltokenizer.Token{ 138 | { 139 | Data: []byte(``), 140 | SelfClosing: true, 141 | }, 142 | {Name: xmltokenizer.Name{Local: []byte("a"), Full: []byte("a")}}, 143 | {Name: xmltokenizer.Name{Local: []byte("a"), Full: []byte("a")}, IsEndElement: true}, 144 | }, 145 | }, 146 | { 147 | name: "unexpected equals in attr name", 148 | xml: "", 149 | expecteds: []xmltokenizer.Token{ 150 | { 151 | Data: []byte(``), 152 | SelfClosing: true, 153 | IsEndElement: false, 154 | }, 155 | {Name: xmltokenizer.Name{Local: []byte("Image"), Full: []byte("Image")}, 156 | Attrs: []xmltokenizer.Attr{ 157 | { 158 | Name: xmltokenizer.Name{Local: []uint8("URL"), Full: []uint8("URL")}, 159 | Value: []uint8("https://test.com/my-url-ending-in-="), 160 | }, 161 | { 162 | Name: xmltokenizer.Name{Local: []uint8("URL2"), Full: []uint8("URL2")}, 163 | Value: []uint8("https://ok.com"), 164 | }, 165 | }, 166 | SelfClosing: true, 167 | }, 168 | }, 169 | }, 170 | { 171 | name: "tab after node name", 172 | xml: ``, 173 | expecteds: []xmltokenizer.Token{ 174 | { 175 | Name: xmltokenizer.Name{ 176 | Local: []uint8("sample"), 177 | Full: []uint8("sample"), 178 | }, 179 | Attrs: []xmltokenizer.Attr{ 180 | { 181 | Name: xmltokenizer.Name{ 182 | Local: []uint8("foo"), 183 | Full: []uint8("foo")}, 184 | Value: []uint8("bar"), 185 | }, 186 | }, 187 | SelfClosing: true, 188 | }, 189 | }, 190 | }, 191 | { 192 | name: "tab after attribute value", 193 | xml: ``, 194 | expecteds: []xmltokenizer.Token{ 195 | { 196 | Name: xmltokenizer.Name{ 197 | Local: []uint8("sample"), 198 | Full: []uint8("sample"), 199 | }, 200 | Attrs: []xmltokenizer.Attr{ 201 | { 202 | Name: xmltokenizer.Name{ 203 | Local: []uint8("foo"), 204 | Full: []uint8("foo")}, 205 | Value: []uint8("bar"), 206 | }, 207 | }, 208 | SelfClosing: true, 209 | }, 210 | }, 211 | }, 212 | { 213 | name: "tab between attributes", 214 | xml: ``, 215 | expecteds: []xmltokenizer.Token{ 216 | { 217 | Name: xmltokenizer.Name{ 218 | Local: 
[]uint8("sample"), 219 | Full: []uint8("sample"), 220 | }, 221 | Attrs: []xmltokenizer.Attr{ 222 | { 223 | Name: xmltokenizer.Name{ 224 | Local: []uint8("foo"), 225 | Full: []uint8("foo")}, 226 | Value: []uint8("bar"), 227 | }, 228 | { 229 | Name: xmltokenizer.Name{ 230 | Local: []uint8("baz"), 231 | Full: []uint8("baz")}, 232 | Value: []uint8("quux"), 233 | }, 234 | }, 235 | SelfClosing: true, 236 | }, 237 | }, 238 | }, 239 | { 240 | name: "slash inside attribute value", 241 | xml: ``, 242 | expecteds: []xmltokenizer.Token{ 243 | { 244 | Name: xmltokenizer.Name{Local: []byte("sample"), Full: []byte("sample")}, 245 | Attrs: []xmltokenizer.Attr{ 246 | { 247 | Name: xmltokenizer.Name{Local: []uint8("path"), Full: []uint8("path")}, 248 | Value: []uint8("foo/bar/baz"), 249 | }, 250 | }, 251 | }, 252 | }, 253 | }, 254 | } 255 | 256 | for i, tc := range tt { 257 | t.Run(fmt.Sprintf("[%d]: %s", i, tc.name), func(t *testing.T) { 258 | tok := xmltokenizer.New( 259 | bytes.NewReader([]byte(tc.xml)), 260 | xmltokenizer.WithReadBufferSize(1), // Read per char so we can cover more code paths 261 | ) 262 | 263 | for i := 0; ; i++ { 264 | token, err := tok.Token() 265 | if err == io.EOF { 266 | if i != len(tc.expecteds) { 267 | t.Fatalf("expected %d tokens, got %d", len(tc.expecteds), i) 268 | } 269 | break 270 | } 271 | if err != nil { 272 | if !errors.Is(err, tc.err) { 273 | t.Fatalf("expected error: %v, got: %v", tc.err, err) 274 | } 275 | return 276 | } 277 | if diff := cmp.Diff(token, tc.expecteds[i]); diff != "" { 278 | t.Fatalf("%d: %s", i, diff) 279 | } 280 | } 281 | }) 282 | } 283 | } 284 | 285 | func TestTokenWithSmallXMLFiles(t *testing.T) { 286 | tt := []struct { 287 | filename string 288 | expecteds []xmltokenizer.Token 289 | err error 290 | }{ 291 | {filename: "cdata.xml", expecteds: []xmltokenizer.Token{ 292 | tokenHeader, 293 | {Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}}, 294 | { 295 | Name: xmltokenizer.Name{Local: []byte("data"), 
Full: []byte("data")}, 296 | Data: []byte("text"), 297 | }, 298 | { 299 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 300 | IsEndElement: true, 301 | }, 302 | { 303 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 304 | Data: []byte("text"), 305 | }, 306 | { 307 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 308 | IsEndElement: true, 309 | }, 310 | { 311 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 312 | Data: []byte("text"), 313 | }, 314 | { 315 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 316 | IsEndElement: true, 317 | }, 318 | { 319 | Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}, 320 | IsEndElement: true, 321 | }, 322 | }}, 323 | {filename: "cdata_clrf.xml", expecteds: []xmltokenizer.Token{ 324 | tokenHeader, 325 | {Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}}, 326 | { 327 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 328 | Data: []byte("text"), 329 | }, 330 | { 331 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 332 | IsEndElement: true, 333 | }, 334 | { 335 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 336 | Data: []byte("text"), 337 | }, 338 | { 339 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 340 | IsEndElement: true, 341 | }, 342 | { 343 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 344 | Data: []byte("text"), 345 | }, 346 | { 347 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 348 | IsEndElement: true, 349 | }, 350 | { 351 | Name: xmltokenizer.Name{Local: []byte("content"), Full: []byte("content")}, 352 | IsEndElement: true, 353 | }, 354 | }}, 355 | {filename: filepath.Join("corrupted", "cdata_truncated.xml"), expecteds: []xmltokenizer.Token{ 356 | tokenHeader, 357 | {Name: xmltokenizer.Name{Local: []byte("content"), 
Full: []byte("content")}}, 358 | { 359 | Name: xmltokenizer.Name{Local: []byte("data"), Full: []byte("data")}, 360 | }, 361 | }, 362 | err: io.ErrUnexpectedEOF, 363 | }, 364 | {filename: "self_closing.xml", expecteds: []xmltokenizer.Token{ 365 | tokenHeader, 366 | {Name: xmltokenizer.Name{Local: []byte("a"), Full: []byte("a")}, SelfClosing: true}, 367 | {Name: xmltokenizer.Name{Local: []byte("b"), Full: []byte("b")}, SelfClosing: true}, 368 | }}, 369 | {filename: "copyright_header.xml", expecteds: []xmltokenizer.Token{ 370 | {Data: []byte(""), SelfClosing: true}, 371 | tokenHeader, 372 | }}, 373 | {filename: "dtd.xml", expecteds: []xmltokenizer.Token{ 374 | tokenHeader, 375 | { 376 | Data: []byte("\n" + 378 | " \n" + 379 | " \n" + 380 | "]>"), 381 | SelfClosing: true, 382 | }, 383 | {Name: xmltokenizer.Name{Local: []byte("note"), Full: []byte("note")}}, 384 | {Name: xmltokenizer.Name{Local: []byte("to"), Full: []byte("to")}, Data: []byte("Tove")}, 385 | {Name: xmltokenizer.Name{Local: []byte("to"), Full: []byte("to")}, IsEndElement: true}, 386 | {Name: xmltokenizer.Name{Local: []byte("from"), Full: []byte("from")}, Data: []byte("Jani")}, 387 | {Name: xmltokenizer.Name{Local: []byte("from"), Full: []byte("from")}, IsEndElement: true}, 388 | {Name: xmltokenizer.Name{Local: []byte("heading"), Full: []byte("heading")}, Data: []byte("Reminder")}, 389 | {Name: xmltokenizer.Name{Local: []byte("heading"), Full: []byte("heading")}, IsEndElement: true}, 390 | {Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, Data: []byte("Don't forget me this weekend!")}, 391 | {Name: xmltokenizer.Name{Local: []byte("body"), Full: []byte("body")}, IsEndElement: true}, 392 | {Name: xmltokenizer.Name{Local: []byte("footer"), Full: []byte("footer")}, Data: []byte("&writer; ©right;")}, 393 | {Name: xmltokenizer.Name{Local: []byte("footer"), Full: []byte("footer")}, IsEndElement: true}, 394 | {Name: xmltokenizer.Name{Local: []byte("note"), Full: []byte("note")}, IsEndElement: 
true}, 395 | }}, 396 | } 397 | 398 | for i, tc := range tt { 399 | t.Run(fmt.Sprintf("[%d], %s", i, tc.filename), func(t *testing.T) { 400 | path := filepath.Join("testdata", tc.filename) 401 | f, err := os.Open(path) 402 | if err != nil { 403 | panic(err) 404 | } 405 | defer f.Close() 406 | 407 | tok := xmltokenizer.New(f, xmltokenizer.WithReadBufferSize(1)) 408 | for i := 0; ; i++ { 409 | token, err := tok.Token() 410 | if err == io.EOF { 411 | break 412 | } 413 | if err != nil { 414 | if !errors.Is(err, tc.err) { 415 | t.Fatalf("expected error: %v, got: %v", tc.err, err) 416 | } 417 | return 418 | } 419 | 420 | if diff := cmp.Diff(token, tc.expecteds[i]); diff != "" { 421 | t.Fatal(diff) 422 | } 423 | } 424 | }) 425 | } 426 | } 427 | 428 | func TestTokenOnGPXFiles(t *testing.T) { 429 | filepath.Walk("testdata", func(path string, info fs.FileInfo, _ error) error { 430 | t.Run(path, func(t *testing.T) { 431 | if info.IsDir() { 432 | return 433 | } 434 | if strings.ToLower(filepath.Ext(path)) != ".gpx" { 435 | return 436 | } 437 | 438 | data, err := os.ReadFile(path) 439 | if err != nil { 440 | t.Skip(err) 441 | } 442 | 443 | gpx1, err := gpx.UnmarshalWithXMLTokenizer(bytes.NewReader(data)) 444 | if err != nil { 445 | t.Fatalf("xmltokenizer: %v", err) 446 | } 447 | 448 | gpx2, err := gpx.UnmarshalWithStdlibXML(bytes.NewReader(data)) 449 | if err != nil { 450 | t.Fatalf("xml: %v", err) 451 | } 452 | 453 | if diff := cmp.Diff(gpx1, gpx2, 454 | cmp.Transformer("float64", func(x float64) uint64 { 455 | return math.Float64bits(x) 456 | }), 457 | ); diff != "" { 458 | t.Fatal(diff) 459 | } 460 | }) 461 | 462 | return nil 463 | }) 464 | } 465 | 466 | func TestTokenOnXLSXFiles(t *testing.T) { 467 | path := filepath.Join("testdata", "xlsx_sheet1.xml") 468 | 469 | data, err := os.ReadFile(path) 470 | if err != nil { 471 | t.Skip(err) 472 | } 473 | 474 | sheet1, err := xlsx.UnmarshalWithXMLTokenizer(bytes.NewReader(data)) 475 | if err != nil { 476 | t.Fatalf("xmltokenizer: 
%v", err) 477 | } 478 | sheet2, err := xlsx.UnmarshalWithStdlibXML(bytes.NewReader(data)) 479 | if err != nil { 480 | t.Fatalf("xml: %v", err) 481 | } 482 | 483 | if diff := cmp.Diff(sheet1, sheet2); diff != "" { 484 | t.Fatal(diff) 485 | } 486 | } 487 | 488 | func TestAutoGrowBufferCorrectness(t *testing.T) { 489 | path := filepath.Join("testdata", "xlsx_sheet1.xml") 490 | f, err := os.Open(path) 491 | if err != nil { 492 | panic(err) 493 | } 494 | defer f.Close() 495 | 496 | tok := xmltokenizer.New(f, 497 | xmltokenizer.WithReadBufferSize(1), 498 | ) 499 | 500 | var token xmltokenizer.Token 501 | var sheetData1 schema.SheetData 502 | loop: 503 | for { 504 | token, err = tok.Token() 505 | if err == io.EOF { 506 | break 507 | } 508 | if err != nil { 509 | t.Fatal(err) 510 | } 511 | 512 | switch string(token.Name.Local) { 513 | case "sheetData": 514 | se := xmltokenizer.GetToken().Copy(token) 515 | err = sheetData1.UnmarshalToken(tok, se) 516 | xmltokenizer.PutToken(se) 517 | if err != nil { 518 | t.Fatal(err) 519 | } 520 | break loop 521 | } 522 | } 523 | 524 | f2, err := os.Open(path) 525 | if err != nil { 526 | panic(err) 527 | } 528 | defer f2.Close() 529 | 530 | sheetData2, err := xlsx.UnmarshalWithStdlibXML(f2) 531 | if err != nil { 532 | t.Fatal(err) 533 | } 534 | 535 | if diff := cmp.Diff(sheetData1, sheetData2); diff != "" { 536 | t.Fatal(err) 537 | } 538 | } 539 | 540 | func TestRawTokenWithInmemXML(t *testing.T) { 541 | tt := []struct { 542 | name string 543 | xml string 544 | expecteds []string 545 | err error 546 | }{ 547 | { 548 | name: "simple xml happy flow", 549 | xml: ` 550 | 552 | 554 | World <>'" 白鵬翔 555 | &何; &is-it; 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | `, // Note: retrieved from stdlib xml test. 
564 | expecteds: []string{ 565 | "", 566 | "", 568 | "", 570 | "World <>'" 白鵬翔", 571 | "", 572 | "&何; &is-it;", 573 | "", 574 | "", 575 | "", 576 | "", 577 | "", 578 | "\n ", 579 | "", 580 | "", 581 | "", 582 | }, 583 | }, 584 | { 585 | name: "unexpected EOF truncated XML after `", 589 | "