├── .godocdown.md ├── .travis.yml ├── LICENSE.md ├── README.md ├── decoder.go ├── decoder_test.go ├── doc.go ├── go.mod ├── go.sum ├── unmarshal-error.go ├── unmarshal.go ├── unmarshal_test.go └── util.go /.godocdown.md: -------------------------------------------------------------------------------- 1 | # goq 2 | [](https://travis-ci.org/andrewstuart/goq) 3 | [](https://godoc.org/astuart.co/goq) 4 | [](https://coveralls.io/github/andrewstuart/goq?branch=master) 5 | [](https://goreportcard.com/report/astuart.co/goq) 6 | 7 | ## Example 8 | 9 | ```go 10 | import ( 11 | "log" 12 | "net/http" 13 | 14 | "astuart.co/goq" 15 | ) 16 | 17 | // Structured representation for github file name table 18 | type example struct { 19 | Title string `goquery:"h1"` 20 | Files []string `goquery:"table.files tbody tr.js-navigation-item td.content,text"` 21 | } 22 | 23 | func main() { 24 | res, err := http.Get("https://github.com/andrewstuart/goq") 25 | if err != nil { 26 | log.Fatal(err) 27 | } 28 | defer res.Body.Close() 29 | 30 | var ex example 31 | 32 | err = goq.NewDecoder(res.Body).Decode(&ex) 33 | if err != nil { 34 | log.Fatal(err) 35 | } 36 | 37 | log.Println(ex.Title, ex.Files) 38 | } 39 | ``` 40 | 41 | ## Details 42 | 43 | {{ .Emit }} 44 | 45 | ## TODO 46 | 47 | - Callable goquery methods with args, via reflection 48 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go_import_path: astuart.co/goq 3 | 4 | matrix: 5 | allow_failures: 6 | - go: tip 7 | 8 | before_install: 9 | - go get github.com/mattn/goveralls 10 | script: 11 | - $HOME/gopath/bin/goveralls -service=travis-ci 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2017 Andrew Stuart 3 | 4 | 5 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 19 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 21 | OR OTHER DEALINGS IN THE SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # goq 2 | [](https://travis-ci.org/andrewstuart/goq) 3 | [](https://godoc.org/astuart.co/goq) 4 | [](https://coveralls.io/github/andrewstuart/goq?branch=master) 5 | [](https://goreportcard.com/report/astuart.co/goq) 6 | 7 | ## Example 8 | 9 | ```go 10 | import ( 11 | "log" 12 | "net/http" 13 | 14 | "astuart.co/goq" 15 | ) 16 | 17 | // Structured representation for github file name table 18 | type example struct { 19 | Title string `goquery:"h1"` 20 | Files []string `goquery:"table.files tbody tr.js-navigation-item td.content,text"` 21 | } 22 | 23 | func main() { 24 | res, err := http.Get("https://github.com/andrewstuart/goq") 25 | if err != nil { 26 | log.Fatal(err) 27 | } 28 | defer res.Body.Close() 29 | 30 | var ex example 31 | 32 | err = goq.NewDecoder(res.Body).Decode(&ex) 33 | if err != nil { 34 | log.Fatal(err) 35 | } 36 | 37 | log.Println(ex.Title, ex.Files) 38 | } 39 | ``` 40 | 41 | ## Details 42 | 43 | # goq 44 | -- 45 | import "astuart.co/goq" 46 | 47 | Package goq was built to allow users to declaratively unmarshal HTML into go 48 | structs using struct tags composed of css selectors. 49 | 50 | I've made a best effort to behave very similarly to JSON and XML decoding as 51 | well as exposing as much information as possible in the event of an error to 52 | help you debug your Unmarshaling issues. 53 | 54 | When creating struct types to be unmarshaled into, the following general rules 55 | apply: 56 | 57 | - Any type that implements the Unmarshaler interface will be passed a slice of 58 | *html.Node so that manual unmarshaling may be done. This takes the highest 59 | precedence. 60 | 61 | - Any struct fields may be annotated with goquery metadata, which takes the form 62 | of an element selector followed by arbitrary comma-separated "value selectors." 
63 | 64 | - A value selector may be one of `html`, `text`, or `[someAttrName]`. `html` and 65 | `text` will result in the methods of the same name being called on the 66 | `*goquery.Selection` to obtain the value. `[someAttrName]` will result in 67 | `*goquery.Selection.Attr("someAttrName")` being called for the value. 68 | 69 | - A primitive value type will default to the text value of the resulting nodes 70 | if no value selector is given. 71 | 72 | - At least one value selector is required for maps, to determine the map key. 73 | The key type must follow both the rules applicable to Go map indexing, as well 74 | as these unmarshaling rules. The value of each key will be unmarshaled in the 75 | same way the element value is unmarshaled. 76 | 77 | - For maps, keys will be retrieved from the *same level* of the DOM. The key 78 | selector may be arbitrarily nested, but only the first level of children with 79 | any number of matching elements will be used. 80 | 81 | - For maps, any values *must* be nested *below* the level of the key selector. 82 | Parents or siblings of the element matched by the key selector will not be 83 | considered. 84 | 85 | - Once used, a "value selector" will be shifted off of the comma-separated list. 86 | This allows you to nest arbitrary levels of value selectors. For example, the 87 | type `[]map[string][]string` would require one selector for the map key, and 88 | take an optional second selector for the values of the string slice. 89 | 90 | - Any struct type encountered in nested types (e.g. map[string]SomeStruct) will 91 | override any remaining "value selectors" that had not been used. For example, 92 | given: 93 | 94 | struct S { 95 | F string `goquery:",[bang]"` 96 | } 97 | 98 | struct { 99 | T map[string]S `goquery:"#someId,[foo],[bar],[baz]"` 100 | } 101 | 102 | `[foo]` will be used to determine the string map key, but `[bar]` and `[baz]` 103 | will be ignored, with the `[bang]` tag present on the S struct type taking precedence. 
104 | 105 | ## Usage 106 | 107 | #### func NodeSelector 108 | 109 | ```go 110 | func NodeSelector(nodes []*html.Node) *goquery.Selection 111 | ``` 112 | NodeSelector is a quick utility function to get a goquery.Selection from a slice 113 | of *html.Node. Useful for performing unmarshaling, since the decision was made 114 | to use []*html.Node for maximum flexibility. 115 | 116 | #### func Unmarshal 117 | 118 | ```go 119 | func Unmarshal(bs []byte, v interface{}) error 120 | ``` 121 | Unmarshal takes a byte slice and a destination pointer to any interface{}, and 122 | unmarshals the document into the destination based on the rules above. Any error 123 | returned here will likely be of type CannotUnmarshalError, though an initial 124 | goquery error will pass through directly. 125 | 126 | #### func UnmarshalSelection 127 | 128 | ```go 129 | func UnmarshalSelection(s *goquery.Selection, iface interface{}) error 130 | ``` 131 | UnmarshalSelection will unmarshal a goquery.goquery.Selection into an interface 132 | appropriately annoated with goquery tags. 133 | 134 | #### type CannotUnmarshalError 135 | 136 | ```go 137 | type CannotUnmarshalError struct { 138 | Err error 139 | Val string 140 | FldOrIdx interface{} 141 | } 142 | ``` 143 | 144 | CannotUnmarshalError represents an error returned by the goquery Unmarshaler and 145 | helps consumers in programmatically diagnosing the cause of their error. 146 | 147 | #### func (*CannotUnmarshalError) Error 148 | 149 | ```go 150 | func (e *CannotUnmarshalError) Error() string 151 | ``` 152 | 153 | #### type Decoder 154 | 155 | ```go 156 | type Decoder struct { 157 | } 158 | ``` 159 | 160 | Decoder implements the same API you will see in encoding/xml and encoding/json 161 | except that we do not currently support proper streaming decoding as it is not 162 | supported by goquery upstream. 
163 | 164 | #### func NewDecoder 165 | 166 | ```go 167 | func NewDecoder(r io.Reader) *Decoder 168 | ``` 169 | NewDecoder returns a new decoder given an io.Reader 170 | 171 | #### func (*Decoder) Decode 172 | 173 | ```go 174 | func (d *Decoder) Decode(dest interface{}) error 175 | ``` 176 | Decode will unmarshal the contents of the decoder when given an instance of an 177 | annotated type as its argument. It will return any errors encountered during 178 | either parsing the document or unmarshaling into the given object. 179 | 180 | #### type Unmarshaler 181 | 182 | ```go 183 | type Unmarshaler interface { 184 | UnmarshalHTML([]*html.Node) error 185 | } 186 | ``` 187 | 188 | Unmarshaler allows for custom implementations of unmarshaling logic 189 | 190 | ## TODO 191 | 192 | - Callable goquery methods with args, via reflection 193 | -------------------------------------------------------------------------------- /decoder.go: -------------------------------------------------------------------------------- 1 | package goq 2 | 3 | import ( 4 | "io" 5 | "sync" 6 | 7 | "github.com/PuerkitoBio/goquery" 8 | ) 9 | 10 | // Decoder implements the same API you will see in encoding/xml and 11 | // encoding/json except that we do not currently support proper streaming 12 | // decoding as it is not supported by goquery upstream. 13 | type Decoder struct { 14 | err error 15 | doc *goquery.Document 16 | cache sync.Map 17 | } 18 | 19 | // NewDecoder returns a new decoder given an io.Reader 20 | func NewDecoder(r io.Reader) *Decoder { 21 | d := &Decoder{} 22 | d.doc, d.err = goquery.NewDocumentFromReader(r) 23 | return d 24 | } 25 | 26 | // Decode will unmarshal the contents of the decoder when given an instance of 27 | // an annotated type as its argument. It will return any errors encountered 28 | // during either parsing the document or unmarshaling into the given object. 
29 | func (d *Decoder) Decode(dest interface{}) error { 30 | if d.err != nil { 31 | return d.err 32 | } 33 | if d.doc == nil { 34 | return &CannotUnmarshalError{ 35 | Reason: "resulting document was nil", 36 | } 37 | } 38 | 39 | return UnmarshalSelection(d.doc.Selection, dest) 40 | } 41 | -------------------------------------------------------------------------------- /decoder_test.go: -------------------------------------------------------------------------------- 1 | package goq 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestDecoder(t *testing.T) { 11 | asrt := assert.New(t) 12 | 13 | var p page 14 | 15 | asrt.NoError(NewDecoder(strings.NewReader(hnPage)).Decode(&p)) 16 | asrt.Len(p.Items, 30) 17 | } 18 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Package goq was built to allow users to declaratively unmarshal HTML into go 2 | // structs using struct tags composed of css selectors. 3 | // 4 | // I've made a best effort to behave very similarly to JSON and XML decoding as 5 | // well as exposing as much information as possible in the event of an error to 6 | // help you debug your Unmarshaling issues. 7 | // 8 | // When creating struct types to be unmarshaled into, the following general 9 | // rules apply: 10 | // 11 | // - Any type that implements the Unmarshaler interface will be passed a slice 12 | // of *html.Node so that manual unmarshaling may be done. This takes the 13 | // highest precedence. 14 | // 15 | // - Any struct fields may be annotated with goquery metadata, which takes the 16 | // form of an element selector followed by arbitrary comma-separated "value 17 | // selectors." 18 | // 19 | // - A value selector may be one of `html`, `text`, or `[someAttrName]`. 
`html` 20 | // and `text` will result in the methods of the same name being called on the 21 | // `*goquery.Selection` to obtain the value. `[someAttrName]` will result in 22 | // `*goquery.Selection.Attr("someAttrName")` being called for the value. 23 | // 24 | // - A primitive value type will default to the text value of the resulting 25 | // nodes if no value selector is given. 26 | // 27 | // - At least one value selector is required for maps, to determine the map key. 28 | // The key type must follow both the rules applicable to Go map indexing, as 29 | // well as these unmarshaling rules. The value of each key will be unmarshaled 30 | // in the same way the element value is unmarshaled. 31 | // 32 | // - For maps, keys will be retrieved from the *same level* of the DOM. The key 33 | // selector may be arbitrarily nested, but only the first level of children 34 | // with any number of matching elements will be used. 35 | // 36 | // - For maps, any values *must* be nested *below* the level of the key 37 | // selector. Parents or siblings of the element matched by the key selector will 38 | // not be considered. 39 | // 40 | // - Once used, a "value selector" will be shifted off of the comma-separated 41 | // list. This allows you to nest arbitrary levels of value selectors. For 42 | // example, the type `[]map[string][]string` would require one selector for the 43 | // map key, and take an optional second selector for the values of the string 44 | // slice. 45 | // 46 | // - Any struct type encountered in nested types (e.g. map[string]SomeStruct) 47 | // will override any remaining "value selectors" that had not been used. 
For 48 | // example, given: 49 | // struct S { 50 | // F string `goquery:",[bang]"` 51 | // } 52 | // 53 | // struct { 54 | // T map[string]S `goquery:"#someId,[foo],[bar],[baz]"` 55 | // } 56 | // `[foo]` will be used to determine the string map key,but `[bar]` and `[baz]` 57 | // will be ignored, with the `[bang]` tag present S struct type taking 58 | // precedence. 59 | package goq 60 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module astuart.co/goq 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.7.1 7 | github.com/stretchr/testify v1.3.0 8 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 // indirect 9 | golang.org/x/net v0.0.0-20210825183410-e898025ed96a 10 | ) 11 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk= 2 | github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= 3 | github.com/PuerkitoBio/goquery v1.7.1 h1:oE+T06D+1T7LNrn91B4aERsRIeCLJ/oPSa6xB9FPnz4= 4 | github.com/PuerkitoBio/goquery v1.7.1/go.mod h1:XY0pP4kfraEmmV1O7Uf6XyjoslwsneBbgeDjLYuN8xY= 5 | github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= 6 | github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 7 | github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= 8 | github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= 9 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 10 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 11 | github.com/pmezard/go-difflib v1.0.0 
h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 12 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 13 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 14 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 15 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 16 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 17 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 18 | golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 19 | golang.org/x/net v0.0.0-20190606173856-1492cefac77f h1:IWHgpgFqnL5AhBUBZSgBdjl2vkQUEzcY+JNKWfcgAU0= 20 | golang.org/x/net v0.0.0-20190606173856-1492cefac77f/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= 21 | golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 22 | golang.org/x/net v0.0.0-20210825183410-e898025ed96a h1:bRuuGXV8wwSdGTB+CtJf+FjgO1APK1CoO39T4BN/XBw= 23 | golang.org/x/net v0.0.0-20210825183410-e898025ed96a/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 24 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 25 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 26 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 27 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 28 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 29 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 30 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 31 | -------------------------------------------------------------------------------- /unmarshal-error.go: -------------------------------------------------------------------------------- 1 | package goq 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | ) 7 | 8 | // All "Reason" fields within CannotUnmarshalError will be constants and part of 9 | // this list 10 | const ( 11 | nonPointer = "non-pointer value" 12 | nilValue = "destination argument is nil" 13 | documentReadError = "error reading goquery document" 14 | arrayLengthMismatch = "array length does not match document elements found" 15 | customUnmarshalError = "a custom Unmarshaler implementation threw an error" 16 | typeConversionError = "a type conversion error occurred" 17 | mapKeyUnmarshalError = "error unmarshaling a map key" 18 | missingValueSelector = "at least one value selector must be passed to use as map index" 19 | ) 20 | 21 | // CannotUnmarshalError represents an error returned by the goquery Unmarshaler 22 | // and helps consumers in programmatically diagnosing the cause of their error. 23 | type CannotUnmarshalError struct { 24 | Err error 25 | Val string 26 | FldOrIdx interface{} 27 | 28 | V reflect.Value 29 | Reason string 30 | } 31 | 32 | // This type is a mid-level abstraction to help understand the error printing logic 33 | type errChain struct { 34 | chain []*CannotUnmarshalError 35 | val string 36 | tail error 37 | } 38 | 39 | // tPath returns the type path in the same string format one might use to access 40 | // the nested value in go code. This should hopefully help make debugging easier. 
41 | func (e errChain) tPath() string { 42 | nest := "" 43 | 44 | for _, err := range e.chain { 45 | if err.FldOrIdx != nil { 46 | switch nesting := err.FldOrIdx.(type) { 47 | case string: 48 | switch err.V.Type().Kind() { 49 | case reflect.Map: 50 | nest += fmt.Sprintf("[%q]", nesting) 51 | case reflect.Struct: 52 | nest += fmt.Sprintf(".%s", nesting) 53 | } 54 | case int: 55 | nest += fmt.Sprintf("[%d]", nesting) 56 | case *int: 57 | nest += fmt.Sprintf("[%d]", *nesting) 58 | default: 59 | fmt.Printf("err.FldOrIdx = %#v\n", err.FldOrIdx) 60 | nest += fmt.Sprintf("[%v]", nesting) 61 | } 62 | } 63 | } 64 | 65 | return nest 66 | } 67 | 68 | func (e errChain) last() *CannotUnmarshalError { 69 | return e.chain[len(e.chain)-1] 70 | } 71 | 72 | // Error gives a human-readable error message for debugging purposes. 73 | func (e errChain) Error() string { 74 | last := e.last() 75 | 76 | // Avoid panic if we cannot get a type name for the Value 77 | t := "unknown: invalid value" 78 | if last.V.IsValid() { 79 | t = last.V.Type().String() 80 | } 81 | 82 | msg := "could not unmarshal " 83 | 84 | if e.val != "" { 85 | msg += fmt.Sprintf("value %q ", e.val) 86 | } 87 | 88 | msg += fmt.Sprintf( 89 | "into '%s%s' (type %s): %s", 90 | e.chain[0].V.Type(), 91 | e.tPath(), 92 | t, 93 | last.Reason, 94 | ) 95 | 96 | // If a generic error was reported elsewhere, report its message last 97 | if e.tail != nil { 98 | msg = msg + ": " + e.tail.Error() 99 | } 100 | 101 | return msg 102 | } 103 | 104 | // Traverse e.Err, printing hopefully helpful type info until there are no more 105 | // chained errors. 
106 | func (e *CannotUnmarshalError) unwind() *errChain { 107 | str := &errChain{chain: []*CannotUnmarshalError{}} 108 | for { 109 | str.chain = append(str.chain, e) 110 | 111 | if e.Val != "" { 112 | str.val = e.Val 113 | } 114 | 115 | // Terminal error was of type *CannotUnmarshalError and had no children 116 | if e.Err == nil { 117 | return str 118 | } 119 | 120 | if e2, ok := e.Err.(*CannotUnmarshalError); ok { 121 | e = e2 122 | continue 123 | } 124 | 125 | // Child error was not a *CannotUnmarshalError; print its message 126 | str.tail = e.Err 127 | return str 128 | } 129 | } 130 | 131 | func (e *CannotUnmarshalError) Error() string { 132 | return e.unwind().Error() 133 | } 134 | -------------------------------------------------------------------------------- /unmarshal.go: -------------------------------------------------------------------------------- 1 | package goq 2 | 3 | import ( 4 | "bytes" 5 | "reflect" 6 | "strconv" 7 | "strings" 8 | "sync" 9 | 10 | "github.com/PuerkitoBio/goquery" 11 | 12 | "golang.org/x/net/html" 13 | ) 14 | 15 | // Unmarshaler allows for custom implementations of unmarshaling logic 16 | type Unmarshaler interface { 17 | UnmarshalHTML([]*html.Node) error 18 | } 19 | 20 | // NodeSelector is a quick utility function to get a goquery.Selection from a 21 | // slice of *html.Node. Useful for performing unmarshaling, since the decision 22 | // was made to use []*html.Node for maximum flexibility. 23 | func NodeSelector(nodes []*html.Node) *goquery.Selection { 24 | sel := &goquery.Selection{} 25 | return sel.AddNodes(nodes...) 26 | } 27 | 28 | type valFunc func(*goquery.Selection) string 29 | 30 | type goqueryTag string 31 | 32 | const ( 33 | prePfx = '!' 
34 | tagName = "goquery" 35 | ignoreTag = "!ignore" 36 | ) 37 | 38 | func (tag goqueryTag) preprocess(s *goquery.Selection) *goquery.Selection { 39 | arr := strings.Split(string(tag), ",") 40 | var offset int 41 | for len(arr)-1 > offset && arr[offset][0] == prePfx { 42 | meth := arr[offset][1:] 43 | v := reflect.ValueOf(s).MethodByName(meth) 44 | if !v.IsValid() { 45 | return s 46 | } 47 | 48 | result := v.Call(nil) 49 | 50 | if sel, ok := result[0].Interface().(*goquery.Selection); ok { 51 | s = sel 52 | } 53 | offset++ 54 | } 55 | return s 56 | } 57 | 58 | func (tag goqueryTag) selector(which int) string { 59 | arr := strings.Split(string(tag), ",") 60 | if which > len(arr)-1 { 61 | return "" 62 | } 63 | var offset int 64 | for len(arr) > offset && arr[offset][0] == prePfx { 65 | offset++ 66 | } 67 | return arr[which+offset] 68 | } 69 | 70 | var ( 71 | textVal valFunc = func(s *goquery.Selection) string { 72 | return strings.TrimSpace(s.Text()) 73 | } 74 | htmlVal = func(s *goquery.Selection) string { 75 | str, _ := s.Html() 76 | return strings.TrimSpace(str) 77 | } 78 | 79 | vfCache sync.Map 80 | ) 81 | 82 | func attrFunc(attr string) valFunc { 83 | return func(s *goquery.Selection) string { 84 | str, _ := s.Attr(attr) 85 | return str 86 | } 87 | } 88 | 89 | func (tag goqueryTag) valFunc() valFunc { 90 | 91 | if fn, ok := vfCache.Load(tag); ok { 92 | return fn.(valFunc) 93 | } 94 | 95 | srcArr := strings.Split(string(tag), ",") 96 | if len(srcArr) < 2 { 97 | vfCache.Store(tag, textVal) 98 | return textVal 99 | } 100 | 101 | src := srcArr[1] 102 | 103 | var f valFunc 104 | switch { 105 | case src[0] == '[': 106 | // [someattr] will return value of .Attr("someattr") 107 | attr := src[1 : len(src)-1] 108 | f = attrFunc(attr) 109 | case src == "html": 110 | f = htmlVal 111 | case src == "text": 112 | f = textVal 113 | default: 114 | f = textVal 115 | } 116 | 117 | vfCache.Store(tag, f) 118 | return f 119 | } 120 | 121 | // popVal should allow us to handle 
arbitrarily nested maps as well as the 122 | // cleanly handling the possiblity of map[literal]literal by just delegating 123 | // back to `unmarshalByType`. 124 | func (tag goqueryTag) popVal() goqueryTag { 125 | arr := strings.Split(string(tag), ",") 126 | if len(arr) < 2 { 127 | return tag 128 | } 129 | newA := []string{arr[0]} 130 | newA = append(newA, arr[2:]...) 131 | 132 | return goqueryTag(strings.Join(newA, ",")) 133 | } 134 | 135 | // Unmarshal takes a byte slice and a destination pointer to any 136 | // interface{}, and unmarshals the document into the destination based on the 137 | // rules above. Any error returned here will likely be of type 138 | // CannotUnmarshalError, though an initial goquery error will pass through 139 | // directly. 140 | func Unmarshal(bs []byte, v interface{}) error { 141 | d, err := goquery.NewDocumentFromReader(bytes.NewReader(bs)) 142 | 143 | if err != nil { 144 | return err 145 | } 146 | 147 | return UnmarshalSelection(d.Selection, v) 148 | } 149 | 150 | func wrapUnmErr(err error, v reflect.Value) error { 151 | if err == nil { 152 | return nil 153 | } 154 | 155 | return &CannotUnmarshalError{ 156 | V: v, 157 | Reason: customUnmarshalError, 158 | Err: err, 159 | } 160 | } 161 | 162 | // UnmarshalSelection will unmarshal a goquery.goquery.Selection into an interface 163 | // appropriately annoated with goquery tags. 
164 | func UnmarshalSelection(s *goquery.Selection, iface interface{}) error { 165 | v := reflect.ValueOf(iface) 166 | 167 | // Must come before v.IsNil() else IsNil panics on NonPointer value 168 | if v.Kind() != reflect.Ptr { 169 | return &CannotUnmarshalError{V: v, Reason: nonPointer} 170 | } 171 | 172 | if iface == nil || v.IsNil() { 173 | return &CannotUnmarshalError{V: v, Reason: nilValue} 174 | } 175 | 176 | u, v := indirect(v) 177 | 178 | if u != nil { 179 | return wrapUnmErr(u.UnmarshalHTML(s.Nodes), v) 180 | } 181 | 182 | return unmarshalByType(s, v, "") 183 | } 184 | 185 | func unmarshalByType(s *goquery.Selection, v reflect.Value, tag goqueryTag) error { 186 | u, v := indirect(v) 187 | 188 | if u != nil { 189 | return wrapUnmErr(u.UnmarshalHTML(s.Nodes), v) 190 | } 191 | 192 | // Handle special cases where we can just set the value directly 193 | switch val := v.Interface().(type) { 194 | case []*html.Node: 195 | val = append(val, s.Nodes...) 196 | v.Set(reflect.ValueOf(val)) 197 | return nil 198 | } 199 | 200 | t := v.Type() 201 | 202 | switch t.Kind() { 203 | case reflect.Struct: 204 | return unmarshalStruct(s, v) 205 | case reflect.Slice: 206 | return unmarshalSlice(s, v, tag) 207 | case reflect.Array: 208 | return unmarshalArray(s, v, tag) 209 | case reflect.Map: 210 | return unmarshalMap(s, v, tag) 211 | default: 212 | vf := tag.valFunc() 213 | str := vf(s) 214 | err := unmarshalLiteral(str, v) 215 | if err != nil { 216 | return &CannotUnmarshalError{ 217 | V: v, 218 | Reason: typeConversionError, 219 | Err: err, 220 | Val: str, 221 | } 222 | } 223 | return nil 224 | } 225 | } 226 | 227 | func unmarshalLiteral(s string, v reflect.Value) error { 228 | t := v.Type() 229 | 230 | switch t.Kind() { 231 | case reflect.Interface: 232 | if t.NumMethod() == 0 { 233 | // For empty interfaces, just set to a string 234 | nv := reflect.New(reflect.TypeOf(s)).Elem() 235 | nv.Set(reflect.ValueOf(s)) 236 | v.Set(nv) 237 | } 238 | case reflect.Bool: 239 | i, err := 
strconv.ParseBool(s) 240 | if err != nil { 241 | return err 242 | } 243 | v.SetBool(i) 244 | case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: 245 | i, err := strconv.ParseInt(s, 10, 64) 246 | if err != nil { 247 | return err 248 | } 249 | v.SetInt(i) 250 | case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: 251 | i, err := strconv.ParseUint(s, 10, 64) 252 | if err != nil { 253 | return err 254 | } 255 | v.SetUint(i) 256 | case reflect.Float32, reflect.Float64: 257 | i, err := strconv.ParseFloat(s, 64) 258 | if err != nil { 259 | return err 260 | } 261 | v.SetFloat(i) 262 | case reflect.String: 263 | v.SetString(s) 264 | } 265 | return nil 266 | } 267 | 268 | func unmarshalStruct(s *goquery.Selection, v reflect.Value) error { 269 | t := v.Type() 270 | 271 | for i := 0; i < t.NumField(); i++ { 272 | tag := goqueryTag(t.Field(i).Tag.Get(tagName)) 273 | 274 | if tag == ignoreTag { 275 | continue 276 | } 277 | 278 | // If tag is empty and the object doesn't implement Unmarshaler, skip 279 | if tag == "" { 280 | if u, _ := indirect(v.Field(i)); u == nil { 281 | continue 282 | } 283 | } 284 | 285 | sel := tag.preprocess(s) 286 | if tag != "" { 287 | selStr := tag.selector(0) 288 | sel = sel.Find(selStr) 289 | } 290 | 291 | err := unmarshalByType(sel, v.Field(i), tag) 292 | if err != nil { 293 | return &CannotUnmarshalError{ 294 | Reason: typeConversionError, 295 | Err: err, 296 | V: v, 297 | FldOrIdx: t.Field(i).Name, 298 | } 299 | } 300 | } 301 | return nil 302 | } 303 | 304 | func unmarshalArray(s *goquery.Selection, v reflect.Value, tag goqueryTag) error { 305 | if v.Type().Len() != len(s.Nodes) { 306 | return &CannotUnmarshalError{ 307 | Reason: arrayLengthMismatch, 308 | V: v, 309 | } 310 | } 311 | 312 | for i := 0; i < v.Type().Len(); i++ { 313 | err := unmarshalByType(s.Eq(i), v.Index(i), tag) 314 | if err != nil { 315 | return &CannotUnmarshalError{ 316 | Reason: typeConversionError, 317 | Err: err, 318 | 
V: v, 319 | FldOrIdx: i, 320 | } 321 | } 322 | } 323 | 324 | return nil 325 | } 326 | 327 | func unmarshalSlice(s *goquery.Selection, v reflect.Value, tag goqueryTag) error { 328 | slice := v 329 | eleT := v.Type().Elem() 330 | 331 | for i := 0; i < s.Length(); i++ { 332 | newV := reflect.New(TypeDeref(eleT)) 333 | 334 | err := unmarshalByType(s.Eq(i), newV, tag) 335 | 336 | if err != nil { 337 | return &CannotUnmarshalError{ 338 | Reason: typeConversionError, 339 | Err: err, 340 | V: v, 341 | FldOrIdx: i, 342 | } 343 | } 344 | 345 | if eleT.Kind() != reflect.Ptr { 346 | newV = newV.Elem() 347 | } 348 | 349 | v = reflect.Append(v, newV) 350 | } 351 | 352 | slice.Set(v) 353 | return nil 354 | } 355 | 356 | func childrenUntilMatch(s *goquery.Selection, sel string) *goquery.Selection { 357 | orig := s 358 | s = s.Children() 359 | for s.Length() != 0 && s.Filter(sel).Length() == 0 { 360 | s = s.Children() 361 | } 362 | if s.Length() == 0 { 363 | return orig 364 | } 365 | return s.Filter(sel) 366 | } 367 | 368 | func unmarshalMap(s *goquery.Selection, v reflect.Value, tag goqueryTag) error { 369 | // Make new map here because indirect for some Reason doesn't help us out 370 | if v.IsNil() { 371 | v.Set(reflect.MakeMap(v.Type())) 372 | } 373 | 374 | keyT, eleT := v.Type().Key(), v.Type().Elem() 375 | 376 | if tag.selector(1) == "" { 377 | // We need minimum one value selector to determine the map key 378 | return &CannotUnmarshalError{ 379 | Reason: missingValueSelector, 380 | V: v, 381 | } 382 | } 383 | 384 | valTag := tag 385 | 386 | // Find children at the same level that match the given selector 387 | s = childrenUntilMatch(s, tag.selector(1)) 388 | // Then augment the selector we will pass down to the next unmarshal step 389 | valTag = valTag.popVal() 390 | 391 | var err error 392 | s.EachWithBreak(func(_ int, subS *goquery.Selection) bool { 393 | newK, newV := reflect.New(TypeDeref(keyT)), reflect.New(TypeDeref(eleT)) 394 | 395 | err = unmarshalByType(subS, newK, 
tag) 396 | if err != nil { 397 | err = &CannotUnmarshalError{ 398 | Reason: mapKeyUnmarshalError, 399 | V: v, 400 | Err: err, 401 | FldOrIdx: newK.Interface(), 402 | Val: valTag.valFunc()(subS), 403 | } 404 | return false 405 | } 406 | 407 | err = unmarshalByType(subS, newV, valTag) 408 | if err != nil { 409 | return false 410 | } 411 | 412 | if eleT.Kind() != reflect.Ptr { 413 | newV = newV.Elem() 414 | } 415 | if keyT.Kind() != reflect.Ptr { 416 | newK = newK.Elem() 417 | } 418 | 419 | v.SetMapIndex(newK, newV) 420 | 421 | return true 422 | }) 423 | 424 | if err != nil { 425 | return &CannotUnmarshalError{ 426 | Reason: typeConversionError, 427 | Err: err, 428 | V: v, 429 | } 430 | } 431 | 432 | return nil 433 | } 434 | -------------------------------------------------------------------------------- /unmarshal_test.go: -------------------------------------------------------------------------------- 1 | package goq 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | "strings" 7 | "testing" 8 | 9 | "golang.org/x/net/html" 10 | 11 | "github.com/PuerkitoBio/goquery" 12 | "github.com/stretchr/testify/assert" 13 | ) 14 | 15 | const testPage = ` 16 | 17 |
18 |