├── LICENSE ├── example └── main.go ├── funcs_test.go ├── extract.go ├── test └── types.go ├── funcs.go ├── extract_test.go ├── README.md ├── sq.go └── sq_test.go /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 Empty Interface 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
9 | 10 | -------------------------------------------------------------------------------- /example/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "net/http" 7 | "net/url" 8 | "os" 9 | "time" 10 | 11 | "github.com/emptyinterface/sq" 12 | ) 13 | 14 | func main() { 15 | 16 | if len(os.Args) != 2 { 17 | fmt.Println(os.Args[0], "") 18 | os.Exit(1) 19 | } 20 | 21 | const site = "https://twitter.com/" 22 | 23 | var page struct { 24 | Title string `sq:"title | text"` 25 | Tweets []struct { 26 | Text string `sq:"p.tweet-text | text"` 27 | AuthorName string `sq:"strong.fullname | text"` 28 | Username string `sq:"span.username | text"` 29 | Link *url.URL `sq:"a.js-permalink | attr(href)"` 30 | Created time.Time `sq:"a.tweet-timestamp | attr(title) | time(3:04 PM - _2 Jan 2006)"` 31 | Retweets int `sq:"span.ProfileTweet-action--retweet > span | attr(data-tweet-stat-count)"` 32 | Likes int `sq:"span.ProfileTweet-action--favorite > span | attr(data-tweet-stat-count)"` 33 | } `sq:"div.content"` 34 | } 35 | 36 | resp, err := http.Get(site + os.Args[1]) 37 | if err != nil { 38 | log.Fatal(err) 39 | } 40 | defer resp.Body.Close() 41 | 42 | for _, err := range sq.Scrape(&page, resp.Body) { 43 | fmt.Println(err) 44 | } 45 | 46 | fmt.Printf("%s\n\n", page.Title) 47 | 48 | for _, tweet := range page.Tweets { 49 | fmt.Printf("%s (%s)\n", tweet.Username, tweet.Created.Format("2006/01/02")) 50 | fmt.Println(tweet.Text) 51 | fmt.Printf("(Likes: %d, Retweets: %d)\n\n", tweet.Likes, tweet.Retweets) 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /funcs_test.go: -------------------------------------------------------------------------------- 1 | package sq 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestNilFuncs(t *testing.T) { 10 | 11 | s, err := (&parser{}).parse("string") 12 | if err != nil { 13 | 
t.Errorf("Expected nil, got %q", err) 14 | } 15 | if s != "string" { 16 | t.Errorf("Expected %q, got %q", "string", s) 17 | } 18 | 19 | v, err := (&loader{}).load(nil, "string") 20 | if err != nil { 21 | t.Errorf("Expected nil, got %q", err) 22 | } 23 | if v != "string" { 24 | t.Errorf("Expected %q, got %q", "string", v) 25 | } 26 | 27 | } 28 | 29 | func TestRegexp(t *testing.T) { 30 | 31 | f := parseFuncs["regexp"] 32 | 33 | tests := []struct { 34 | input, pattern, output string 35 | err error 36 | }{ 37 | {"test", "s", "s", nil}, 38 | {"the rain in spain falls", " in (\\w+)", "spain", nil}, 39 | {"the rain in spain falls", " in (\\w+)", "spain", nil}, 40 | {"the rain in spain falls", " rain(\\S?) ", "spain", ErrNoRegexpMatch}, 41 | {"the rain in spain falls", " dogs ", "spain", ErrNoRegexpMatch}, 42 | {"the rain in spain falls", "(", "spain", errors.New("error parsing regexp: missing closing ): `(`")}, 43 | } 44 | 45 | for _, test := range tests { 46 | ouput, err := f(test.input, test.pattern) 47 | if err != nil { 48 | if err.Error() != test.err.Error() { 49 | t.Errorf("Expected %q, got %q", test.err, err) 50 | } 51 | continue 52 | } 53 | if ouput != test.output { 54 | t.Errorf("Expected %q, got %q", test.output, ouput) 55 | } 56 | } 57 | 58 | } 59 | 60 | func TestTime(t *testing.T) { 61 | 62 | f := loadFuncs["time"] 63 | 64 | tests := []struct { 65 | input, layout string 66 | output time.Time 67 | }{ 68 | {"2006", "2006", time.Date(2006, 1, 1, 0, 0, 0, 0, time.UTC)}, 69 | {"2016 05 23", "2006 01 02", time.Date(2016, 5, 23, 0, 0, 0, 0, time.UTC)}, 70 | } 71 | 72 | for _, test := range tests { 73 | ouput, err := f(nil, test.input, test.layout) 74 | if err != nil { 75 | t.Error(err) 76 | } 77 | if ouput.(time.Time) != test.output { 78 | t.Errorf("Expected %v, got %v", test.output, ouput) 79 | } 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /extract.go: 
// trimAccessor reduces an accessor expression such as "attr(href)" to its
// argument. It removes prefix from the front of s, then one leading "(" and
// one trailing ")", and finally trims surrounding whitespace.
func trimAccessor(s, prefix string) string {
	inner := strings.TrimPrefix(strings.TrimPrefix(s, prefix), "(")
	return strings.TrimSpace(strings.TrimSuffix(inner, ")"))
}

// parseFunctionSignature splits a pipeline stage of the form "name(args)"
// into its name and raw argument text. The name is everything before the
// first "(" and the arguments are everything between that and the last ")".
// With no "(" the whole string is the name; with no ")" after the "(" the
// arguments are empty.
func parseFunctionSignature(s string) (string, string) {

	open := strings.IndexByte(s, '(')
	if open < 0 {
		return s, ""
	}

	name := s[:open]
	if end := strings.LastIndexByte(s, ')'); end > open {
		return name, s[open+1 : end]
	}

	return name, ""

}
part 86 | default: 87 | return nil, fmt.Errorf("Bad accessor: %q", part) 88 | } 89 | default: 90 | name, args := parseFunctionSignature(part) 91 | if pf, exists := parseFuncs[name]; exists { 92 | p.parsers = append(p.parsers, parser{f: pf, args: args}) 93 | } else if lf, exists := loadFuncs[name]; exists { 94 | p.loader = &loader{lf, args} 95 | } else { 96 | return nil, fmt.Errorf("%q not registered func", name) 97 | } 98 | } 99 | } 100 | if p.selector == "" { 101 | if strings.Contains(string(tag), "sq:") { 102 | return nil, fmt.Errorf("Bad tag: %q", tag) 103 | } 104 | return nil, ErrTagNotFound 105 | } 106 | return p, nil 107 | } 108 | -------------------------------------------------------------------------------- /test/types.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import ( 4 | "net/url" 5 | "time" 6 | 7 | "github.com/PuerkitoBio/goquery" 8 | "github.com/aymerick/douceur/css" 9 | "github.com/robertkrimen/otto/ast" 10 | "golang.org/x/net/html" 11 | ) 12 | 13 | type ( 14 | Row struct { 15 | private int 16 | String1 string `sq:"td:nth-child(1) | text"` 17 | String2 string `sq:"td:nth-child(2) | text"` 18 | String3 string `sq:"td:nth-child(3) | text"` 19 | RowMarkup string `sq:" . 
| html"` 20 | } 21 | CustomType string 22 | TextType struct { 23 | private string 24 | privatepointer *string 25 | Struct Row `sq:"table.list tr:nth-child(2)"` 26 | StructSlice []*Row `sq:"table.list tr"` 27 | Array [3]int `sq:"p.array | text"` 28 | Slice []float64 `sq:"p.slice | text | regexp([.\\d]+)"` 29 | ByteSlice []byte `sq:"p.byteslice | text"` 30 | EightByteArray [8]byte `sq:"p.eightbytearray | text"` 31 | Bool bool `sq:"p.bool | text"` 32 | Byte byte `sq:"p.byte | text"` 33 | Int int `sq:"p.int | text"` 34 | Int8 int8 `sq:"p.int8 | text"` 35 | Int16 int16 `sq:"p.int16 | text"` 36 | Int32 int32 `sq:"p.int32 | text"` 37 | Int64 int64 `sq:"p.int64 | text"` 38 | Uint *uint `sq:"p.uint | text"` 39 | Uint8 uint8 `sq:"p.uint8 | text"` 40 | Uint16 uint16 `sq:"p.uint16 | text"` 41 | Uint32 uint32 `sq:"p.uint32 | text"` 42 | Uint64 uint64 `sq:"p.uint64 | text"` 43 | Uintptr uintptr `sq:"p.uintptr | text"` 44 | Float32 float32 `sq:"p.float32 | text"` 45 | Float64 float64 `sq:"p.float64 | text"` 46 | Interface interface{} `sq:"p.interface | text"` 47 | String string `sq:"p.string | text"` 48 | Time time.Time `sq:"p.time | text | regexp([\\d\\s]{10,}) | time(2006 01 02)"` 49 | PointerToTime *time.Time `sq:"p.time | text | regexp([\\d\\s]{10,}) | time(2006 01 02)"` 50 | URL *url.URL `sq:"a | attr(href)"` 51 | Selection *goquery.Selection `sq:"div"` 52 | Selections []*goquery.Selection `sq:"div > p"` 53 | Node *html.Node `sq:"div"` 54 | Nodes []*html.Node `sq:"div > p"` 55 | Javascript *ast.Program `sq:"script[type$=javascript]:first-child"` 56 | Javascripts []*ast.Program `sq:"script[type$=javascript]"` 57 | Stylesheet *css.Stylesheet `sq:"style:first-of-type"` 58 | Stylesheets []*css.Stylesheet `sq:"style"` 59 | CustomType CustomType `sq:"p.string"` 60 | 61 | // errs 62 | Map map[string]interface{} `sq:"div"` 63 | BadBool bool `sq:"p.int | text"` 64 | BadInt int `sq:"p.bool | text"` 65 | BadUint uint `sq:"p.bool | text"` 66 | BadFloat float32 `sq:"p.bool | text"` 67 | 
BadTime time.Time `sq:"p.bool | text | time()"` 68 | BadSlice []byte `sq:"div | attr(missing)"` 69 | BadArray [8]byte `sq:"div | attr(missing)"` 70 | BadAttr int `sq:"div | attr(missing)"` 71 | BadTag int `sq:"derp(\d)"` 72 | BadParse string `sq:"p.bool | text | parsefail"` 73 | BadLoad string `sq:"p.bool | text | loadfail"` 74 | BadSliceofStructs []Badstruct `sq:"div"` 75 | BadArrayofStructs [2]Badstruct `sq:"div"` 76 | privatetagged string `sq:"a"` 77 | Missing string `sq:"blink"` 78 | MissingSelection *goquery.Selection `sq:"blink.selection"` 79 | MissingNode *html.Node `sq:"blink.node"` 80 | MissingJavascript *ast.Program `sq:"blink.javascript"` 81 | MissingStylesheet *css.Stylesheet `sq:"blink.css"` 82 | BadAccSelection *goquery.Selection `sq:"a | badacc.goquery"` 83 | BadAccNode *html.Node `sq:"a | badacc.node"` 84 | BadAccURL *url.URL `sq:"a | badacc.url"` 85 | BadAccJavascript *ast.Program `sq:"a | badacc.javascript"` 86 | BadAccStylesheet *css.Stylesheet `sq:"a | badacc.css"` 87 | BadParserSelection *goquery.Selection `sq:"a | text | parsefail"` 88 | BadParserNode *html.Node `sq:"a | text | parsefail"` 89 | BadParserURL *url.URL `sq:"a | text | parsefail"` 90 | BadParserJavascript *ast.Program `sq:"a | text | parsefail"` 91 | BadParserStylesheet *css.Stylesheet `sq:"a | text | parsefail"` 92 | } 93 | Badstruct struct { 94 | Field string `sq:"div | text | nestedfail"` 95 | } 96 | ) 97 | -------------------------------------------------------------------------------- /funcs.go: -------------------------------------------------------------------------------- 1 | package sq 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "net/url" 7 | pathpkg "path" 8 | "reflect" 9 | "regexp" 10 | "strings" 11 | "time" 12 | 13 | "github.com/PuerkitoBio/goquery" 14 | douceur "github.com/aymerick/douceur/parser" 15 | "github.com/emptyinterface/ago" 16 | otto "github.com/robertkrimen/otto/parser" 17 | ) 18 | 19 | type ( 20 | ParseFunc func(s, arg string) (string, error) 21 | 22 | 
LoadFunc func(sel *goquery.Selection, s, arg string) (interface{}, error) 23 | 24 | TypeLoader struct { 25 | isType func(t reflect.Type) bool 26 | load func(sel *goquery.Selection, s string) (interface{}, error) 27 | } 28 | 29 | parser struct { 30 | f ParseFunc 31 | args string 32 | } 33 | loader struct { 34 | f LoadFunc 35 | args string 36 | } 37 | ) 38 | 39 | var ( 40 | ErrNoRegexpMatch = errors.New("regexp did not match the content") 41 | ) 42 | 43 | var ( 44 | parseFuncs = map[string]ParseFunc{ 45 | "regexp": func(s, pattern string) (string, error) { 46 | r, err := regexp.Compile(pattern) 47 | if err != nil { 48 | return "", err 49 | } 50 | matches := r.FindStringSubmatch(s) 51 | if len(matches) == 1 { 52 | return matches[0], nil 53 | } 54 | if len(matches) > 1 { 55 | if len(matches[1]) == 0 { 56 | return "", ErrNoRegexpMatch 57 | } 58 | return matches[1], nil 59 | } 60 | return "", ErrNoRegexpMatch 61 | }, 62 | "strip": func(s, pattern string) (string, error) { 63 | r, err := regexp.Compile(pattern) 64 | if err != nil { 65 | return "", err 66 | } 67 | return r.ReplaceAllString(s, ""), nil 68 | }, 69 | "path.prepend": func(s, token string) (string, error) { 70 | if strings.HasPrefix(s, token) { 71 | return s, nil 72 | } 73 | fmt.Println(pathpkg.Join(token, s)) 74 | return pathpkg.Join(token, s), nil 75 | }, 76 | "path.append": func(s, token string) (string, error) { 77 | if strings.HasSuffix(s, token) { 78 | return s, nil 79 | } 80 | fmt.Println(pathpkg.Join(s, token)) 81 | return pathpkg.Join(s, token), nil 82 | }, 83 | "prepend": func(s, token string) (string, error) { 84 | if strings.HasPrefix(s, token) { 85 | return s, nil 86 | } 87 | return token + s, nil 88 | }, 89 | "append": func(s, token string) (string, error) { 90 | if strings.HasSuffix(s, token) { 91 | return s, nil 92 | } 93 | return s + token, nil 94 | }, 95 | } 96 | 97 | loadFuncs = map[string]LoadFunc{ 98 | "time": func(_ *goquery.Selection, s, layout string) (interface{}, error) { 99 | return 
time.Parse(strings.TrimSpace(layout), strings.TrimSpace(s)) 100 | }, 101 | "ago": func(_ *goquery.Selection, s, _ string) (interface{}, error) { 102 | return ago.Parse(strings.TrimSpace(s)) 103 | }, 104 | } 105 | 106 | typeLoaders = map[string]TypeLoader{ 107 | "url": { 108 | isType: func(t reflect.Type) bool { 109 | return t.PkgPath() == "net/url" && t.Name() == "URL" 110 | }, 111 | load: func(_ *goquery.Selection, s string) (interface{}, error) { 112 | return url.Parse(s) 113 | }, 114 | }, 115 | "goquery": { 116 | isType: func(t reflect.Type) bool { 117 | return strings.HasSuffix(t.PkgPath(), "/goquery") && t.Name() == "Selection" 118 | }, 119 | load: func(sel *goquery.Selection, _ string) (interface{}, error) { 120 | return sel.Clone(), nil 121 | }, 122 | }, 123 | "html": { 124 | isType: func(t reflect.Type) bool { 125 | return t.PkgPath() == "golang.org/x/net/html" && t.Name() == "Node" 126 | }, 127 | load: func(sel *goquery.Selection, _ string) (interface{}, error) { 128 | return sel.Clone().Nodes[0], nil 129 | }, 130 | }, 131 | "otto": { 132 | isType: func(t reflect.Type) bool { 133 | return strings.HasSuffix(t.PkgPath(), "otto/ast") && t.Name() == "Program" 134 | }, 135 | load: func(sel *goquery.Selection, text string) (interface{}, error) { 136 | if text == "" { 137 | text = sel.Text() 138 | } 139 | return otto.ParseFile(nil, "", text, 0) 140 | }, 141 | }, 142 | "css": { 143 | isType: func(t reflect.Type) bool { 144 | return strings.HasSuffix(t.PkgPath(), "douceur/css") && t.Name() == "Stylesheet" 145 | }, 146 | load: func(sel *goquery.Selection, text string) (interface{}, error) { 147 | if text == "" { 148 | text = sel.Text() 149 | } 150 | return douceur.Parse(text) 151 | }, 152 | }, 153 | } 154 | ) 155 | 156 | func RegisterParseFunc(name string, f ParseFunc) { 157 | parseFuncs[name] = f 158 | } 159 | 160 | func RegisterLoadFunc(name string, f LoadFunc) { 161 | loadFuncs[name] = f 162 | } 163 | 164 | func RegisterTypeLoader(name string, isType func(t 
reflect.Type) bool, load func(sel *goquery.Selection, text string) (interface{}, error)) { 165 | typeLoaders[name] = TypeLoader{ 166 | isType: isType, 167 | load: load, 168 | } 169 | } 170 | 171 | func (p parser) parse(s string) (string, error) { 172 | if p.f != nil { 173 | return p.f(s, p.args) 174 | } 175 | return s, nil 176 | } 177 | 178 | func (l *loader) load(sel *goquery.Selection, s string) (interface{}, error) { 179 | if l.f != nil { 180 | return l.f(sel, s, l.args) 181 | } 182 | return s, nil 183 | } 184 | -------------------------------------------------------------------------------- /extract_test.go: -------------------------------------------------------------------------------- 1 | package sq 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/PuerkitoBio/goquery" 10 | ) 11 | 12 | func TestExtractString(t *testing.T) { 13 | 14 | const ( 15 | title = "test title" 16 | attr = "test attr" 17 | fragment = "test fragment" 18 | ) 19 | 20 | var testHTML = fmt.Sprintf(` 21 | 22 | %s 23 |

%s

24 | 25 | `, title, attr, fragment) 26 | 27 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(testHTML)) 28 | if err != nil { 29 | t.Error(err) 30 | } 31 | 32 | s, err := extractString(doc.Find("title"), accessorText) 33 | if err != nil { 34 | t.Error(err) 35 | } 36 | if s != title { 37 | t.Errorf("Expected %q, got %q", title, s) 38 | } 39 | 40 | s, err = extractString(doc.Find("[data-attr]"), fmt.Sprintf("%s(data-attr)", accessorAttr)) 41 | if err != nil { 42 | t.Error(err) 43 | } 44 | if s != attr { 45 | t.Errorf("Expected %q, got %q", attr, s) 46 | } 47 | 48 | expected := fmt.Errorf("%s: %v", ErrAttributeNotFound, "attr(missing)") 49 | s, err = extractString(doc.Find("[data-attr]"), fmt.Sprintf("%s(missing)", accessorAttr)) 50 | if err.Error() != expected.Error() { 51 | t.Errorf("Expected %q, got %q", expected, err) 52 | } 53 | if s != "" { 54 | t.Errorf("Expected empty string, got %q", s) 55 | } 56 | 57 | s, err = extractString(doc.Find("p"), accessorHTML) 58 | if err != nil { 59 | t.Error(err) 60 | } 61 | if s != fragment { 62 | t.Errorf("Expected %q, got %q", fragment, s) 63 | } 64 | 65 | expected = fmt.Errorf("Bad accessor: %q", "madeupaccessor") 66 | s, err = extractString(doc.Find("p"), "madeupaccessor") 67 | if err.Error() != expected.Error() { 68 | t.Errorf("Expected %q, got %q", expected, err) 69 | } 70 | if s != "" { 71 | t.Errorf("Expected empty string, got %q", s) 72 | } 73 | 74 | } 75 | 76 | func TestParseTag(t *testing.T) { 77 | 78 | tests := []struct { 79 | tag reflect.StructTag 80 | p *path 81 | err error 82 | }{ 83 | // good 84 | {`sq:"p.last"`, &path{selector: "p.last"}, nil}, 85 | {`sq:"p.last | text"`, &path{selector: "p.last", acc: "text"}, nil}, 86 | {`sq:"p.last | text | regexp(\\d+)"`, &path{ 87 | selector: "p.last", acc: "text", 88 | parsers: []parser{parser{args: "\\d+", f: parseFuncs["regexp"]}}}, 89 | nil, 90 | }, 91 | {`sq:"p.last | text | regexp(\\d+) | regexp(\\d)"`, 92 | &path{ 93 | selector: "p.last", acc: "text", 94 
| parsers: []parser{ 95 | parser{args: "\\d+", f: parseFuncs["regexp"]}, 96 | parser{args: "\\d", f: parseFuncs["regexp"]}, 97 | }, 98 | }, 99 | nil, 100 | }, 101 | {`sq:"p.last | text | regexp(\\d+) | regexp(\\d) | time(01)"`, 102 | &path{ 103 | selector: "p.last", acc: "text", 104 | parsers: []parser{ 105 | parser{args: "\\d+", f: parseFuncs["regexp"]}, 106 | parser{args: "\\d", f: parseFuncs["regexp"]}, 107 | }, 108 | loader: &loader{args: "01", f: loadFuncs["time"]}, 109 | }, 110 | nil, 111 | }, 112 | {`sq:"p.last | attr(id) | regexp(\\d+) | regexp(\\d) | time(01)"`, 113 | &path{ 114 | selector: "p.last", acc: "attr(id)", 115 | parsers: []parser{ 116 | parser{args: "\\d+", f: parseFuncs["regexp"]}, 117 | parser{args: "\\d", f: parseFuncs["regexp"]}, 118 | }, 119 | loader: &loader{args: "01", f: loadFuncs["time"]}, 120 | }, 121 | nil, 122 | }, 123 | 124 | // bad 125 | {`sq:"p.last | fuzzy"`, nil, fmt.Errorf("Bad accessor: %q", `fuzzy`)}, 126 | {`sq:"p.last | text | unregifunc"`, nil, fmt.Errorf("%q not registered func", "unregifunc")}, 127 | {`sq:"p.last\d"`, nil, fmt.Errorf("Bad tag: %q", `sq:"p.last\d"`)}, 128 | {``, nil, ErrTagNotFound}, 129 | } 130 | 131 | for _, test := range tests { 132 | p, err := parseTag(test.tag) 133 | if err != nil { 134 | if err.Error() != test.err.Error() { 135 | t.Errorf("Expected error %q, got %q", test.err, err) 136 | } 137 | continue 138 | } 139 | if p.selector != test.p.selector { 140 | t.Errorf("Expected %q, got %q", test.p.selector, p.selector) 141 | } 142 | if p.acc != test.p.acc { 143 | t.Errorf("Expected %q, got %q", test.p.acc, p.acc) 144 | } 145 | if p.loader == nil && p.loader != test.p.loader { 146 | t.Errorf("Expected %#v, got %#v", test.p.loader, p.loader) 147 | } 148 | if p.loader != nil { 149 | if test.p.loader.args != p.loader.args { 150 | t.Errorf("Expected %#v, got %#v", test.p.loader.args, p.loader.args) 151 | } 152 | if reflect.ValueOf(test.p.loader.f).Pointer() != reflect.ValueOf(p.loader.f).Pointer() { 153 | 
t.Errorf("Expected %#v, got %#v", test.p.loader, p.loader) 154 | } 155 | } 156 | if len(p.parsers) != len(test.p.parsers) { 157 | t.Errorf("Expected %#v, got %#v", test.p.parsers, p.parsers) 158 | } 159 | for i, pp := range p.parsers { 160 | if test.p.parsers[i].args != pp.args { 161 | t.Errorf("Expected %q, got %q", test.p.parsers[i].args, pp.args) 162 | } 163 | if reflect.ValueOf(test.p.parsers[i].f).Pointer() != reflect.ValueOf(pp.f).Pointer() { 164 | t.Errorf("Expected %#v, got %#v", test.p.parsers[i].f, pp.f) 165 | } 166 | } 167 | } 168 | 169 | } 170 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sq 2 | 3 | sq is a very simple, powerful scraping library 4 | 5 | sq uses [struct tags](https://golang.org/pkg/reflect/#StructTag) as configuration, [reflection](golang.org/pkg/reflect), and [goquery](https://github.com/PuerkitoBio/goquery) to unmarshall data out of HTML pages. 6 | 7 | ```go 8 | type ExamplePage struct { 9 | Title string `sq:"title | text"` 10 | 11 | Users []struct { 12 | ID int `sq:"td:nth-child(1) | text | regexp(\\d+)"` 13 | Name string `sq:"td:nth-child(2) | text"` 14 | Email string `sq:"td:nth-child(3) a | attr(href) | regexp(mailto:(.+))"` 15 | Website *url.URL `sq:"td:nth-child(4) > a | attr(href)"` 16 | Timestamp time.Time `sq:"td:nth-child(5) | text | time(2006 02 03)"` 17 | RowMarkup string `sq:" . | html"` 18 | } `sq:"table tr"` 19 | 20 | Stylesheets []*css.Stylesheet `sq:"style"` 21 | Javascripts []*ast.Program `sq:"script [type$=javascript]"` 22 | 23 | HTMLSnippet *html.Node `sq:"div.container"` 24 | GoquerySelection *goquery.Selection `sq:"[href], [src]"` 25 | } 26 | 27 | resp, err := http.Get("https://example.com") 28 | if err != nil { 29 | log.Fatal(err) 30 | } 31 | defer resp.Body.Close() 32 | 33 | var p ExamplePage 34 | 35 | // Scrape continues on error and returns a slice of errors that occurred. 
36 | errs := sq.Scrape(&p, resp.Body) 37 | for _, err := range errs { 38 | fmt.Println(err) 39 | } 40 | ``` 41 | 42 | 43 | *Note: go struct tags are parsed as strings and so all backslashes must be escaped. (ie. `\d+` -> `\\d+`)* 44 | 45 | ## Accessors, Parsers, and Loaders 46 | 47 | Accessors, parsers, loaders are specified in the tag in a unix-style pipeline. 48 | 49 | **Accessors** 50 | 51 | * `text`: The `text` accessor emits the result of goquery's [`Text()`](https://godoc.org/github.com/PuerkitoBio/goquery#Selection.Text) method on the matched [`Selection`](https://godoc.org/github.com/PuerkitoBio/goquery#Selection). 52 | * `html`: The `html` accessor emits the result of goquery's [`Html()`](https://godoc.org/github.com/PuerkitoBio/goquery#Selection.Html) method on the matched [`Selection`](https://godoc.org/github.com/PuerkitoBio/goquery#Selection). 53 | * `attr()`: The `attr()` accessor emits the result of goquery's [`Attr()`](https://godoc.org/github.com/PuerkitoBio/goquery#Selection.Attr) method with the supplied argument on the matched [`Selection`](https://godoc.org/github.com/PuerkitoBio/goquery#Selection). An error will be returned if the specified attribute is not found. 54 | 55 | **Parsers** 56 | 57 | * `regexp()`: The `regexp` parser takes a regular expression and applies it to the input emitted by the previous accessor or parser function. When no subcapture group is specified, the first match is emitted. If a subcapture group is specified, the first subcapture is returned. 58 | 59 | **Loaders** 60 | 61 | * `time()`: The `time()` loader calls [`time.Parse()`](https://golang.org/pkg/time/#Parse) with the supplied format on the input emitted from the previous accessor or parser function. 
62 | 63 | Custom parsers and loaders may be added or overridden: 64 | 65 | ```go 66 | // unescapes content 67 | sq.RegisterParseFunc("unescape", func(s, _ string) (string, error) { 68 | return html.UnescapeString(s), nil 69 | }) 70 | 71 | // loads a time.Duration from a datestamp 72 | sq.RegisterLoadFunc("age", func(_ *goquery.Selection, s, layout string) (interface{}, error) { 73 | t, err := time.Parse(layout, s) 74 | if err != nil { 75 | return nil, err 76 | } 77 | return time.Since(t), nil 78 | }) 79 | 80 | // example use 81 | type Page struct { 82 | Alerts []struct { 83 | Title string `sq:"h3 | text"` 84 | Age time.Duration `sq:"span.posted | text | unescape | age(2006 01 02 15:04:05 MST)"` 85 | } `sq:"div.alert"` 86 | } 87 | ``` 88 | 89 | 90 | ## Types 91 | 92 | sq supports the full list of native go types except `map`, `func`, `chan`, and `complex`. 93 | 94 | Several web-related data structures are also detected and loaded: 95 | 96 | * [`url.URL`](https://golang.org/pkg/net/url/#URL): The `url.URL` type from the go std lib, loaded using [`url.Parse`](https://golang.org/pkg/net/url/#Parse). 97 | * [`github.com/aymerick/douceur/css.Stylesheet`](https://godoc.org/github.com/aymerick/douceur/css#Stylesheet): This is a parse tree representing a stylesheet. 98 | * [`github.com/robertkrimen/otto/ast.Program`](https://godoc.org/github.com/robertkrimen/otto/ast#Program): This is an ast representing a block of javascript. 99 | * [`golang.org/x/net/html.Node`](https://godoc.org/golang.org/x/net/html#Node): This is the ast node of the parsed html. 100 | * [`github.com/PuerkitoBio/goquery.Selection`](https://godoc.org/github.com/PuerkitoBio/goquery#Selection): This is a convenience wrapper around the underlying html node[s]. 101 | 102 | Each of these types is detected and loaded automatically using a [`TypeLoader`](https://godoc.org/github.com/emptyinterface/sq#TypeLoader). Overriding or adding type loaders is simple. 
103 | 104 | A [`TypeLoader`](https://godoc.org/github.com/emptyinterface/sq#TypeLoader) is a pair of functions with a name. It takes function that checks for a match, and a function that does the loading. 105 | 106 | ```go 107 | // This is the typeloader for detecting url.URLs and loading them. 108 | sq.RegisterTypeLoader("url", 109 | func(t reflect.Type) bool { 110 | return t.PkgPath() == "net/url" && t.Name() == "URL" 111 | }, 112 | func(_ *goquery.Selection, s string) (interface{}, error) { 113 | return url.Parse(s) 114 | }, 115 | ) 116 | ``` 117 | 118 | ### Docs 119 | 120 | [godoc](https://godoc.org/github.com/emptyinterface/sq) 121 | 122 | ### License 123 | 124 | MIT 2016 125 | 126 | -------------------------------------------------------------------------------- /sq.go: -------------------------------------------------------------------------------- 1 | package sq 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io" 7 | "reflect" 8 | "strconv" 9 | "unicode" 10 | "unicode/utf8" 11 | 12 | "github.com/PuerkitoBio/goquery" 13 | ) 14 | 15 | var ( 16 | // reflection errors 17 | ErrInvalidKind = errors.New("invalid kind") 18 | ErrNotSettable = errors.New("v is not settable") 19 | ErrNonStructPtrValue = errors.New("*struct type required") 20 | ErrTagNotFound = errors.New("sq tag not found") 21 | 22 | // not found errors 23 | ErrNodeNotFound = errors.New("node not found") 24 | ErrAttributeNotFound = errors.New("attribute not found") 25 | ) 26 | 27 | func Scrape(structPtr interface{}, r io.Reader) []error { 28 | 29 | v := reflect.ValueOf(structPtr) 30 | 31 | if v.Kind() != reflect.Ptr || v.Type().Elem().Kind() != reflect.Struct { 32 | return []error{ErrNonStructPtrValue} 33 | } 34 | 35 | doc, err := goquery.NewDocumentFromReader(r) 36 | if err != nil { 37 | return []error{err} 38 | } 39 | 40 | return hydrateValue(&v, doc.Selection, nil) 41 | 42 | } 43 | 44 | // initialize and dereference pointers 45 | func resolvePointer(v *reflect.Value) { 46 | for v.Kind() == reflect.Ptr 
{ 47 | if v.IsNil() { 48 | v.Set(reflect.New(v.Type().Elem())) 49 | } 50 | *v = v.Elem() 51 | } 52 | } 53 | 54 | func hydrateValue(v *reflect.Value, sel *goquery.Selection, p *path) []error { 55 | 56 | resolvePointer(v) 57 | 58 | if !v.CanSet() { 59 | return nil 60 | } 61 | 62 | if p != nil && len(p.selector) > 0 && p.selector != "." && !sel.Is(p.selector) { 63 | sel = sel.Find(p.selector) 64 | if sel.Size() == 0 { 65 | return []error{fmt.Errorf("%q did not match", p.selector)} 66 | } 67 | } 68 | 69 | if p != nil && p.loader != nil { 70 | if err := setValueFromSel(v, sel, p); err != nil { 71 | return []error{err} 72 | } 73 | return nil 74 | } 75 | 76 | t := v.Type() 77 | 78 | for _, tl := range typeLoaders { 79 | if tl.isType(t) { 80 | p.loader = &loader{ 81 | f: func(sel *goquery.Selection, text, _ string) (interface{}, error) { 82 | return tl.load(sel, text) 83 | }, 84 | } 85 | if err := setValueFromSel(v, sel, p); err != nil { 86 | return []error{err} 87 | } 88 | return nil 89 | } 90 | } 91 | 92 | switch v.Kind() { 93 | 94 | case reflect.Struct: 95 | 96 | var errs []error 97 | for i := 0; i < t.NumField(); i++ { 98 | ft := t.Field(i) 99 | p, err := parseTag(ft.Tag) 100 | if err != nil { 101 | if err != ErrTagNotFound { 102 | errs = append(errs, err) 103 | } 104 | } else { 105 | if r, _ := utf8.DecodeRuneInString(ft.Name); !unicode.IsUpper(r) { 106 | errs = append(errs, fmt.Errorf("private field with sq tag: %q", ft.Name)) 107 | } else { 108 | f := v.Field(i) 109 | if err := hydrateValue(&f, sel, p); err != nil { 110 | errs = append(errs, err...) 
111 | } 112 | } 113 | } 114 | } 115 | return errs 116 | 117 | case reflect.Array: 118 | 119 | // handle [N]byte copy from string 120 | if t.Elem().Kind() == reflect.Uint8 { 121 | s, err := extractString(sel, p.acc) 122 | if err != nil { 123 | return []error{err} 124 | } 125 | reflect.Copy(*v, reflect.ValueOf([]byte(s))) 126 | return nil 127 | } 128 | 129 | var errs []error 130 | sel.Each(func(i int, sel *goquery.Selection) { 131 | if i < v.Len() { 132 | vv := v.Index(i) 133 | if err := hydrateValue(&vv, sel, p); err != nil { 134 | errs = append(errs, err...) 135 | } 136 | } 137 | }) 138 | return errs 139 | 140 | case reflect.Slice: 141 | 142 | // handle []byte setting directly 143 | if t.Elem().Kind() == reflect.Uint8 { 144 | s, err := extractString(sel, p.acc) 145 | if err != nil { 146 | return []error{err} 147 | } 148 | v.SetBytes([]byte(s)) 149 | return nil 150 | } 151 | 152 | var errs []error 153 | slicev := reflect.MakeSlice(t, sel.Size(), sel.Size()) 154 | sel.Each(func(i int, sel *goquery.Selection) { 155 | vv := slicev.Index(i) 156 | if err := hydrateValue(&vv, sel, p); err != nil { 157 | errs = append(errs, err...) 
158 | } 159 | }) 160 | v.Set(slicev) 161 | return errs 162 | 163 | case reflect.Bool, 164 | reflect.Int, 165 | reflect.Int8, 166 | reflect.Int16, 167 | reflect.Int32, 168 | reflect.Int64, 169 | reflect.Uint, 170 | reflect.Uint8, 171 | reflect.Uint16, 172 | reflect.Uint32, 173 | reflect.Uint64, 174 | reflect.Uintptr, 175 | reflect.Float32, 176 | reflect.Float64, 177 | reflect.Interface, 178 | reflect.String: 179 | if err := setValueFromSel(v, sel, p); err != nil { 180 | return []error{err} 181 | } 182 | return nil 183 | 184 | default: 185 | // case reflect.Map: 186 | // case reflect.Complex64: 187 | // case reflect.Complex128: 188 | // case reflect.Chan: 189 | // case reflect.Func: 190 | return []error{fmt.Errorf("%s: %v", ErrInvalidKind, v.Kind())} 191 | } 192 | 193 | } 194 | 195 | func setValueFromSel(v *reflect.Value, sel *goquery.Selection, p *path) error { 196 | 197 | s, err := extractString(sel, p.acc) 198 | if err != nil { 199 | return err 200 | } 201 | 202 | for _, pp := range p.parsers { 203 | s, err = pp.parse(s) 204 | if err != nil { 205 | return fmt.Errorf("%s: (parser fail) %q", p.selector, err) 206 | } 207 | } 208 | 209 | if p.loader != nil { 210 | vv, err := p.loader.load(sel, s) 211 | if err != nil { 212 | return fmt.Errorf("%s: (loader fail) %q", p.selector, err) 213 | } 214 | rv := reflect.ValueOf(vv) 215 | // deref pointers for correct setting. 216 | // by this point we're operating on 217 | // non-pointer values only. 
218 | for rv.Kind() == reflect.Ptr { 219 | rv = rv.Elem() 220 | } 221 | v.Set(rv) 222 | return nil 223 | } 224 | 225 | switch v.Kind() { 226 | case reflect.Bool: 227 | b, err := strconv.ParseBool(s) 228 | if err != nil { 229 | return fmt.Errorf("%s: %s", p.selector, err) 230 | } 231 | v.SetBool(b) 232 | case reflect.Int, 233 | reflect.Int8, 234 | reflect.Int16, 235 | reflect.Int32, 236 | reflect.Int64: 237 | n, err := strconv.ParseInt(s, 10, v.Type().Bits()) 238 | if err != nil { 239 | return fmt.Errorf("%s: %s", p.selector, err) 240 | } 241 | v.SetInt(n) 242 | case reflect.Uint, 243 | reflect.Uint8, 244 | reflect.Uint16, 245 | reflect.Uint32, 246 | reflect.Uint64, 247 | reflect.Uintptr: 248 | n, err := strconv.ParseUint(s, 10, v.Type().Bits()) 249 | if err != nil { 250 | return fmt.Errorf("%s: %s", p.selector, err) 251 | } 252 | v.SetUint(n) 253 | case reflect.Float32, 254 | reflect.Float64: 255 | n, err := strconv.ParseFloat(s, v.Type().Bits()) 256 | if err != nil { 257 | return fmt.Errorf("%s: %s", p.selector, err) 258 | } 259 | v.SetFloat(n) 260 | case reflect.String: 261 | v.SetString(s) 262 | case reflect.Interface: 263 | v.Set(reflect.ValueOf(s)) 264 | default: 265 | // unable to set these values from a string 266 | // but should never reach this block with 267 | // one of these value kinds. 
268 | // case reflect.Slice: 269 | // case reflect.Struct: 270 | // case reflect.Array: 271 | // case reflect.Map: 272 | // case reflect.Complex64: 273 | // case reflect.Complex128: 274 | // case reflect.Chan: 275 | // case reflect.Func: 276 | panic("unreachable") 277 | } 278 | 279 | return nil 280 | 281 | } 282 | -------------------------------------------------------------------------------- /sq_test.go: -------------------------------------------------------------------------------- 1 | package sq 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "fmt" 7 | "reflect" 8 | "strconv" 9 | "strings" 10 | "testing" 11 | 12 | "github.com/PuerkitoBio/goquery" 13 | "github.com/emptyinterface/sq/test" 14 | "github.com/robertkrimen/otto/ast" 15 | ) 16 | 17 | type errreader struct{} 18 | 19 | var errreadererr = errors.New("errreader") 20 | 21 | func (_ errreader) Read(_ []byte) (int, error) { return 0, errreadererr } 22 | 23 | func TestFails(t *testing.T) { 24 | 25 | var notapointer struct{} 26 | errs := Scrape(notapointer, strings.NewReader("")) 27 | if len(errs) != 1 || errs[0] != ErrNonStructPtrValue { 28 | t.Errorf("Expected %q, got %q", ErrNonStructPtrValue, errs) 29 | } 30 | 31 | errs = Scrape(&notapointer, errreader{}) 32 | if len(errs) != 1 || errs[0] != errreadererr { 33 | t.Errorf("Expected %q, got %q", errreadererr, errs) 34 | } 35 | 36 | } 37 | 38 | func TestText(t *testing.T) { 39 | 40 | const testHTML = ` 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 |
123
112233
111222333
56 |
57 | 58 |

0

59 |

1

60 |

2

61 |
62 |

0

63 |

1

64 |

2

65 |

3

66 |

0.1

67 |

1.1

68 |

2.1f

69 |

ByteSlice

70 |

EightByteArray

71 |

true

72 |

8

73 |

-48

74 |

8

75 |

16

76 |

32

77 |

64

78 |

48

79 |

8

80 |

16

81 |

32

82 |

64

83 |

255

84 |

1.234

85 |

2.468

86 |

Interface

87 |

String

88 |

The date today is: 2016 05 23

89 | 90 | ` 91 | RegisterParseFunc("nestedfail", func(_, _ string) (string, error) { 92 | return "", errors.New("nested fail") 93 | }) 94 | RegisterParseFunc("parsefail", func(_, _ string) (string, error) { 95 | return "", errors.New("parse fail") 96 | }) 97 | RegisterLoadFunc("loadfail", func(_ *goquery.Selection, _, _ string) (interface{}, error) { 98 | return "", errors.New("load fail") 99 | }) 100 | 101 | RegisterTypeLoader("customtype", 102 | func(t reflect.Type) bool { 103 | return strings.HasSuffix(t.PkgPath(), "test") && t.Name() == "CustomType" 104 | }, 105 | func(_ *goquery.Selection, text string) (interface{}, error) { 106 | return test.CustomType(text), nil 107 | }, 108 | ) 109 | 110 | var expectederrs = []string{ 111 | `invalid kind: map`, 112 | `p.int: strconv.ParseBool: parsing "-48": invalid syntax`, 113 | `p.bool: strconv.ParseInt: parsing "true": invalid syntax`, 114 | `p.bool: strconv.ParseUint: parsing "true": invalid syntax`, 115 | `p.bool: strconv.ParseFloat: parsing "true": invalid syntax`, 116 | `p.bool: (loader fail) "parsing time \"true\": extra text: true"`, 117 | `attribute not found: attr(missing)`, 118 | `attribute not found: attr(missing)`, 119 | `attribute not found: attr(missing)`, 120 | `Bad tag: "sq:\"derp(\\d)\""`, 121 | `p.bool: (parser fail) "parse fail"`, 122 | `p.bool: (loader fail) "load fail"`, 123 | `div: (parser fail) "nested fail"`, 124 | `div: (parser fail) "nested fail"`, 125 | `private field with sq tag: "privatetagged"`, 126 | `"blink" did not match`, 127 | `"blink.selection" did not match`, 128 | `"blink.node" did not match`, 129 | `"blink.javascript" did not match`, 130 | `"blink.css" did not match`, 131 | `Bad accessor: "badacc.goquery"`, 132 | `Bad accessor: "badacc.node"`, 133 | `Bad accessor: "badacc.url"`, 134 | `Bad accessor: "badacc.javascript"`, 135 | `Bad accessor: "badacc.css"`, 136 | `a: (parser fail) "parse fail"`, 137 | `a: (parser fail) "parse fail"`, 138 | `a: (parser fail) "parse fail"`, 139 | `a: 
(parser fail) "parse fail"`, 140 | `a: (parser fail) "parse fail"`, 141 | } 142 | 143 | var tt test.TextType 144 | errs := Scrape(&tt, strings.NewReader(testHTML)) 145 | if len(errs) != len(expectederrs) { 146 | t.Errorf("Expected %q\ngot %q", expectederrs, errs) 147 | } else { 148 | for i, err := range errs { 149 | if err.Error() != expectederrs[i] { 150 | t.Errorf("Expected %q, got %q", expectederrs[i], err.Error()) 151 | } 152 | } 153 | } 154 | 155 | if tt.Struct.String1 != "11" { 156 | t.Errorf("Expected %q, got %q", "11", tt.Struct.String1) 157 | } 158 | if tt.Struct.String2 != "22" { 159 | t.Errorf("Expected %q, got %q", "22", tt.Struct.String2) 160 | } 161 | if tt.Struct.String3 != "33" { 162 | t.Errorf("Expected %q, got %q", "33", tt.Struct.String3) 163 | } 164 | for i, row := range tt.StructSlice { 165 | v1 := strings.Repeat("1", i+1) 166 | if row.String1 != v1 { 167 | t.Errorf("Expected %q, got %q", v1, row.String1) 168 | } 169 | v2 := strings.Repeat("2", i+1) 170 | if row.String2 != v2 { 171 | t.Errorf("Expected %q, got %q", v2, row.String2) 172 | } 173 | v3 := strings.Repeat("3", i+1) 174 | if row.String3 != v3 { 175 | t.Errorf("Expected %q, got %q", v3, row.String3) 176 | } 177 | markup := fmt.Sprintf("%s%s%s", v1, v2, v3) 178 | if row.RowMarkup != markup { 179 | t.Errorf("Expected %q, got %q", markup, row.RowMarkup) 180 | } 181 | } 182 | for i, v := range tt.Array { 183 | if v != i { 184 | t.Errorf("Expected %d, got %d", i, v) 185 | } 186 | } 187 | if len(tt.Slice) != 3 { 188 | t.Errorf("Expected slice len %d, got %d", 3, len(tt.Slice)) 189 | } 190 | for i, v := range tt.Slice { 191 | if ev := float64(i) + 0.1; v != ev { 192 | t.Errorf("Expected %f, got %f", ev, v) 193 | } 194 | } 195 | if !bytes.Equal(tt.EightByteArray[:], []byte("EightByt")) { 196 | t.Errorf("Expected %q, got %q", []byte("EightByt"), tt.EightByteArray) 197 | } 198 | if !bytes.Equal(tt.ByteSlice, []byte("ByteSlice")) { 199 | t.Errorf("Expected %q, got %q", []byte("ByteSlice"), 
tt.ByteSlice) 200 | } 201 | if tt.Bool != true { 202 | t.Errorf("Expected %v, got %v", true, tt.Bool) 203 | } 204 | if tt.Byte != 8 { 205 | t.Errorf("Expected %v, got %v", 8, tt.Byte) 206 | } 207 | if tt.Int != -48 { 208 | t.Errorf("Expected %v, got %v", -48, tt.Int) 209 | } 210 | if tt.Int8 != 8 { 211 | t.Errorf("Expected %v, got %v", 8, tt.Int8) 212 | } 213 | if tt.Int16 != 16 { 214 | t.Errorf("Expected %v, got %v", 16, tt.Int16) 215 | } 216 | if tt.Int32 != 32 { 217 | t.Errorf("Expected %v, got %v", 32, tt.Int32) 218 | } 219 | if tt.Int64 != 64 { 220 | t.Errorf("Expected %v, got %v", 64, tt.Int64) 221 | } 222 | if *tt.Uint != 48 { 223 | t.Errorf("Expected %v, got %v", 48, tt.Uint) 224 | } 225 | if tt.Uint8 != 8 { 226 | t.Errorf("Expected %v, got %v", 8, tt.Uint8) 227 | } 228 | if tt.Uint16 != 16 { 229 | t.Errorf("Expected %v, got %v", 16, tt.Uint16) 230 | } 231 | if tt.Uint32 != 32 { 232 | t.Errorf("Expected %v, got %v", 32, tt.Uint32) 233 | } 234 | if tt.Uint64 != 64 { 235 | t.Errorf("Expected %v, got %v", 64, tt.Uint64) 236 | } 237 | if tt.Uintptr != 255 { 238 | t.Errorf("Expected %v, got %v", 255, tt.Uintptr) 239 | } 240 | if tt.Float32 != 1.234 { 241 | t.Errorf("Expected %v, got %v", 1.234, tt.Float32) 242 | } 243 | if tt.Float64 != 2.468 { 244 | t.Errorf("Expected %v, got %v", 2.468, tt.Float64) 245 | } 246 | if tt.Interface != "Interface" { 247 | t.Errorf("Expected %v, got %v", "Interface", tt.Interface) 248 | } 249 | if tt.String != "String" { 250 | t.Errorf("Expected %v, got %v", "String", tt.String) 251 | } 252 | if tt.Time.Year() != 2016 { 253 | t.Errorf("Expected %d, got %d", 2016, tt.Time.Year()) 254 | } 255 | if tt.Time.Month() != 5 { 256 | t.Errorf("Expected %d, got %d", 5, tt.Time.Month()) 257 | } 258 | if tt.Time.Day() != 23 { 259 | t.Errorf("Expected %d, got %d", 23, tt.Time.Day()) 260 | } 261 | if tt.PointerToTime.Year() != 2016 { 262 | t.Errorf("Expected %d, got %d", 2016, tt.PointerToTime.Year()) 263 | } 264 | if tt.PointerToTime.Month() != 5 
{ 265 | t.Errorf("Expected %d, got %d", 5, tt.PointerToTime.Month()) 266 | } 267 | if tt.PointerToTime.Day() != 23 { 268 | t.Errorf("Expected %d, got %d", 23, tt.PointerToTime.Day()) 269 | } 270 | if tt.URL.String() != "https://www.google.com" { 271 | t.Errorf("Expected: %q, got %q", "https://www.google.com", tt.URL.String()) 272 | } 273 | if a := tt.Selection.Find("a"); a.Text() != "∆" { 274 | t.Errorf("Expected %q, got %q", "∆", a.Text()) 275 | } 276 | if len(tt.Selections) != 3 { 277 | t.Errorf("Expected 3 items, got %d", len(tt.Selections)) 278 | } 279 | for i, sel := range tt.Selections { 280 | if strconv.Itoa(i) != sel.Text() { 281 | t.Errorf("Expected %v, got %v", i, sel.Text()) 282 | } 283 | } 284 | if tt.Node.Data != "div" { 285 | t.Errorf("Expected %q, got %q", "div", tt.Node.Data) 286 | } 287 | for i, node := range tt.Nodes { 288 | if node.Data != "p" { 289 | t.Errorf("Expected %q, got %q", "p", node.Data) 290 | } 291 | if node.FirstChild.Data != strconv.Itoa(i) { 292 | t.Errorf("Expected %d, got %q", i, node.FirstChild.Data) 293 | } 294 | } 295 | { 296 | decl := tt.Javascript.DeclarationList[0].(*ast.VariableDeclaration) 297 | val := decl.List[0].Initializer.(*ast.NumberLiteral) 298 | if val.Literal != "1" { 299 | t.Errorf("Expected 1, got %q", val.Literal) 300 | } 301 | } 302 | for i := 0; i < 3; i++ { 303 | decl := tt.Javascripts[i].DeclarationList[0].(*ast.VariableDeclaration) 304 | val := decl.List[0].Initializer.(*ast.NumberLiteral) 305 | if ev := strconv.Itoa(i + 1); ev != val.Literal { 306 | t.Errorf("Expected %q, got %q", ev, val.Literal) 307 | } 308 | } 309 | if val := tt.Stylesheet.Rules[0].Declarations[0].Value; val != "1px" { 310 | t.Errorf("Expected 1px, got %q", val) 311 | } 312 | for i := 0; i < 3; i++ { 313 | exp := strconv.Itoa(i+1) + "px" 314 | val := tt.Stylesheets[i].Rules[0].Declarations[0].Value 315 | if val != exp { 316 | t.Errorf("Expected %q, got %q", exp, val) 317 | } 318 | } 319 | 320 | } 321 | 
--------------------------------------------------------------------------------