├── .github └── workflows │ └── testing.yml ├── .gitignore ├── LICENSE ├── README.md ├── cache.go ├── go.mod ├── go.sum ├── query.go └── query_test.go /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: Testing 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | test: 6 | strategy: 7 | matrix: 8 | go-version: ["1.20", 1.21.x, 1.22.x] 9 | os: [ubuntu-latest, macos-latest, windows-latest] 10 | runs-on: ${{ matrix.os }} 11 | 12 | steps: 13 | - name: Setup Go 14 | uses: actions/setup-go@v5 15 | with: 16 | go-version: ${{ matrix.go-version }} 17 | 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | 21 | - name: Test 22 | run: | 23 | go version 24 | go test . -v -cover 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | .vscode 3 | debug 4 | *.test 5 | 6 | ./build 7 | 8 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 9 | *.o 10 | *.a 11 | *.so 12 | 13 | 14 | # Folders 15 | _obj 16 | _test 17 | 18 | # Architecture specific extensions/prefixes 19 | *.[568vq] 20 | [568vq].out 21 | 22 | *.cgo1.go 23 | *.cgo2.c 24 | _cgo_defun.c 25 | _cgo_gotypes.go 26 | _cgo_export.* 27 | 28 | _testmain.go 29 | 30 | *.exe 31 | *.test 32 | *.prof -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy 2 | of this software and associated documentation files (the "Software"), to deal 3 | in the Software without restriction, including without limitation the rights 4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 5 | copies of the Software, and to permit persons to whom the Software is 6 | furnished to do so, subject to the following 
conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in 9 | all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # htmlquery 2 | 3 | [![Build Status](https://github.com/antchfx/htmlquery/actions/workflows/testing.yml/badge.svg)](https://github.com/antchfx/htmlquery/actions/workflows/testing.yml) 4 | [![GoDoc](https://godoc.org/github.com/antchfx/htmlquery?status.svg)](https://godoc.org/github.com/antchfx/htmlquery) 5 | [![Go Report Card](https://goreportcard.com/badge/github.com/antchfx/htmlquery)](https://goreportcard.com/report/github.com/antchfx/htmlquery) 6 | 7 | # Overview 8 | 9 | `htmlquery` is an XPath query package for HTML that lets you extract data from, or evaluate expressions against, HTML documents using an XPath expression. 10 | 11 | `htmlquery` has a built-in query object cache based on [LRU](https://godoc.org/github.com/golang/groupcache/lru) that caches the most recently used XPath query strings. Enabling query caching avoids recompiling the XPath expression on every query. 12 | 13 | You can visit this page to learn about the supported XPath(1.0/2.0) syntax. 
https://github.com/antchfx/xpath 14 | 15 | # XPath query packages for Go 16 | 17 | | Name | Description | 18 | | ------------------------------------------------- | ----------------------------------------- | 19 | | [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document | 20 | | [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document | 21 | | [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document | 22 | 23 | # Installation 24 | 25 | ``` 26 | go get github.com/antchfx/htmlquery 27 | ``` 28 | 29 | # Getting Started 30 | 31 | #### Query, returns matched elements or error. 32 | 33 | ```go 34 | nodes, err := htmlquery.QueryAll(doc, "//a") 35 | if err != nil { 36 | panic(`not a valid XPath expression.`) 37 | } 38 | ``` 39 | 40 | #### Load HTML document from URL. 41 | 42 | ```go 43 | doc, err := htmlquery.LoadURL("http://example.com/") 44 | ``` 45 | 46 | #### Load HTML from document. 47 | 48 | ```go 49 | filePath := "/home/user/sample.html" 50 | doc, err := htmlquery.LoadDoc(filePath) 51 | ``` 52 | 53 | #### Load HTML document from string. 54 | 55 | ```go 56 | s := `....` 57 | doc, err := htmlquery.Parse(strings.NewReader(s)) 58 | ``` 59 | 60 | #### Find all A elements. 61 | 62 | ```go 63 | list := htmlquery.Find(doc, "//a") 64 | ``` 65 | 66 | #### Find all A elements that have `href` attribute. 67 | 68 | ```go 69 | list := htmlquery.Find(doc, "//a[@href]") 70 | ``` 71 | 72 | #### Find all A elements with `href` attribute and only return `href` value. 73 | 74 | ```go 75 | list := htmlquery.Find(doc, "//a/@href") 76 | for _ , n := range list{ 77 | fmt.Println(htmlquery.InnerText(n)) // output @href value 78 | } 79 | ``` 80 | 81 | ### Find the third A element. 
82 | 83 | ```go 84 | a := htmlquery.FindOne(doc, "//a[3]") 85 | ``` 86 | 87 | ### Find the child element (img) under an A element and print its `src` attribute. 88 | 89 | ```go 90 | a := htmlquery.FindOne(doc, "//a") 91 | img := htmlquery.FindOne(a, "//img") 92 | fmt.Println(htmlquery.SelectAttr(img, "src")) // output @src value 93 | ``` 94 | 95 | #### Count all IMG elements. 96 | 97 | ```go 98 | expr, _ := xpath.Compile("count(//img)") 99 | v := expr.Evaluate(htmlquery.CreateXPathNavigator(doc)).(float64) 100 | fmt.Printf("total count is %f", v) 101 | ``` 102 | 103 | # Quick Starts 104 | 105 | ```go 106 | func main() { 107 | doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang") 108 | if err != nil { 109 | panic(err) 110 | } 111 | // Find all news items. 112 | list, err := htmlquery.QueryAll(doc, "//ol/li") 113 | if err != nil { 114 | panic(err) 115 | } 116 | for i, n := range list { 117 | a := htmlquery.FindOne(n, "//a") 118 | if a != nil { 119 | fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href")) 120 | } 121 | } 122 | } 123 | ``` 124 | 125 | # FAQ 126 | 127 | #### `Find()` vs `QueryAll()`, which is better? 128 | 129 | `Find` and `QueryAll` do the same thing: both search for all matching HTML nodes. 130 | `Find` panics if you give it an invalid XPath query, whereas `QueryAll` returns an error instead. 131 | 132 | #### Can I save my query expression object for the next query? 133 | 134 | Yes, you can. We offer the `QuerySelector` and `QuerySelectorAll` functions, which accept your compiled query expression object. 135 | 136 | Caching (or reusing) a query expression object avoids recompiling the XPath query expression and improves your query performance. 
137 | 138 | #### XPath query object cache performance 139 | 140 | ``` 141 | goos: windows 142 | goarch: amd64 143 | pkg: github.com/antchfx/htmlquery 144 | BenchmarkSelectorCache-4 20000000 55.2 ns/op 145 | BenchmarkDisableSelectorCache-4 500000 3162 ns/op 146 | ``` 147 | 148 | #### How to disable caching? 149 | 150 | ``` 151 | htmlquery.DisableSelectorCache = true 152 | ``` 153 | 154 | # Questions 155 | 156 | Please let me know if you have any questions. 157 | -------------------------------------------------------------------------------- /cache.go: -------------------------------------------------------------------------------- 1 | package htmlquery 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/antchfx/xpath" 7 | "github.com/golang/groupcache/lru" 8 | ) 9 | 10 | // DisableSelectorCache will disable caching for the query selector if value is true. 11 | var DisableSelectorCache = false 12 | 13 | // SelectorCacheMaxEntries allows how many selector object can be caching. Default is 50. 14 | // Will disable caching if SelectorCacheMaxEntries <= 0. 
15 | var SelectorCacheMaxEntries = 50 16 | 17 | var ( 18 | cacheOnce sync.Once 19 | cache *lru.Cache 20 | cacheMutex sync.Mutex 21 | ) 22 | 23 | func getQuery(expr string) (*xpath.Expr, error) { 24 | if DisableSelectorCache || SelectorCacheMaxEntries <= 0 { 25 | return xpath.Compile(expr) 26 | } 27 | cacheOnce.Do(func() { 28 | cache = lru.New(SelectorCacheMaxEntries) 29 | }) 30 | cacheMutex.Lock() 31 | defer cacheMutex.Unlock() 32 | if v, ok := cache.Get(expr); ok { 33 | return v.(*xpath.Expr), nil 34 | } 35 | v, err := xpath.Compile(expr) 36 | if err != nil { 37 | return nil, err 38 | } 39 | cache.Add(expr, v) 40 | return v, nil 41 | 42 | } 43 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/antchfx/htmlquery 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/antchfx/xpath v1.3.3 7 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da 8 | golang.org/x/net v0.33.0 9 | ) 10 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs= 2 | github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= 3 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= 4 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 5 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 6 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 7 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 8 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod 
h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 9 | golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= 10 | golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= 11 | golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= 12 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= 13 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 14 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 15 | golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 16 | golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 17 | golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 18 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 19 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 20 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 21 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 22 | golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= 23 | golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= 24 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= 25 | golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= 26 | golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= 27 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= 28 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 29 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 30 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 31 | golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= 32 | golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 33 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 34 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 35 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 36 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 37 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 38 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 39 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 40 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 41 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 42 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 43 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 44 | golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 45 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 46 | golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= 47 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 48 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 49 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 50 | golang.org/x/term 
v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= 51 | golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= 52 | golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= 53 | golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= 54 | golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= 55 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 56 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 57 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 58 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 59 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 60 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 61 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 62 | golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 63 | golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= 64 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 65 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 66 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 67 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 68 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 69 | golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= 70 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= 71 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 72 | 
-------------------------------------------------------------------------------- /query.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package htmlquery provides extract data from HTML documents using XPath expression. 3 | */ 4 | package htmlquery 5 | 6 | import ( 7 | "bufio" 8 | "compress/gzip" 9 | "compress/zlib" 10 | "fmt" 11 | "io" 12 | "net/http" 13 | "os" 14 | "strings" 15 | 16 | "github.com/antchfx/xpath" 17 | "golang.org/x/net/html" 18 | "golang.org/x/net/html/charset" 19 | ) 20 | 21 | var _ xpath.NodeNavigator = &NodeNavigator{} 22 | 23 | // CreateXPathNavigator creates a new xpath.NodeNavigator for the specified html.Node. 24 | func CreateXPathNavigator(top *html.Node) *NodeNavigator { 25 | return &NodeNavigator{curr: top, root: top, attr: -1} 26 | } 27 | 28 | // Find is like QueryAll but Will panics if the expression `expr` cannot be parsed. 29 | // 30 | // See `QueryAll()` function. 31 | func Find(top *html.Node, expr string) []*html.Node { 32 | nodes, err := QueryAll(top, expr) 33 | if err != nil { 34 | panic(err) 35 | } 36 | return nodes 37 | } 38 | 39 | // FindOne is like Query but will panics if the expression `expr` cannot be parsed. 40 | // See `Query()` function. 41 | func FindOne(top *html.Node, expr string) *html.Node { 42 | node, err := Query(top, expr) 43 | if err != nil { 44 | panic(err) 45 | } 46 | return node 47 | } 48 | 49 | // QueryAll searches the html.Node that matches by the specified XPath expr. 50 | // Return an error if the expression `expr` cannot be parsed. 51 | func QueryAll(top *html.Node, expr string) ([]*html.Node, error) { 52 | exp, err := getQuery(expr) 53 | if err != nil { 54 | return nil, err 55 | } 56 | nodes := QuerySelectorAll(top, exp) 57 | return nodes, nil 58 | } 59 | 60 | // Query runs the given XPath expression against the given html.Node and 61 | // returns the first matching html.Node, or nil if no matches are found. 
62 | // 63 | // Returns an error if the expression `expr` cannot be parsed. 64 | func Query(top *html.Node, expr string) (*html.Node, error) { 65 | exp, err := getQuery(expr) 66 | if err != nil { 67 | return nil, err 68 | } 69 | return QuerySelector(top, exp), nil 70 | } 71 | 72 | // QuerySelector returns the first matched html.Node by the specified XPath selector. 73 | func QuerySelector(top *html.Node, selector *xpath.Expr) *html.Node { 74 | t := selector.Select(CreateXPathNavigator(top)) 75 | if t.MoveNext() { 76 | return getCurrentNode(t.Current().(*NodeNavigator)) 77 | } 78 | return nil 79 | } 80 | 81 | // QuerySelectorAll searches all of the html.Node that matches the specified XPath selectors. 82 | func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node { 83 | var elems []*html.Node 84 | t := selector.Select(CreateXPathNavigator(top)) 85 | for t.MoveNext() { 86 | nav := t.Current().(*NodeNavigator) 87 | n := getCurrentNode(nav) 88 | elems = append(elems, n) 89 | } 90 | return elems 91 | } 92 | 93 | // LoadURL loads the HTML document from the specified URL. Default enabling gzip on a HTTP request. 94 | func LoadURL(url string) (*html.Node, error) { 95 | req, err := http.NewRequest("GET", url, nil) 96 | if err != nil { 97 | return nil, err 98 | } 99 | // Enable gzip compression. 
100 | req.Header.Add("Accept-Encoding", "gzip") 101 | resp, err := http.DefaultClient.Do(req) 102 | if err != nil { 103 | return nil, err 104 | } 105 | var reader io.ReadCloser 106 | 107 | defer func() { 108 | if reader != nil { 109 | reader.Close() 110 | } 111 | }() 112 | encoding := resp.Header.Get("Content-Encoding") 113 | switch encoding { 114 | case "gzip": 115 | reader, err = gzip.NewReader(resp.Body) 116 | if err != nil { 117 | return nil, err 118 | } 119 | case "deflate": 120 | reader, err = zlib.NewReader(resp.Body) 121 | if err != nil { 122 | return nil, err 123 | } 124 | case "": 125 | reader = resp.Body 126 | default: 127 | return nil, fmt.Errorf("%s compression is not support", encoding) 128 | } 129 | 130 | r, err := charset.NewReader(reader, resp.Header.Get("Content-Type")) 131 | if err != nil { 132 | return nil, err 133 | } 134 | return html.Parse(r) 135 | } 136 | 137 | // LoadDoc loads the HTML document from the specified file path. 138 | func LoadDoc(path string) (*html.Node, error) { 139 | f, err := os.Open(path) 140 | if err != nil { 141 | return nil, err 142 | } 143 | defer f.Close() 144 | 145 | return html.Parse(bufio.NewReader(f)) 146 | } 147 | 148 | func getCurrentNode(n *NodeNavigator) *html.Node { 149 | if n.NodeType() == xpath.AttributeNode { 150 | childNode := &html.Node{ 151 | Type: html.TextNode, 152 | Data: n.Value(), 153 | } 154 | return &html.Node{ 155 | Type: html.ElementNode, 156 | Data: n.LocalName(), 157 | FirstChild: childNode, 158 | LastChild: childNode, 159 | } 160 | 161 | } 162 | return n.curr 163 | } 164 | 165 | // Parse returns the parse tree for the HTML from the given Reader. 166 | func Parse(r io.Reader) (*html.Node, error) { 167 | return html.Parse(r) 168 | } 169 | 170 | // InnerText returns the text between the start and end tags of the object. 
171 | func InnerText(n *html.Node) string { 172 | var output func(*strings.Builder, *html.Node) 173 | output = func(b *strings.Builder, n *html.Node) { 174 | switch n.Type { 175 | case html.TextNode: 176 | b.WriteString(n.Data) 177 | return 178 | case html.CommentNode: 179 | return 180 | } 181 | for child := n.FirstChild; child != nil; child = child.NextSibling { 182 | output(b, child) 183 | } 184 | } 185 | 186 | var b strings.Builder 187 | output(&b, n) 188 | return b.String() 189 | } 190 | 191 | // SelectAttr returns the attribute value with the specified name. 192 | func SelectAttr(n *html.Node, name string) (val string) { 193 | if n == nil { 194 | return 195 | } 196 | if n.Type == html.ElementNode && n.Parent == nil && name == n.Data { 197 | return InnerText(n) 198 | } 199 | for _, attr := range n.Attr { 200 | if attr.Key == name { 201 | val = attr.Val 202 | break 203 | } 204 | } 205 | return 206 | } 207 | 208 | // ExistsAttr returns whether attribute with specified name exists. 209 | func ExistsAttr(n *html.Node, name string) bool { 210 | if n == nil { 211 | return false 212 | } 213 | for _, attr := range n.Attr { 214 | if attr.Key == name { 215 | return true 216 | } 217 | } 218 | return false 219 | } 220 | 221 | // OutputHTML returns the text including tags name. 
222 | func OutputHTML(n *html.Node, self bool) string { 223 | var b strings.Builder 224 | if self { 225 | html.Render(&b, n) 226 | } else { 227 | for n := n.FirstChild; n != nil; n = n.NextSibling { 228 | html.Render(&b, n) 229 | } 230 | } 231 | return b.String() 232 | } 233 | 234 | type NodeNavigator struct { 235 | root, curr *html.Node 236 | attr int 237 | } 238 | 239 | func (h *NodeNavigator) Current() *html.Node { 240 | return h.curr 241 | } 242 | 243 | func (h *NodeNavigator) NodeType() xpath.NodeType { 244 | switch h.curr.Type { 245 | case html.CommentNode: 246 | return xpath.CommentNode 247 | case html.TextNode: 248 | return xpath.TextNode 249 | case html.DocumentNode: 250 | return xpath.RootNode 251 | case html.ElementNode: 252 | if h.attr != -1 { 253 | return xpath.AttributeNode 254 | } 255 | return xpath.ElementNode 256 | case html.DoctypeNode: 257 | // ignored declare and as Root-Node type. 258 | return xpath.RootNode 259 | } 260 | panic(fmt.Sprintf("unknown HTML node type: %v", h.curr.Type)) 261 | } 262 | 263 | func (h *NodeNavigator) LocalName() string { 264 | if h.attr != -1 { 265 | return h.curr.Attr[h.attr].Key 266 | } 267 | return h.curr.Data 268 | } 269 | 270 | func (*NodeNavigator) Prefix() string { 271 | return "" 272 | } 273 | 274 | func (h *NodeNavigator) Value() string { 275 | switch h.curr.Type { 276 | case html.CommentNode: 277 | return h.curr.Data 278 | case html.ElementNode: 279 | if h.attr != -1 { 280 | return h.curr.Attr[h.attr].Val 281 | } 282 | return InnerText(h.curr) 283 | case html.TextNode: 284 | return h.curr.Data 285 | } 286 | return "" 287 | } 288 | 289 | func (h *NodeNavigator) Copy() xpath.NodeNavigator { 290 | n := *h 291 | return &n 292 | } 293 | 294 | func (h *NodeNavigator) MoveToRoot() { 295 | h.curr = h.root 296 | } 297 | 298 | func (h *NodeNavigator) MoveToParent() bool { 299 | if h.attr != -1 { 300 | h.attr = -1 301 | return true 302 | } else if node := h.curr.Parent; node != nil { 303 | h.curr = node 304 | return 
true 305 | } 306 | return false 307 | } 308 | 309 | func (h *NodeNavigator) MoveToNextAttribute() bool { 310 | if h.attr >= len(h.curr.Attr)-1 { 311 | return false 312 | } 313 | h.attr++ 314 | return true 315 | } 316 | 317 | func (h *NodeNavigator) MoveToChild() bool { 318 | if h.attr != -1 { 319 | return false 320 | } 321 | if node := h.curr.FirstChild; node != nil { 322 | h.curr = node 323 | return true 324 | } 325 | return false 326 | } 327 | 328 | func (h *NodeNavigator) MoveToFirst() bool { 329 | if h.attr != -1 || h.curr.PrevSibling == nil { 330 | return false 331 | } 332 | for { 333 | node := h.curr.PrevSibling 334 | if node == nil { 335 | break 336 | } 337 | h.curr = node 338 | } 339 | return true 340 | } 341 | 342 | func (h *NodeNavigator) String() string { 343 | return h.Value() 344 | } 345 | 346 | func (h *NodeNavigator) MoveToNext() bool { 347 | if h.attr != -1 { 348 | return false 349 | } 350 | if node := h.curr.NextSibling; node != nil { 351 | h.curr = node 352 | return true 353 | } 354 | return false 355 | } 356 | 357 | func (h *NodeNavigator) MoveToPrevious() bool { 358 | if h.attr != -1 { 359 | return false 360 | } 361 | if node := h.curr.PrevSibling; node != nil { 362 | h.curr = node 363 | return true 364 | } 365 | return false 366 | } 367 | 368 | func (h *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool { 369 | node, ok := other.(*NodeNavigator) 370 | if !ok || node.root != h.root { 371 | return false 372 | } 373 | 374 | h.curr = node.curr 375 | h.attr = node.attr 376 | return true 377 | } 378 | -------------------------------------------------------------------------------- /query_test.go: -------------------------------------------------------------------------------- 1 | package htmlquery 2 | 3 | import ( 4 | "compress/gzip" 5 | "fmt" 6 | "io/ioutil" 7 | "net/http" 8 | "net/http/httptest" 9 | "os" 10 | "strings" 11 | "sync" 12 | "testing" 13 | 14 | "github.com/antchfx/xpath" 15 | "golang.org/x/net/html" 16 | ) 17 | 18 | const htmlSample = 
` 19 | 20 | Hello,World! 21 | 22 | 23 |
24 |
25 | 26 |

City Gallery

27 |
28 | 35 |
36 |

London

37 | Mountain View 38 |

London is the capital city of England. It is the most populous city in the United Kingdom, with a metropolitan area of over 13 million inhabitants.

39 |

Standing on the River Thames, London has been a major settlement for two millennia, its history going back to its founding by the Romans, who named it Londinium.

40 |
41 | 42 |
43 | 44 | 45 | ` 46 | 47 | var testDoc = loadHTML(htmlSample) 48 | 49 | func BenchmarkSelectorCache(b *testing.B) { 50 | DisableSelectorCache = false 51 | for i := 0; i < b.N; i++ { 52 | getQuery("/AAA/BBB/DDD/CCC/EEE/ancestor::*") 53 | } 54 | } 55 | 56 | func BenchmarkDisableSelectorCache(b *testing.B) { 57 | DisableSelectorCache = true 58 | for i := 0; i < b.N; i++ { 59 | getQuery("/AAA/BBB/DDD/CCC/EEE/ancestor::*") 60 | } 61 | } 62 | 63 | func TestSelectorCache(t *testing.T) { 64 | SelectorCacheMaxEntries = 2 65 | for i := 1; i <= 3; i++ { 66 | getQuery(fmt.Sprintf("//a[position()=%d]", i)) 67 | } 68 | getQuery("//a[position()=3]") 69 | 70 | } 71 | 72 | func TestLoadURL(t *testing.T) { 73 | ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 74 | fmt.Fprint(w, htmlSample) 75 | })) 76 | defer ts.Close() 77 | 78 | _, err := LoadURL(ts.URL) 79 | if err != nil { 80 | t.Fatal(err) 81 | } 82 | } 83 | 84 | func TestLoadURLWithGzipResponse(t *testing.T) { 85 | ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 86 | w.Header().Add("Content-Encoding", "gzip") 87 | gz := gzip.NewWriter(w) 88 | defer gz.Close() 89 | fmt.Fprint(gz, htmlSample) 90 | })) 91 | defer ts.Close() 92 | 93 | _, err := LoadURL(ts.URL) 94 | if err != nil { 95 | t.Fatal(err) 96 | } 97 | } 98 | 99 | func TestLoadDoc(t *testing.T) { 100 | tempHTMLdoc, err := ioutil.TempFile("", "sample_*.html") 101 | if err != nil { 102 | t.Fatal(err) 103 | } 104 | tempHTMLFilename := tempHTMLdoc.Name() 105 | defer func(tempHTMLdoc *os.File, filename string) { 106 | tempHTMLdoc.Close() 107 | os.Remove(filename) 108 | }(tempHTMLdoc, tempHTMLFilename) 109 | 110 | tempHTMLdoc.Write([]byte(htmlSample)) 111 | 112 | if _, err := LoadDoc(tempHTMLFilename); err != nil { 113 | t.Fatal(err) 114 | } 115 | } 116 | 117 | func TestNavigator(t *testing.T) { 118 | top := FindOne(testDoc, "//html") 119 | nav := &NodeNavigator{curr: top, root: top, attr: -1} 120 | 
nav.MoveToChild() // HEAD 121 | nav.MoveToNext() 122 | if nav.NodeType() != xpath.TextNode { 123 | t.Fatalf("expectd node type is TextNode,but got %vs", nav.NodeType()) 124 | } 125 | nav.MoveToNext() // 126 | if nav.Value() != InnerText(FindOne(testDoc, "//body")) { 127 | t.Fatal("body not equal") 128 | } 129 | nav.MoveToPrevious() // 130 | nav.MoveToParent() // 131 | if nav.curr != top { 132 | t.Fatal("current node is not html node") 133 | } 134 | nav.MoveToNextAttribute() 135 | if nav.LocalName() != "lang" { 136 | t.Fatal("node not move to lang attribute") 137 | } 138 | 139 | nav.MoveToParent() 140 | nav.MoveToFirst() // 141 | if nav.curr.Type != html.DoctypeNode { 142 | t.Fatalf("expected node type is DoctypeNode,but got %d", nav.curr.Type) 143 | } 144 | } 145 | 146 | func TestXPath(t *testing.T) { 147 | node := FindOne(testDoc, "//html") 148 | if SelectAttr(node, "lang") != "en-US" { 149 | t.Fatal("//html[@lang] != en-Us") 150 | } 151 | 152 | node = FindOne(testDoc, "//header") 153 | if strings.Index(InnerText(node), "Logo") > 0 { 154 | t.Fatal("InnerText() have comment node text") 155 | } 156 | if !strings.Contains(OutputHTML(node, true), "Logo") { 157 | t.Fatal("OutputHTML() shoud have comment node text") 158 | } 159 | link := FindOne(testDoc, "//a[1]/@href") 160 | if link == nil { 161 | t.Fatal("link is nil") 162 | } 163 | if v := InnerText(link); v != "/London" { 164 | t.Fatalf("expect value is /London, but got %s", v) 165 | } 166 | 167 | } 168 | 169 | func TestXPathCdUp(t *testing.T) { 170 | doc := loadHTML(``) 171 | node := FindOne(doc, "//b/@attr/..") 172 | t.Logf("node = %#v", node) 173 | if node == nil || node.Data != "b" { 174 | t.Fatal("//b/@id/.. != ") 175 | } 176 | } 177 | 178 | func TestConcurrentQuery(t *testing.T) { 179 | var wg sync.WaitGroup 180 | for i := 0; i < 10; i++ { 181 | wg.Add(1) 182 | go func(i int) { 183 | defer wg.Done() 184 | s := `
a
` 185 | doc := loadHTML(s) 186 | if n := FindOne(doc, `//div`); n == nil { 187 | t.Fatalf("should find one but got nil [%d]", i) 188 | } 189 | }(i) 190 | } 191 | wg.Wait() 192 | } 193 | 194 | func loadHTML(str string) *html.Node { 195 | node, err := Parse(strings.NewReader(str)) 196 | if err != nil { 197 | panic(err) 198 | } 199 | return node 200 | } 201 | --------------------------------------------------------------------------------