├── .github
└── workflows
│ └── testing.yml
├── .gitignore
├── LICENSE
├── README.md
├── cache.go
├── go.mod
├── go.sum
├── query.go
└── query_test.go
/.github/workflows/testing.yml:
--------------------------------------------------------------------------------
1 | name: Testing
2 | on: [push, pull_request]
3 |
4 | jobs:
5 | test:
6 | strategy:
7 | matrix:
8 | go-version: ["1.20", 1.21.x, 1.22.x]
9 | os: [ubuntu-latest, macos-latest, windows-latest]
10 | runs-on: ${{ matrix.os }}
11 |
12 | steps:
13 | - name: Setup Go
14 | uses: actions/setup-go@v5
15 | with:
16 | go-version: ${{ matrix.go-version }}
17 |
18 | - name: Checkout code
19 | uses: actions/checkout@v4
20 |
21 | - name: Test
22 | run: |
23 | go version
24 | go test . -v -cover
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # vscode
2 | .vscode
3 | debug
4 | *.test
5 |
6 | ./build
7 |
8 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
9 | *.o
10 | *.a
11 | *.so
12 |
13 |
14 | # Folders
15 | _obj
16 | _test
17 |
18 | # Architecture specific extensions/prefixes
19 | *.[568vq]
20 | [568vq].out
21 |
22 | *.cgo1.go
23 | *.cgo2.c
24 | _cgo_defun.c
25 | _cgo_gotypes.go
26 | _cgo_export.*
27 |
28 | _testmain.go
29 |
30 | *.exe
31 | *.test
32 | *.prof
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Permission is hereby granted, free of charge, to any person obtaining a copy
2 | of this software and associated documentation files (the "Software"), to deal
3 | in the Software without restriction, including without limitation the rights
4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
5 | copies of the Software, and to permit persons to whom the Software is
6 | furnished to do so, subject to the following conditions:
7 |
8 | The above copyright notice and this permission notice shall be included in
9 | all copies or substantial portions of the Software.
10 |
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
17 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # htmlquery
2 |
3 | [Build Status](https://github.com/antchfx/htmlquery/actions/workflows/testing.yml)
4 | [GoDoc](https://godoc.org/github.com/antchfx/htmlquery)
5 | [Go Report Card](https://goreportcard.com/report/github.com/antchfx/htmlquery)
6 |
7 | # Overview
8 |
9 | `htmlquery` is an XPath query package for HTML that lets you extract data from, or evaluate expressions against, HTML documents using XPath expressions.
10 |
11 | `htmlquery` has a built-in query object cache based on [LRU](https://godoc.org/github.com/golang/groupcache/lru), which caches the most recently used XPath query strings. With query caching enabled, an XPath expression is not recompiled on every query.
12 |
13 | You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
14 |
15 | # XPath query packages for Go
16 |
17 | | Name | Description |
18 | | ------------------------------------------------- | ----------------------------------------- |
19 | | [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
20 | | [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
21 | | [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
22 |
23 | # Installation
24 |
25 | ```
26 | go get github.com/antchfx/htmlquery
27 | ```
28 |
29 | # Getting Started
30 |
31 | #### Query, returns matched elements or error.
32 |
33 | ```go
34 | nodes, err := htmlquery.QueryAll(doc, "//a")
35 | if err != nil {
36 | panic(`not a valid XPath expression.`)
37 | }
38 | ```
39 |
40 | #### Load HTML document from URL.
41 |
42 | ```go
43 | doc, err := htmlquery.LoadURL("http://example.com/")
44 | ```
45 |
46 | #### Load HTML document from file.
47 |
48 | ```go
49 | filePath := "/home/user/sample.html"
50 | doc, err := htmlquery.LoadDoc(filePath)
51 | ```
52 |
53 | #### Load HTML document from string.
54 |
55 | ```go
56 | s := `....`
57 | doc, err := htmlquery.Parse(strings.NewReader(s))
58 | ```
59 |
60 | #### Find all A elements.
61 |
62 | ```go
63 | list := htmlquery.Find(doc, "//a")
64 | ```
65 |
66 | #### Find all A elements that have `href` attribute.
67 |
68 | ```go
69 | list := htmlquery.Find(doc, "//a[@href]")
70 | ```
71 |
72 | #### Find all A elements with `href` attribute and only return `href` value.
73 |
74 | ```go
75 | list := htmlquery.Find(doc, "//a/@href")
76 | for _ , n := range list{
77 | fmt.Println(htmlquery.InnerText(n)) // output @href value
78 | }
79 | ```
80 |
81 | ### Find the third A element.
82 |
83 | ```go
84 | a := htmlquery.FindOne(doc, "//a[3]")
85 | ```
86 |
87 | ### Find the child element (img) under an A element and print its `src` value.
88 |
89 | ```go
90 | a := htmlquery.FindOne(doc, "//a")
91 | img := htmlquery.FindOne(a, "//img")
92 | fmt.Println(htmlquery.SelectAttr(img, "src")) // output @src value
93 | ```
94 |
95 | #### Evaluate the number of all IMG elements.
96 |
97 | ```go
98 | expr, _ := xpath.Compile("count(//img)")
99 | v := expr.Evaluate(htmlquery.CreateXPathNavigator(doc)).(float64)
100 | fmt.Printf("total count is %f", v)
101 | ```
102 |
103 | # Quick Starts
104 |
105 | ```go
106 | func main() {
107 | doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
108 | if err != nil {
109 | panic(err)
110 | }
111 | // Find all news item.
112 | list, err := htmlquery.QueryAll(doc, "//ol/li")
113 | if err != nil {
114 | panic(err)
115 | }
116 | for i, n := range list {
117 | a := htmlquery.FindOne(n, "//a")
118 | if a != nil {
119 | fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
120 | }
121 | }
122 | }
123 | ```
124 |
125 | # FAQ
126 |
127 | #### `Find()` vs `QueryAll()`, which is better?
128 |
129 | `Find` and `QueryAll` do the same thing: they search for all matching HTML nodes.
130 | `Find` panics if you give it an invalid XPath query, whereas `QueryAll` returns an error.
131 |
132 | #### Can I save my query expression object for the next query?
133 |
134 | Yes, you can. We offer the `QuerySelector` and `QuerySelectorAll` functions, which accept your query expression object.
135 |
136 | Caching (or reusing) a query expression object avoids recompiling the XPath query expression and improves query performance.
137 |
138 | #### XPath query object cache performance
139 |
140 | ```
141 | goos: windows
142 | goarch: amd64
143 | pkg: github.com/antchfx/htmlquery
144 | BenchmarkSelectorCache-4 20000000 55.2 ns/op
145 | BenchmarkDisableSelectorCache-4 500000 3162 ns/op
146 | ```
147 |
148 | #### How to disable caching?
149 |
150 | ```
151 | htmlquery.DisableSelectorCache = true
152 | ```
153 |
154 | # Questions
155 |
156 | Please let me know if you have any questions.
157 |
--------------------------------------------------------------------------------
/cache.go:
--------------------------------------------------------------------------------
1 | package htmlquery
2 |
3 | import (
4 | "sync"
5 |
6 | "github.com/antchfx/xpath"
7 | "github.com/golang/groupcache/lru"
8 | )
9 |
// DisableSelectorCache disables caching of compiled query selectors when set to true.
var DisableSelectorCache = false

// SelectorCacheMaxEntries is the maximum number of compiled selectors kept in
// the cache. The default is 50. Caching is disabled if SelectorCacheMaxEntries <= 0.
// The cache is sized from this value when it is first created; later changes
// do not resize an existing cache.
var SelectorCacheMaxEntries = 50

var (
	// cacheOnce guards the one-time creation of cache.
	cacheOnce sync.Once
	// cache holds compiled expressions keyed by their expression string.
	cache *lru.Cache
	// cacheMutex serializes lookups and insertions on cache.
	cacheMutex sync.Mutex
)
22 |
23 | func getQuery(expr string) (*xpath.Expr, error) {
24 | if DisableSelectorCache || SelectorCacheMaxEntries <= 0 {
25 | return xpath.Compile(expr)
26 | }
27 | cacheOnce.Do(func() {
28 | cache = lru.New(SelectorCacheMaxEntries)
29 | })
30 | cacheMutex.Lock()
31 | defer cacheMutex.Unlock()
32 | if v, ok := cache.Get(expr); ok {
33 | return v.(*xpath.Expr), nil
34 | }
35 | v, err := xpath.Compile(expr)
36 | if err != nil {
37 | return nil, err
38 | }
39 | cache.Add(expr, v)
40 | return v, nil
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/antchfx/htmlquery
2 |
3 | go 1.14
4 |
5 | require (
6 | github.com/antchfx/xpath v1.3.3
7 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da
8 | golang.org/x/net v0.33.0
9 | )
10 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs=
2 | github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
3 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
4 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
5 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
6 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
7 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
8 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
9 | golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
10 | golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
11 | golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
12 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
13 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
14 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
15 | golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
16 | golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
17 | golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
18 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
19 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
20 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
21 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
22 | golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
23 | golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
24 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
25 | golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
26 | golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
27 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
28 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
29 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
30 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
31 | golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
32 | golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
33 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
34 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
35 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
36 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
37 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
38 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
39 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
40 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
41 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
42 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
43 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
44 | golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
45 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
46 | golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
47 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
48 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
49 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
50 | golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
51 | golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
52 | golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
53 | golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
54 | golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
55 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
56 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
57 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
58 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
59 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
60 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
61 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
62 | golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
63 | golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
64 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
65 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
66 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
67 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
68 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
69 | golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
70 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
71 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
72 |
--------------------------------------------------------------------------------
/query.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package htmlquery provides extract data from HTML documents using XPath expression.
3 | */
4 | package htmlquery
5 |
6 | import (
7 | "bufio"
8 | "compress/gzip"
9 | "compress/zlib"
10 | "fmt"
11 | "io"
12 | "net/http"
13 | "os"
14 | "strings"
15 |
16 | "github.com/antchfx/xpath"
17 | "golang.org/x/net/html"
18 | "golang.org/x/net/html/charset"
19 | )
20 |
// Compile-time assertion that *NodeNavigator implements xpath.NodeNavigator.
var _ xpath.NodeNavigator = &NodeNavigator{}
22 |
23 | // CreateXPathNavigator creates a new xpath.NodeNavigator for the specified html.Node.
24 | func CreateXPathNavigator(top *html.Node) *NodeNavigator {
25 | return &NodeNavigator{curr: top, root: top, attr: -1}
26 | }
27 |
28 | // Find is like QueryAll but Will panics if the expression `expr` cannot be parsed.
29 | //
30 | // See `QueryAll()` function.
31 | func Find(top *html.Node, expr string) []*html.Node {
32 | nodes, err := QueryAll(top, expr)
33 | if err != nil {
34 | panic(err)
35 | }
36 | return nodes
37 | }
38 |
39 | // FindOne is like Query but will panics if the expression `expr` cannot be parsed.
40 | // See `Query()` function.
41 | func FindOne(top *html.Node, expr string) *html.Node {
42 | node, err := Query(top, expr)
43 | if err != nil {
44 | panic(err)
45 | }
46 | return node
47 | }
48 |
49 | // QueryAll searches the html.Node that matches by the specified XPath expr.
50 | // Return an error if the expression `expr` cannot be parsed.
51 | func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
52 | exp, err := getQuery(expr)
53 | if err != nil {
54 | return nil, err
55 | }
56 | nodes := QuerySelectorAll(top, exp)
57 | return nodes, nil
58 | }
59 |
60 | // Query runs the given XPath expression against the given html.Node and
61 | // returns the first matching html.Node, or nil if no matches are found.
62 | //
63 | // Returns an error if the expression `expr` cannot be parsed.
64 | func Query(top *html.Node, expr string) (*html.Node, error) {
65 | exp, err := getQuery(expr)
66 | if err != nil {
67 | return nil, err
68 | }
69 | return QuerySelector(top, exp), nil
70 | }
71 |
72 | // QuerySelector returns the first matched html.Node by the specified XPath selector.
73 | func QuerySelector(top *html.Node, selector *xpath.Expr) *html.Node {
74 | t := selector.Select(CreateXPathNavigator(top))
75 | if t.MoveNext() {
76 | return getCurrentNode(t.Current().(*NodeNavigator))
77 | }
78 | return nil
79 | }
80 |
81 | // QuerySelectorAll searches all of the html.Node that matches the specified XPath selectors.
82 | func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
83 | var elems []*html.Node
84 | t := selector.Select(CreateXPathNavigator(top))
85 | for t.MoveNext() {
86 | nav := t.Current().(*NodeNavigator)
87 | n := getCurrentNode(nav)
88 | elems = append(elems, n)
89 | }
90 | return elems
91 | }
92 |
93 | // LoadURL loads the HTML document from the specified URL. Default enabling gzip on a HTTP request.
94 | func LoadURL(url string) (*html.Node, error) {
95 | req, err := http.NewRequest("GET", url, nil)
96 | if err != nil {
97 | return nil, err
98 | }
99 | // Enable gzip compression.
100 | req.Header.Add("Accept-Encoding", "gzip")
101 | resp, err := http.DefaultClient.Do(req)
102 | if err != nil {
103 | return nil, err
104 | }
105 | var reader io.ReadCloser
106 |
107 | defer func() {
108 | if reader != nil {
109 | reader.Close()
110 | }
111 | }()
112 | encoding := resp.Header.Get("Content-Encoding")
113 | switch encoding {
114 | case "gzip":
115 | reader, err = gzip.NewReader(resp.Body)
116 | if err != nil {
117 | return nil, err
118 | }
119 | case "deflate":
120 | reader, err = zlib.NewReader(resp.Body)
121 | if err != nil {
122 | return nil, err
123 | }
124 | case "":
125 | reader = resp.Body
126 | default:
127 | return nil, fmt.Errorf("%s compression is not support", encoding)
128 | }
129 |
130 | r, err := charset.NewReader(reader, resp.Header.Get("Content-Type"))
131 | if err != nil {
132 | return nil, err
133 | }
134 | return html.Parse(r)
135 | }
136 |
137 | // LoadDoc loads the HTML document from the specified file path.
138 | func LoadDoc(path string) (*html.Node, error) {
139 | f, err := os.Open(path)
140 | if err != nil {
141 | return nil, err
142 | }
143 | defer f.Close()
144 |
145 | return html.Parse(bufio.NewReader(f))
146 | }
147 |
148 | func getCurrentNode(n *NodeNavigator) *html.Node {
149 | if n.NodeType() == xpath.AttributeNode {
150 | childNode := &html.Node{
151 | Type: html.TextNode,
152 | Data: n.Value(),
153 | }
154 | return &html.Node{
155 | Type: html.ElementNode,
156 | Data: n.LocalName(),
157 | FirstChild: childNode,
158 | LastChild: childNode,
159 | }
160 |
161 | }
162 | return n.curr
163 | }
164 |
// Parse returns the parse tree for the HTML read from r.
// It is a thin wrapper around html.Parse.
func Parse(r io.Reader) (*html.Node, error) {
	return html.Parse(r)
}
169 |
170 | // InnerText returns the text between the start and end tags of the object.
171 | func InnerText(n *html.Node) string {
172 | var output func(*strings.Builder, *html.Node)
173 | output = func(b *strings.Builder, n *html.Node) {
174 | switch n.Type {
175 | case html.TextNode:
176 | b.WriteString(n.Data)
177 | return
178 | case html.CommentNode:
179 | return
180 | }
181 | for child := n.FirstChild; child != nil; child = child.NextSibling {
182 | output(b, child)
183 | }
184 | }
185 |
186 | var b strings.Builder
187 | output(&b, n)
188 | return b.String()
189 | }
190 |
191 | // SelectAttr returns the attribute value with the specified name.
192 | func SelectAttr(n *html.Node, name string) (val string) {
193 | if n == nil {
194 | return
195 | }
196 | if n.Type == html.ElementNode && n.Parent == nil && name == n.Data {
197 | return InnerText(n)
198 | }
199 | for _, attr := range n.Attr {
200 | if attr.Key == name {
201 | val = attr.Val
202 | break
203 | }
204 | }
205 | return
206 | }
207 |
208 | // ExistsAttr returns whether attribute with specified name exists.
209 | func ExistsAttr(n *html.Node, name string) bool {
210 | if n == nil {
211 | return false
212 | }
213 | for _, attr := range n.Attr {
214 | if attr.Key == name {
215 | return true
216 | }
217 | }
218 | return false
219 | }
220 |
221 | // OutputHTML returns the text including tags name.
222 | func OutputHTML(n *html.Node, self bool) string {
223 | var b strings.Builder
224 | if self {
225 | html.Render(&b, n)
226 | } else {
227 | for n := n.FirstChild; n != nil; n = n.NextSibling {
228 | html.Render(&b, n)
229 | }
230 | }
231 | return b.String()
232 | }
233 |
// NodeNavigator walks an html.Node tree and implements xpath.NodeNavigator.
type NodeNavigator struct {
	// root is the node the navigator was created on; curr is the node it is
	// currently positioned on.
	root, curr *html.Node
	// attr is the index into curr.Attr of the attribute the navigator is
	// positioned on, or -1 when it is positioned on curr itself.
	attr int
}
238 |
// Current returns the html.Node the navigator is currently positioned on.
// When the navigator is on an attribute, this is the node whose Attr slice
// holds that attribute.
func (h *NodeNavigator) Current() *html.Node {
	return h.curr
}
242 |
243 | func (h *NodeNavigator) NodeType() xpath.NodeType {
244 | switch h.curr.Type {
245 | case html.CommentNode:
246 | return xpath.CommentNode
247 | case html.TextNode:
248 | return xpath.TextNode
249 | case html.DocumentNode:
250 | return xpath.RootNode
251 | case html.ElementNode:
252 | if h.attr != -1 {
253 | return xpath.AttributeNode
254 | }
255 | return xpath.ElementNode
256 | case html.DoctypeNode:
257 | // ignored declare and as Root-Node type.
258 | return xpath.RootNode
259 | }
260 | panic(fmt.Sprintf("unknown HTML node type: %v", h.curr.Type))
261 | }
262 |
263 | func (h *NodeNavigator) LocalName() string {
264 | if h.attr != -1 {
265 | return h.curr.Attr[h.attr].Key
266 | }
267 | return h.curr.Data
268 | }
269 |
// Prefix always returns the empty string.
func (*NodeNavigator) Prefix() string {
	return ""
}
273 |
274 | func (h *NodeNavigator) Value() string {
275 | switch h.curr.Type {
276 | case html.CommentNode:
277 | return h.curr.Data
278 | case html.ElementNode:
279 | if h.attr != -1 {
280 | return h.curr.Attr[h.attr].Val
281 | }
282 | return InnerText(h.curr)
283 | case html.TextNode:
284 | return h.curr.Data
285 | }
286 | return ""
287 | }
288 |
289 | func (h *NodeNavigator) Copy() xpath.NodeNavigator {
290 | n := *h
291 | return &n
292 | }
293 |
294 | func (h *NodeNavigator) MoveToRoot() {
295 | h.curr = h.root
296 | }
297 |
298 | func (h *NodeNavigator) MoveToParent() bool {
299 | if h.attr != -1 {
300 | h.attr = -1
301 | return true
302 | } else if node := h.curr.Parent; node != nil {
303 | h.curr = node
304 | return true
305 | }
306 | return false
307 | }
308 |
309 | func (h *NodeNavigator) MoveToNextAttribute() bool {
310 | if h.attr >= len(h.curr.Attr)-1 {
311 | return false
312 | }
313 | h.attr++
314 | return true
315 | }
316 |
317 | func (h *NodeNavigator) MoveToChild() bool {
318 | if h.attr != -1 {
319 | return false
320 | }
321 | if node := h.curr.FirstChild; node != nil {
322 | h.curr = node
323 | return true
324 | }
325 | return false
326 | }
327 |
328 | func (h *NodeNavigator) MoveToFirst() bool {
329 | if h.attr != -1 || h.curr.PrevSibling == nil {
330 | return false
331 | }
332 | for {
333 | node := h.curr.PrevSibling
334 | if node == nil {
335 | break
336 | }
337 | h.curr = node
338 | }
339 | return true
340 | }
341 |
// String returns the same text as Value.
func (h *NodeNavigator) String() string {
	return h.Value()
}
345 |
346 | func (h *NodeNavigator) MoveToNext() bool {
347 | if h.attr != -1 {
348 | return false
349 | }
350 | if node := h.curr.NextSibling; node != nil {
351 | h.curr = node
352 | return true
353 | }
354 | return false
355 | }
356 |
357 | func (h *NodeNavigator) MoveToPrevious() bool {
358 | if h.attr != -1 {
359 | return false
360 | }
361 | if node := h.curr.PrevSibling; node != nil {
362 | h.curr = node
363 | return true
364 | }
365 | return false
366 | }
367 |
368 | func (h *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
369 | node, ok := other.(*NodeNavigator)
370 | if !ok || node.root != h.root {
371 | return false
372 | }
373 |
374 | h.curr = node.curr
375 | h.attr = node.attr
376 | return true
377 | }
378 |
--------------------------------------------------------------------------------
/query_test.go:
--------------------------------------------------------------------------------
1 | package htmlquery
2 |
3 | import (
4 | "compress/gzip"
5 | "fmt"
6 | "io/ioutil"
7 | "net/http"
8 | "net/http/httptest"
9 | "os"
10 | "strings"
11 | "sync"
12 | "testing"
13 |
14 | "github.com/antchfx/xpath"
15 | "golang.org/x/net/html"
16 | )
17 |
18 | const htmlSample = `
19 |
20 | Hello,World!
21 |
22 |
23 |
24 |
25 |
26 |
City Gallery
27 |
28 |
35 |
36 |
London
37 |
38 |
London is the capital city of England. It is the most populous city in the United Kingdom, with a metropolitan area of over 13 million inhabitants.
39 |
Standing on the River Thames, London has been a major settlement for two millennia, its history going back to its founding by the Romans, who named it Londinium.