"); err != nil {
161 | t.Error(err)
162 | }
163 | if err := c.InsertAfterAsText("
"); err != nil {
164 | t.Error(err)
165 | }
166 | return lolhtml.Continue
167 | },
168 | },
169 | },
170 | },
171 | )
172 | if err != nil {
173 | t.Error(err)
174 | }
175 |
176 | if _, err := w.Write([]byte("")); err != nil {
177 | t.Error(err)
178 | }
179 | if err := w.Close(); err != nil {
180 | t.Error(err)
181 | }
182 | wantedText := "</div>"
183 | if finalText := buf.String(); finalText != wantedText {
184 | t.Errorf("want %s got %s \n", wantedText, finalText)
185 | }
186 | }
187 |
188 | func TestComment_StopRewriting(t *testing.T) {
189 | var buf bytes.Buffer
190 | w, err := lolhtml.NewWriter(
191 | &buf,
192 | &lolhtml.Handlers{
193 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
194 | {
195 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective {
196 | return lolhtml.Stop
197 | },
198 | },
199 | },
200 | },
201 | )
202 | if err != nil {
203 | t.Error(err)
204 | }
205 |
206 | _, err = w.Write([]byte("
"))
207 | if err == nil {
208 | t.FailNow()
209 | }
210 | if err.Error() != "The rewriter has been stopped." {
211 | t.Error(err)
212 | }
213 | }
214 |
215 | func TestComment_StopRewritingWithSelector(t *testing.T) {
216 | var buf bytes.Buffer
217 | w, err := lolhtml.NewWriter(
218 | &buf,
219 | &lolhtml.Handlers{
220 | ElementContentHandler: []lolhtml.ElementContentHandler{
221 | {
222 | Selector: "*",
223 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective {
224 | return lolhtml.Stop
225 | },
226 | },
227 | },
228 | },
229 | )
230 | if err != nil {
231 | t.Error(err)
232 | }
233 |
234 | _, err = w.Write([]byte("
"))
235 | if err == nil {
236 | t.FailNow()
237 | }
238 | if err.Error() != "The rewriter has been stopped." {
239 | t.Error(err)
240 | }
241 | }
242 |
--------------------------------------------------------------------------------
/benchmark_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml_test
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "io/ioutil"
8 | "path/filepath"
9 | "runtime"
10 | "testing"
11 |
12 | "github.com/coolspring8/go-lolhtml"
13 | )
14 |
// dataDir is the directory whose files BenchmarkNewWriter reads as benchmark inputs.
15 | const dataDir = "testdata"
16 |
// ChunkSize is the length, in bytes, of the copy buffer handed to io.CopyBuffer
// by BenchmarkNewWriter.
17 | const ChunkSize = 1024
18 |
19 | func BenchmarkNewWriter(b *testing.B) {
20 | benchmarks := []struct {
21 | category string
22 | name string
23 | handlers *lolhtml.Handlers
24 | }{
25 | {
26 | "Parsing",
27 | "TagScanner",
28 | nil,
29 | },
30 | {
31 | "Parsing",
32 | "Lexer",
33 | &lolhtml.Handlers{
34 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
35 | {
36 | DoctypeHandler: func(d *lolhtml.Doctype) lolhtml.RewriterDirective {
37 | return lolhtml.Continue
38 | },
39 | },
40 | },
41 | },
42 | },
43 | {
44 | "Parsing",
45 | "TextRewritableUnitParsingAndDecoding",
46 | &lolhtml.Handlers{
47 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
48 | {
49 | TextChunkHandler: func(c *lolhtml.TextChunk) lolhtml.RewriterDirective {
50 | return lolhtml.Continue
51 | },
52 | },
53 | },
54 | },
55 | },
56 | {
57 | "Rewriting",
58 | "ModificationOfTagsOfAnElementWithLotsOfContent",
59 | &lolhtml.Handlers{
60 | ElementContentHandler: []lolhtml.ElementContentHandler{
61 | {
62 | Selector: "body",
63 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
64 | err := e.SetTagName("body1")
65 | if err != nil {
66 | b.Fatal(err)
67 | }
68 | err = e.InsertAfterEndTagAsText("test")
69 | if err != nil {
70 | b.Fatal(err)
71 | }
72 | return lolhtml.Continue
73 | },
74 | },
75 | },
76 | },
77 | },
78 | {
79 | "Rewriting",
80 | "RemoveContentOfAnElement",
81 | &lolhtml.Handlers{
82 | ElementContentHandler: []lolhtml.ElementContentHandler{
83 | {
84 | Selector: "ul",
85 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
86 | err := e.SetInnerContentAsText("")
87 | if err != nil {
88 | b.Fatal(err)
89 | }
90 | return lolhtml.Continue
91 | },
92 | },
93 | },
94 | },
95 | },
96 | {
97 | "SelectorMatching",
98 | "MatchAllSelector",
99 | &lolhtml.Handlers{
100 | ElementContentHandler: []lolhtml.ElementContentHandler{
101 | {
102 | Selector: "*",
103 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
104 | return lolhtml.Continue
105 | },
106 | },
107 | },
108 | },
109 | },
110 | {
111 | "SelectorMatching",
112 | "TagNameSelector",
113 | &lolhtml.Handlers{
114 | ElementContentHandler: []lolhtml.ElementContentHandler{
115 | {
116 | Selector: "div",
117 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
118 | return lolhtml.Continue
119 | },
120 | },
121 | },
122 | },
123 | },
124 | {
125 | "SelectorMatching",
126 | "ClassSelector",
127 | &lolhtml.Handlers{
128 | ElementContentHandler: []lolhtml.ElementContentHandler{
129 | {
130 | Selector: ".note",
131 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
132 | return lolhtml.Continue
133 | },
134 | },
135 | },
136 | },
137 | },
138 | {
139 | "SelectorMatching",
140 | "AttributeSelector",
141 | &lolhtml.Handlers{
142 | ElementContentHandler: []lolhtml.ElementContentHandler{
143 | {
144 | Selector: "[href]",
145 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
146 | return lolhtml.Continue
147 | },
148 | },
149 | },
150 | },
151 | },
152 | {
153 | "SelectorMatching",
154 | "MultipleSelectors",
155 | &lolhtml.Handlers{
156 | ElementContentHandler: []lolhtml.ElementContentHandler{
157 | {
158 | Selector: "ul",
159 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
160 | return lolhtml.Continue
161 | },
162 | },
163 | {
164 | Selector: "ul > li",
165 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
166 | return lolhtml.Continue
167 | },
168 | },
169 | {
170 | Selector: "table > tbody td dfn",
171 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
172 | return lolhtml.Continue
173 | },
174 | },
175 | {
176 | Selector: "body table > tbody tr",
177 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
178 | return lolhtml.Continue
179 | },
180 | },
181 | {
182 | Selector: "body [href]",
183 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
184 | return lolhtml.Continue
185 | },
186 | },
187 | {
188 | Selector: "div img",
189 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
190 | return lolhtml.Continue
191 | },
192 | },
193 | {
194 | Selector: "div.note span",
195 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
196 | return lolhtml.Continue
197 | },
198 | },
199 | },
200 | },
201 | },
202 | }
203 |
204 | files, err := ioutil.ReadDir(dataDir)
205 | if err != nil {
206 | b.Fatal("benchmark data files not found", err)
207 | }
208 |
209 | for _, file := range files {
210 | data, err := ioutil.ReadFile(filepath.Join(dataDir, file.Name()))
211 | if err != nil {
212 | b.Fatal("cannot read benchmark data files", err)
213 | }
214 |
215 | for _, bm := range benchmarks {
216 | b.Run(fmt.Sprintf("%s-%s-%s", bm.category, bm.name, file.Name()), func(b *testing.B) {
217 | b.SetBytes(int64(len(data)))
218 | b.ReportAllocs()
219 | runtime.GC()
220 | b.ResetTimer()
221 | for i := 0; i < b.N; i++ {
222 | w, err := lolhtml.NewWriter(nil, bm.handlers)
223 | if err != nil {
224 | b.Fatal(err)
225 | }
226 |
227 | r := bytes.NewReader(data)
228 | copyBuf := make([]byte, ChunkSize)
229 | _, err = io.CopyBuffer(w, r, copyBuf)
230 | if err != nil {
231 | b.Fatal(err)
232 | }
233 |
234 | err = w.Close()
235 | if err != nil {
236 | b.Fatal(err)
237 | }
238 | }
239 | })
240 | }
241 | }
242 | }
243 |
--------------------------------------------------------------------------------
/examples/web-scraper/main.go:
--------------------------------------------------------------------------------
1 | // This is a ported Go version of https://web.scraper.workers.dev/, whose source code is
2 | // available at https://github.com/adamschwartz/web.scraper.workers.dev licensed under MIT.
3 | //
4 | // This translation is for demonstration purpose only, so many parts of the code are suboptimal.
5 | //
6 | // Sometimes you may get a "different" result, as Go's encoding/json package always sorts the
7 | // keys of a map (when using multiple selectors), and encodes a nil slice as the null JSON value.
8 | package main
9 |
10 | import (
11 | "encoding/json"
12 | "fmt"
13 | "io"
14 | "log"
15 | "net/http"
16 | "regexp"
17 | "strings"
18 |
19 | "github.com/coolspring8/go-lolhtml"
20 | )
21 |
// Server settings: debug controls whether sendError exposes the concrete message of
// internal server errors, listenAddress is passed to http.ListenAndServe, and
// mainPageFileName is the file served as the home page.
22 | var (
23 | debug = true
24 | listenAddress = ":80"
25 | mainPageFileName = "index.html"
26 | )
27 |
// urlHasPrefix matches a leading "<letters>://" scheme; unifyWhitespace matches runs of
// two or more whitespace characters.
28 | var (
29 | urlHasPrefix = regexp.MustCompile(`^[a-zA-Z]+://`)
30 | unifyWhitespace = regexp.MustCompile(`\s{2,}`)
31 | )
32 |
33 | // textSeparator is the marker stored between collected texts to separate different elements.
34 | var textSeparator = "TEXT_SEPARATOR_TEXT_SEPARATOR"
35 |
36 | func main() {
37 | log.Printf("Server started at %s", listenAddress)
38 | http.HandleFunc("/", handler)
39 | log.Fatal(http.ListenAndServe(listenAddress, nil))
40 | }
41 |
42 | func handler(w http.ResponseWriter, req *http.Request) {
43 | log.Println(req.URL)
44 |
45 | // 404
46 | if req.URL.Path != "/" {
47 | w.WriteHeader(http.StatusNotFound)
48 | _, _ = w.Write([]byte("Not found"))
49 | return
50 | }
51 |
52 | q := req.URL.Query()
53 |
54 | url := q.Get("url")
55 | if url != "" && !urlHasPrefix.MatchString(url) {
56 | url = "http://" + url
57 | }
58 |
59 | selector := q.Get("selector")
60 |
61 | attr := q.Get("attr")
62 |
63 | var spaced bool
64 | _spaced := q.Get("spaced")
65 | if _spaced != "" {
66 | spaced = true
67 | } else {
68 | spaced = false
69 | }
70 |
71 | var pretty bool
72 | _pretty := q.Get("pretty")
73 | if _pretty != "" {
74 | pretty = true
75 | } else {
76 | pretty = false
77 | }
78 |
79 | // home page
80 | if url == "" && selector == "" {
81 | http.ServeFile(w, req, mainPageFileName)
82 | return
83 | }
84 |
85 | // text or attr: get text, part 1/2
86 | handlers := lolhtml.Handlers{}
87 | // matches and selectors are used by text scraper
88 | matches := make(map[string][]string)
89 | var selectors []string
90 | _selectors := strings.Split(selector, ",")
91 | for _, s := range _selectors {
92 | selectors = append(selectors, strings.TrimSpace(s))
93 | }
94 | // attrValue is used by attribute scraper
95 | var attrValue string
96 | if attr == "" {
97 | nextText := make(map[string]string)
98 |
99 | for _, s := range selectors {
100 | s := s
101 | handlers.ElementContentHandler = append(
102 | handlers.ElementContentHandler,
103 | lolhtml.ElementContentHandler{
104 | Selector: s,
105 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
106 | matches[s] = append(matches[s], textSeparator)
107 | nextText[s] = ""
108 | return lolhtml.Continue
109 | },
110 | TextChunkHandler: func(t *lolhtml.TextChunk) lolhtml.RewriterDirective {
111 | nextText[s] += t.Content()
112 | if t.IsLastInTextNode() {
113 | if spaced {
114 | nextText[s] += " "
115 | }
116 | matches[s] = append(matches[s], nextText[s])
117 | nextText[s] = ""
118 | }
119 | return lolhtml.Continue
120 | },
121 | },
122 | )
123 | }
124 | } else {
125 | handlers = lolhtml.Handlers{
126 | ElementContentHandler: []lolhtml.ElementContentHandler{
127 | {
128 | Selector: selector,
129 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
130 | attrValue, _ = e.AttributeValue(attr)
131 | return lolhtml.Stop
132 | },
133 | },
134 | },
135 | }
136 | }
137 |
138 | lolWriter, err := lolhtml.NewWriter(
139 | nil,
140 | &handlers,
141 | )
142 | if err != nil {
143 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
144 | return
145 | }
146 |
147 | // fetch target page content
148 | resp, err := http.Get(url)
149 | if err != nil {
150 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
151 | return
152 | }
153 | if resp.StatusCode != http.StatusOK {
154 | sendError(w, http.StatusBadGateway, fmt.Sprintf("Status %d requesting %s", resp.StatusCode, url), pretty)
155 | return
156 | }
157 | defer resp.Body.Close()
158 |
159 | // might be confusing
160 | _, err = io.Copy(lolWriter, resp.Body)
161 | if err != nil && err.Error() != "The rewriter has been stopped." {
162 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
163 | return
164 | }
165 | if err == nil || err.Error() != "The rewriter has been stopped." {
166 | err = lolWriter.Close()
167 | if err != nil {
168 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
169 | return
170 | }
171 | }
172 |
173 | // text or attr: post-process texts, part 2/2
174 | if attr == "" {
175 | for _, s := range selectors {
176 | var nodeCompleteTexts []string
177 | nextText := ""
178 |
179 | for _, text := range matches[s] {
180 | if text == textSeparator {
181 | if strings.TrimSpace(nextText) != "" {
182 | nodeCompleteTexts = append(nodeCompleteTexts, cleanText(nextText))
183 | nextText = ""
184 | }
185 | } else {
186 | nextText += text
187 | }
188 | }
189 |
190 | lastText := cleanText(nextText)
191 | if lastText != "" {
192 | nodeCompleteTexts = append(nodeCompleteTexts, lastText)
193 | }
194 | matches[s] = nodeCompleteTexts
195 | }
196 | }
197 |
198 | w.WriteHeader(http.StatusOK)
199 |
200 | enc := json.NewEncoder(w)
201 | enc.SetEscapeHTML(false)
202 | if pretty {
203 | enc.SetIndent("", " ")
204 | }
205 |
206 | if attr == "" {
207 | err = enc.Encode(Response{Result: matches})
208 | } else {
209 | err = enc.Encode(Response{Result: attrValue})
210 | }
211 | if err != nil {
212 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
213 | return
214 | }
215 | }
216 |
// Response is the JSON body written by handler and sendError. Result carries the
// scraped value on success and Error the error text on failure; either field is
// omitted from the output when empty.
217 | type Response struct {
218 | Result interface{} `json:"result,omitempty"`
219 | Error string `json:"error,omitempty"`
220 | }
221 |
222 | func sendError(w http.ResponseWriter, statusCode int, errorText string, pretty bool) {
223 | w.WriteHeader(statusCode)
224 |
225 | enc := json.NewEncoder(w)
226 | enc.SetEscapeHTML(false)
227 | if pretty {
228 | enc.SetIndent("", " ")
229 | }
230 |
231 | // redact concrete error message if debug != true
232 | if !debug && statusCode == http.StatusInternalServerError {
233 | errorText = "Internal server error"
234 | }
235 |
236 | err := enc.Encode(Response{Error: errorText})
237 | if err != nil {
238 | _, _ = w.Write([]byte(errorText))
239 | }
240 | }
241 |
242 | func cleanText(s string) string {
243 | return unifyWhitespace.ReplaceAllString(strings.TrimSpace(s), " ")
244 | }
245 |
--------------------------------------------------------------------------------
/element.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include <stdlib.h>
5 | #include "lol_html.h"
6 | */
7 | import "C"
8 | import (
9 | "errors"
10 | "unsafe"
11 | )
12 |
13 | // Element represents an HTML element. It is defined on top of the C type lol_html_element_t.
14 | type Element C.lol_html_element_t
15 |
16 | // ElementHandlerFunc is a callback handler function that receives an Element and
// returns the RewriterDirective for the rewriter.
17 | type ElementHandlerFunc func(*Element) RewriterDirective
18 |
19 | // TagName gets the element's tag name.
20 | func (e *Element) TagName() string {
21 | tagNameC := (str)(C.lol_html_element_tag_name_get((*C.lol_html_element_t)(e)))
22 | defer tagNameC.Free()
23 | return tagNameC.String()
24 | }
25 |
26 | // SetTagName sets the element's tag name.
27 | func (e *Element) SetTagName(name string) error {
28 | nameC := C.CString(name)
29 | defer C.free(unsafe.Pointer(nameC))
30 | nameLen := len(name)
31 | errCode := C.lol_html_element_tag_name_set((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen))
32 | if errCode == 0 {
33 | return nil
34 | }
35 | return getError()
36 | }
37 |
38 | // NamespaceURI gets the element's namespace URI.
39 | func (e *Element) NamespaceURI() string {
40 | // don't need to be freed
41 | namespaceURIC := C.lol_html_element_namespace_uri_get((*C.lol_html_element_t)(e))
42 | return C.GoString(namespaceURIC)
43 | }
44 |
45 | // AttributeIterator returns a pointer to an AttributeIterator. Can be used to iterate
46 | // over all attributes of the element.
47 | func (e *Element) AttributeIterator() *AttributeIterator {
48 | return (*AttributeIterator)(C.lol_html_attributes_iterator_get((*C.lol_html_element_t)(e)))
49 | }
50 |
51 | // AttributeValue returns the value of the attribute on this element.
52 | func (e *Element) AttributeValue(name string) (string, error) {
53 | nameC := C.CString(name)
54 | defer C.free(unsafe.Pointer(nameC))
55 | nameLen := len(name)
56 | valueC := (*str)(C.lol_html_element_get_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen)))
57 | defer valueC.Free()
58 | // always check error, so not using getError()
59 | errC := (*str)(C.lol_html_take_last_error())
60 | defer errC.Free()
61 | errMsg := errC.String()
62 | if errMsg != "" {
63 | return "", errors.New(errMsg)
64 | }
65 | return valueC.String(), nil
66 | }
67 |
68 | // HasAttribute returns whether the element has the attribute of this name or not.
69 | func (e *Element) HasAttribute(name string) (bool, error) {
70 | nameC := C.CString(name)
71 | defer C.free(unsafe.Pointer(nameC))
72 | nameLen := len(name)
73 | codeC := C.lol_html_element_has_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen))
74 | if codeC == 1 {
75 | return true, nil
76 | } else if codeC == 0 {
77 | return false, nil
78 | }
79 | return false, getError()
80 | }
81 |
82 | // SetAttribute updates or creates the attribute with name and value on the element.
83 | func (e *Element) SetAttribute(name string, value string) error {
84 | nameC := C.CString(name)
85 | defer C.free(unsafe.Pointer(nameC))
86 | nameLen := len(name)
87 | valueC := C.CString(value)
88 | defer C.free(unsafe.Pointer(valueC))
89 | valueLen := len(value)
90 | errCode := C.lol_html_element_set_attribute(
91 | (*C.lol_html_element_t)(e),
92 | nameC,
93 | C.size_t(nameLen),
94 | valueC,
95 | C.size_t(valueLen),
96 | )
97 | if errCode == 0 {
98 | return nil
99 | }
100 | return getError()
101 | }
102 |
103 | // RemoveAttribute removes the attribute with the name from the element.
104 | func (e *Element) RemoveAttribute(name string) error {
105 | nameC := C.CString(name)
106 | defer C.free(unsafe.Pointer(nameC))
107 | nameLen := len(name)
108 | errCode := C.lol_html_element_remove_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen))
109 | if errCode == 0 {
110 | return nil
111 | }
112 | return getError()
113 | }
114 |
// elementAlter selects which content operation the alter method performs on an element.
115 | type elementAlter int
116 |
// The operations understood by alter; each one is dispatched to a different
// lol_html_element_* C function.
117 | const (
118 | elementInsertBeforeStartTag elementAlter = iota
119 | elementInsertAfterStartTag
120 | elementInsertBeforeEndTag
121 | elementInsertAfterEndTag
122 | elementSetInnerContent
123 | elementReplace
124 | )
125 |
126 | func (e *Element) alter(content string, alter elementAlter, isHTML bool) error {
127 | contentC := C.CString(content)
128 | defer C.free(unsafe.Pointer(contentC))
129 | contentLen := len(content)
130 | var errCode C.int
131 | switch alter {
132 | case elementInsertBeforeStartTag:
133 | errCode = C.lol_html_element_before((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
134 | case elementInsertAfterStartTag:
135 | errCode = C.lol_html_element_prepend((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
136 | case elementInsertBeforeEndTag:
137 | errCode = C.lol_html_element_append((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
138 | case elementInsertAfterEndTag:
139 | errCode = C.lol_html_element_after((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
140 | case elementSetInnerContent:
141 | errCode = C.lol_html_element_set_inner_content((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
142 | case elementReplace:
143 | errCode = C.lol_html_element_replace((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
144 | default:
145 | panic("not implemented")
146 | }
147 | if errCode == 0 {
148 | return nil
149 | }
150 | return getError()
151 | }
152 |
153 | // InsertBeforeStartTagAsText inserts the given content before the element's start tag.
154 | //
155 | // The rewriter will HTML-escape the content before insertion:
156 | //
157 | // `<` will be replaced with `<`
158 | //
159 | // `>` will be replaced with `>`
160 | //
161 | // `&` will be replaced with `&`
162 | func (e *Element) InsertBeforeStartTagAsText(content string) error {
163 | return e.alter(content, elementInsertBeforeStartTag, false)
164 | }
165 |
166 | // InsertBeforeStartTagAsHTML inserts the given content before the element's start tag.
167 | // The content is inserted as is.
168 | func (e *Element) InsertBeforeStartTagAsHTML(content string) error {
169 | return e.alter(content, elementInsertBeforeStartTag, true)
170 | }
171 |
172 | // InsertAfterStartTagAsText inserts (prepend) the given content after the element's start tag.
173 | //
174 | // The rewriter will HTML-escape the content before insertion:
175 | //
176 | // `<` will be replaced with `<`
177 | //
178 | // `>` will be replaced with `>`
179 | //
180 | // `&` will be replaced with `&`
181 | func (e *Element) InsertAfterStartTagAsText(content string) error {
182 | return e.alter(content, elementInsertAfterStartTag, false)
183 | }
184 |
185 | // InsertAfterStartTagAsHTML inserts (prepend) the given content after the element's start tag.
186 | // The content is inserted as is.
187 | func (e *Element) InsertAfterStartTagAsHTML(content string) error {
188 | return e.alter(content, elementInsertAfterStartTag, true)
189 | }
190 |
191 | // InsertBeforeEndTagAsText inserts (append) the given content after the element's end tag.
192 | //
193 | // The rewriter will HTML-escape the content before insertion:
194 | //
195 | // `<` will be replaced with `<`
196 | //
197 | // `>` will be replaced with `>`
198 | //
199 | // `&` will be replaced with `&`
200 | func (e *Element) InsertBeforeEndTagAsText(content string) error {
201 | return e.alter(content, elementInsertBeforeEndTag, false)
202 | }
203 |
204 | // InsertBeforeEndTagAsHTML inserts (append) the given content before the element's end tag.
205 | // The content is inserted as is.
206 | func (e *Element) InsertBeforeEndTagAsHTML(content string) error {
207 | return e.alter(content, elementInsertBeforeEndTag, true)
208 | }
209 |
210 | // InsertAfterEndTagAsText inserts the given content after the element's end tag.
211 | //
212 | // The rewriter will HTML-escape the content before insertion:
213 | //
214 | // `<` will be replaced with `<`
215 | //
216 | // `>` will be replaced with `>`
217 | //
218 | // `&` will be replaced with `&`
219 | func (e *Element) InsertAfterEndTagAsText(content string) error {
220 | return e.alter(content, elementInsertAfterEndTag, false)
221 | }
222 |
223 | // InsertAfterEndTagAsHTML inserts the given content after the element's end tag.
224 | // The content is inserted as is.
225 | func (e *Element) InsertAfterEndTagAsHTML(content string) error {
226 | return e.alter(content, elementInsertAfterEndTag, true)
227 | }
228 |
229 | // SetInnerContentAsText overwrites the element's inner content.
230 | //
231 | // The rewriter will HTML-escape the content:
232 | //
233 | // `<` will be replaced with `<`
234 | //
235 | // `>` will be replaced with `>`
236 | //
237 | // `&` will be replaced with `&`
238 | func (e *Element) SetInnerContentAsText(content string) error {
239 | return e.alter(content, elementSetInnerContent, false)
240 | }
241 |
242 | // SetInnerContentAsHTML overwrites the element's inner content.
243 | // The content is kept as is.
244 | func (e *Element) SetInnerContentAsHTML(content string) error {
245 | return e.alter(content, elementSetInnerContent, true)
246 | }
247 |
248 | // ReplaceAsText replace the whole element with the supplied content.
249 | //
250 | // The rewriter will HTML-escape the content:
251 | //
252 | // `<` will be replaced with `<`
253 | //
254 | // `>` will be replaced with `>`
255 | //
256 | // `&` will be replaced with `&`
257 | func (e *Element) ReplaceAsText(content string) error {
258 | return e.alter(content, elementReplace, false)
259 | }
260 |
261 | // ReplaceAsHTML replace the whole element with the supplied content.
262 | // The content is kept as is.
263 | func (e *Element) ReplaceAsHTML(content string) error {
264 | return e.alter(content, elementReplace, true)
265 | }
266 |
267 | // Remove completely removes the element.
268 | func (e *Element) Remove() {
269 | C.lol_html_element_remove((*C.lol_html_element_t)(e))
270 | }
271 |
272 | // RemoveAndKeepContent removes the element but keeps the inner content.
273 | func (e *Element) RemoveAndKeepContent() {
274 | C.lol_html_element_remove_and_keep_content((*C.lol_html_element_t)(e))
275 | }
276 |
277 | // IsRemoved returns whether the element is removed or not.
278 | func (e *Element) IsRemoved() bool {
279 | return (bool)(C.lol_html_element_is_removed((*C.lol_html_element_t)(e)))
280 | }
281 |
--------------------------------------------------------------------------------
/element_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml_test
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/coolspring8/go-lolhtml"
8 | )
9 |
// TestElement_ModifyTagName checks, for every matched element, that TagName returns
// "div", that SetTagName with an empty name fails with "Tag name can't be empty." and
// that SetTagName("span") succeeds. The rewritten output is then compared with
// wantedText.
// NOTE(review): the input and expected string literals below look truncated (markup
// appears to have been stripped from this copy) - restore them before relying on this test.
10 | func TestElement_ModifyTagName(t *testing.T) {
11 | var buf bytes.Buffer
12 | w, err := lolhtml.NewWriter(
13 | &buf,
14 | &lolhtml.Handlers{
15 | ElementContentHandler: []lolhtml.ElementContentHandler{
16 | {
17 | Selector: "*",
18 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
19 | wantName := "div"
20 | if name := e.TagName(); name != wantName {
21 | t.Errorf("got %s want %s\n", name, wantName)
22 | }
23 | err := e.SetTagName("")
24 | if err == nil {
25 | t.FailNow()
26 | }
27 | if err.Error() != "Tag name can't be empty." {
28 | t.Error(err)
29 | }
30 | if err = e.SetTagName("span"); err != nil {
31 | t.Error(err)
32 | }
33 | return lolhtml.Continue
34 | },
35 | },
36 | },
37 | },
38 | )
39 | if err != nil {
40 | t.Error(err)
41 | }
42 |
43 | if _, err = w.Write([]byte("Hi ")); err != nil {
44 | t.Error(err)
45 | }
46 | if err = w.Close(); err != nil {
47 | t.Error(err)
48 | }
49 | wantedText := "Hi
"
50 | if finalText := buf.String(); finalText != wantedText {
51 | t.Errorf("want %s got %s \n", wantedText, finalText)
52 | }
53 | }
54 |
// TestElement_ModifyAttributes checks attribute access on every matched element:
// HasAttribute must report "foo" as present and "Bar" as absent, AttributeValue must
// return "42" for "foo" and an empty string without error for "Bar". The handler then
// sets "Bar" to "hey" and removes "foo"; the rewritten output is compared with wantedText.
// NOTE(review): the input and expected string literals below are empty in this copy
// (markup appears to have been stripped) - restore them before relying on this test.
55 | func TestElement_ModifyAttributes(t *testing.T) {
56 | var buf bytes.Buffer
57 | w, err := lolhtml.NewWriter(
58 | &buf,
59 | &lolhtml.Handlers{
60 | ElementContentHandler: []lolhtml.ElementContentHandler{
61 | {
62 | Selector: "*",
63 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
64 | has, err := e.HasAttribute("foo")
65 | if err != nil {
66 | t.Error(err)
67 | }
68 | if !has {
69 | t.FailNow()
70 | }
71 | has, err = e.HasAttribute("Bar")
72 | if err != nil {
73 | t.Error(err)
74 | }
75 | if has {
76 | t.FailNow()
77 | }
78 |
79 | a, err := e.AttributeValue("foo")
80 | if err != nil {
81 | t.Error(err)
82 | }
83 | wantValue := "42"
84 | if a != wantValue {
85 | t.Errorf("got %s; want %s", a, wantValue)
86 | }
87 | a, err = e.AttributeValue("Bar")
88 | if err != nil {
89 | t.Error(err)
90 | }
91 | if a != "" {
92 | t.Errorf("got %s; want empty", a)
93 | }
94 |
95 | if err := e.SetAttribute("Bar", "hey"); err != nil {
96 | t.Error(err)
97 | }
98 |
99 | if err := e.RemoveAttribute("foo"); err != nil {
100 | t.Error(err)
101 | }
102 |
103 | return lolhtml.Continue
104 | },
105 | },
106 | },
107 | },
108 | )
109 | if err != nil {
110 | t.Error(err)
111 | }
112 |
113 | if _, err = w.Write([]byte("")); err != nil {
114 | t.Error(err)
115 | }
116 | if err = w.Close(); err != nil {
117 | t.Error(err)
118 | }
119 | wantedText := ""
120 | if finalText := buf.String(); finalText != wantedText {
121 | t.Errorf("want %s got %s \n", wantedText, finalText)
122 | }
123 | }
124 |
// TestElement_InsertContentAroundElement calls, on every matched element,
// InsertBeforeStartTagAsText, InsertAfterStartTagAsHTML, InsertBeforeEndTagAsHTML and
// InsertAfterEndTagAsText, and compares the rewritten output with wantedText.
// NOTE(review): several string literals below look truncated (markup and entities
// appear to have been stripped from this copy) - restore them before relying on this test.
125 | func TestElement_InsertContentAroundElement(t *testing.T) {
126 | var buf bytes.Buffer
127 | w, err := lolhtml.NewWriter(
128 | &buf,
129 | &lolhtml.Handlers{
130 | ElementContentHandler: []lolhtml.ElementContentHandler{
131 | {
132 | Selector: "*",
133 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
134 | if err := e.InsertBeforeStartTagAsText("&before"); err != nil {
135 | t.Error(err)
136 | }
137 | if err := e.InsertAfterStartTagAsHTML(""); err != nil {
138 | t.Error(err)
139 | }
140 | if err := e.InsertBeforeEndTagAsHTML(""); err != nil {
141 | t.Error(err)
142 | }
143 | if err := e.InsertAfterEndTagAsText("&after"); err != nil {
144 | t.Error(err)
145 | }
146 | return lolhtml.Continue
147 | },
148 | },
149 | },
150 | },
151 | )
152 | if err != nil {
153 | t.Error(err)
154 | }
155 |
156 | if _, err = w.Write([]byte("Hi
")); err != nil {
157 | t.Error(err)
158 | }
159 | if err = w.Close(); err != nil {
160 | t.Error(err)
161 | }
162 | wantedText := "&beforeHi
&after"
163 | if finalText := buf.String(); finalText != wantedText {
164 | t.Errorf("want %s got %s \n", wantedText, finalText)
165 | }
166 | }
167 |
// TestElement_SetInnerContent calls SetInnerContentAsText("hey & ya") on every element
// matched by "div" and compares the rewritten output with wantedText.
// NOTE(review): the input and expected string literals below look truncated (markup
// and entities appear to have been stripped from this copy) - restore them before
// relying on this test.
168 | func TestElement_SetInnerContent(t *testing.T) {
169 | var buf bytes.Buffer
170 | w, err := lolhtml.NewWriter(
171 | &buf,
172 | &lolhtml.Handlers{
173 | ElementContentHandler: []lolhtml.ElementContentHandler{
174 | {
175 | Selector: "div",
176 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
177 | if err := e.SetInnerContentAsText("hey & ya"); err != nil {
178 | t.Error(err)
179 | }
180 | return lolhtml.Continue
181 | },
182 | },
183 | },
184 | },
185 | )
186 | if err != nil {
187 | t.Error(err)
188 | }
189 |
190 | if _, err = w.Write([]byte("42
")); err != nil {
191 | t.Error(err)
192 | }
193 | if err = w.Close(); err != nil {
194 | t.Error(err)
195 | }
196 | wantedText := "hey & ya
"
197 | if finalText := buf.String(); finalText != wantedText {
198 | t.Errorf("want %s got %s \n", wantedText, finalText)
199 | }
200 | }
201 |
// TestElement_Replace calls ReplaceAsHTML("hey & ya") on every element matched by "div"
// and compares the rewritten output with wantedText.
// NOTE(review): the input and expected string literals below look truncated (markup
// appears to have been stripped from this copy) - restore them before relying on this test.
202 | func TestElement_Replace(t *testing.T) {
203 | var buf bytes.Buffer
204 | w, err := lolhtml.NewWriter(
205 | &buf,
206 | &lolhtml.Handlers{
207 | ElementContentHandler: []lolhtml.ElementContentHandler{
208 | {
209 | Selector: "div",
210 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
211 | if err := e.ReplaceAsHTML("hey & ya"); err != nil {
212 | t.Error(err)
213 | }
214 | return lolhtml.Continue
215 | },
216 | },
217 | },
218 | },
219 | )
220 | if err != nil {
221 | t.Error(err)
222 | }
223 |
224 | if _, err = w.Write([]byte("42
Hello good bye
Hello2 ")); err != nil {
225 | t.Error(err)
226 | }
227 | if err = w.Close(); err != nil {
228 | t.Error(err)
229 | }
230 | wantedText := "hey & yaHellohey & ya Hello2 "
231 | if finalText := buf.String(); finalText != wantedText {
232 | t.Errorf("want %s got %s \n", wantedText, finalText)
233 | }
234 | }
235 |
// TestElement_Remove checks, for every element matched by "h1", that IsRemoved is false
// before Remove and true after it, and compares the rewritten output with wantedText.
// NOTE(review): the input and expected string literals below look truncated (markup
// appears to have been stripped from this copy) - restore them before relying on this test.
236 | func TestElement_Remove(t *testing.T) {
237 | var buf bytes.Buffer
238 | w, err := lolhtml.NewWriter(
239 | &buf,
240 | &lolhtml.Handlers{
241 | ElementContentHandler: []lolhtml.ElementContentHandler{
242 | {
243 | Selector: "h1",
244 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
245 | if e.IsRemoved() {
246 | t.FailNow()
247 | }
248 | e.Remove()
249 | if !e.IsRemoved() {
250 | t.FailNow()
251 | }
252 | return lolhtml.Continue
253 | },
254 | },
255 | },
256 | },
257 | )
258 | if err != nil {
259 | t.Error(err)
260 | }
261 |
262 | if _, err = w.Write([]byte("42
Hello Hello2 ")); err != nil {
263 | t.Error(err)
264 | }
265 | if err = w.Close(); err != nil {
266 | t.Error(err)
267 | }
268 | wantedText := "42
Hello2 "
269 | if finalText := buf.String(); finalText != wantedText {
270 | t.Errorf("want %s got %s \n", wantedText, finalText)
271 | }
272 | }
273 |
// TestElement_RemoveElementAndKeepContent checks, for every element matched by "h2",
// that IsRemoved is false before RemoveAndKeepContent and true after it, and compares
// the rewritten output with wantedText.
// NOTE(review): the input and expected string literals below look truncated (markup
// appears to have been stripped from this copy) - restore them before relying on this test.
274 | func TestElement_RemoveElementAndKeepContent(t *testing.T) {
275 | var buf bytes.Buffer
276 | w, err := lolhtml.NewWriter(
277 | &buf,
278 | &lolhtml.Handlers{
279 | ElementContentHandler: []lolhtml.ElementContentHandler{
280 | {
281 | Selector: "h2",
282 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
283 | if e.IsRemoved() {
284 | t.FailNow()
285 | }
286 | e.RemoveAndKeepContent()
287 | if !e.IsRemoved() {
288 | t.FailNow()
289 | }
290 | return lolhtml.Continue
291 | },
292 | },
293 | },
294 | },
295 | )
296 | if err != nil {
297 | t.Error(err)
298 | }
299 |
300 | if _, err = w.Write([]byte("42Hello1
Hello Hello2 ")); err != nil {
301 | t.Error(err)
302 | }
303 | if err = w.Close(); err != nil {
304 | t.Error(err)
305 | }
306 | wantedText := "42Hello1
Hello Hello2"
307 | if finalText := buf.String(); finalText != wantedText {
308 | t.Errorf("want %s got %s \n", wantedText, finalText)
309 | }
310 | }
311 |
// TestElement_GetEmptyElementAttribute registers a "span" element handler
// that requires the attribute "foo" to be present (HasAttribute) and its
// value to be the empty string (AttributeValue). After writing and closing,
// the output collected in buf is compared against wantedText.
//
// NOTE(review): both the input passed to Write and wantedText are empty
// strings in this listing; the original HTML was presumably stripped during
// extraction, in which case the handler would never fire — confirm against
// the upstream test.
312 | func TestElement_GetEmptyElementAttribute(t *testing.T) {
313 | var buf bytes.Buffer
314 | w, err := lolhtml.NewWriter(
315 | &buf,
316 | &lolhtml.Handlers{
317 | ElementContentHandler: []lolhtml.ElementContentHandler{
318 | {
319 | Selector: "span",
320 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
      // The attribute must exist...
321 | has, err := e.HasAttribute("foo")
322 | if err != nil {
323 | t.Error(err)
324 | }
325 | if !has {
326 | t.FailNow()
327 | }
      // ...and its value must be empty.
328 | value, err := e.AttributeValue("foo")
329 | if err != nil {
330 | t.Error(err)
331 | }
332 | if value != "" {
333 | t.Errorf("got %s; want empty", value)
334 | }
335 | return lolhtml.Continue
336 | },
337 | },
338 | },
339 | },
340 | )
      // t.Error records the failure but does not stop the test, so w is still
      // used below even when construction failed.
341 | if err != nil {
342 | t.Error(err)
343 | }
344 |
345 | if _, err = w.Write([]byte("")); err != nil {
346 | t.Error(err)
347 | }
348 | if err = w.Close(); err != nil {
349 | t.Error(err)
350 | }
351 | wantedText := ""
352 | if finalText := buf.String(); finalText != wantedText {
353 | t.Errorf("want %s got %s \n", wantedText, finalText)
354 | }
355 | }
356 |
// TestElement_IterateAttributes registers a "*" element handler that walks
// e.AttributeIterator() and expects exactly two attributes, in order:
// foo="42" and bar="1337". A third call to Next must return nil.
//
// The writer is created with a nil destination; this test only inspects what
// the handler observes, not the rewritten output.
//
// NOTE(review): the input passed to Write is an empty string in this listing;
// the original HTML was presumably stripped during extraction — confirm
// against the upstream test.
357 | func TestElement_IterateAttributes(t *testing.T) {
358 | w, err := lolhtml.NewWriter(
359 | nil,
360 | &lolhtml.Handlers{
361 | ElementContentHandler: []lolhtml.ElementContentHandler{
362 | {
363 | Selector: "*",
364 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
365 | ai := e.AttributeIterator()
366 |
      // First attribute: foo="42".
      // NOTE(review): a is not nil-checked before use; with fewer attributes
      // than expected this may panic instead of failing cleanly.
367 | a := ai.Next()
368 | if name := a.Name(); name != "foo" {
369 | t.Errorf("got %s; want foo", name)
370 | }
371 | if value := a.Value(); value != "42" {
372 | t.Errorf("got %s; want 42", value)
373 | }
374 |
      // Second attribute: bar="1337".
375 | a = ai.Next()
376 | if name := a.Name(); name != "bar" {
377 | t.Errorf("got %s; want bar", name)
378 | }
379 | if value := a.Value(); value != "1337" {
380 | t.Errorf("got %s; want 1337", value)
381 | }
382 |
      // The iterator must be exhausted after two attributes.
383 | a = ai.Next()
384 | if a != nil {
385 | t.FailNow()
386 | }
387 |
388 | return lolhtml.Continue
389 | },
390 | },
391 | },
392 | },
393 | )
      // t.Error records the failure but does not stop the test, so w is still
      // used below even when construction failed.
394 | if err != nil {
395 | t.Error(err)
396 | }
397 |
398 | if _, err = w.Write([]byte("")); err != nil {
399 | t.Error(err)
400 | }
401 | if err = w.Close(); err != nil {
402 | t.Error(err)
403 | }
404 | }
405 |
// TestElement_AssertNsIsHtml registers a "script" element handler and checks
// that e.NamespaceURI() equals "http://www.w3.org/1999/xhtml".
//
// The writer is created with a nil destination; only the value seen inside
// the handler is asserted.
//
// NOTE(review): the input passed to Write is an empty string in this listing;
// the original HTML was presumably stripped during extraction — confirm
// against the upstream test.
406 | func TestElement_AssertNsIsHtml(t *testing.T) {
407 | w, err := lolhtml.NewWriter(
408 | nil,
409 | &lolhtml.Handlers{
410 | ElementContentHandler: []lolhtml.ElementContentHandler{
411 | {
412 | Selector: "script",
413 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
      // Expected namespace URI for the matched element.
414 | wantedText := "http://www.w3.org/1999/xhtml"
415 | if ns := e.NamespaceURI(); ns != wantedText {
416 | t.Errorf("got %s; want %s", ns, wantedText)
417 | }
418 | return lolhtml.Continue
419 | },
420 | },
421 | },
422 | },
423 | )
      // t.Error records the failure but does not stop the test, so w is still
      // used below even when construction failed.
424 | if err != nil {
425 | t.Error(err)
426 | }
427 |
428 | if _, err = w.Write([]byte("")); err != nil {
429 | t.Error(err)
430 | }
431 | if err = w.Close(); err != nil {
432 | t.Error(err)
433 | }
434 | }
435 |
// TestElement_AssertNsIsSvg registers a "script" element handler and checks
// that e.NamespaceURI() equals "http://www.w3.org/2000/svg".
//
// The writer is created with a nil destination; only the value seen inside
// the handler is asserted.
//
// NOTE(review): the input literal passed to Write contains only a raw line
// break in this listing; the original markup was presumably stripped during
// extraction — confirm against the upstream test.
436 | func TestElement_AssertNsIsSvg(t *testing.T) {
437 | w, err := lolhtml.NewWriter(
438 | nil,
439 | &lolhtml.Handlers{
440 | ElementContentHandler: []lolhtml.ElementContentHandler{
441 | {
442 | Selector: "script",
443 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
      // Expected namespace URI for the matched element.
444 | wantedText := "http://www.w3.org/2000/svg"
445 | if ns := e.NamespaceURI(); ns != wantedText {
446 | t.Errorf("got %s; want %s", ns, wantedText)
447 | }
448 | return lolhtml.Continue
449 | },
450 | },
451 | },
452 | },
453 | )
      // t.Error records the failure but does not stop the test, so w is still
      // used below even when construction failed.
454 | if err != nil {
455 | t.Error(err)
456 | }
457 |
458 | if _, err = w.Write([]byte("
")); err != nil {
459 | t.Error(err)
460 | }
461 | if err = w.Close(); err != nil {
462 | t.Error(err)
463 | }
464 | }
465 |
// TestElement_StopRewriting registers a "span" element handler that returns
// lolhtml.Stop and then requires both Write and the subsequent Close to fail
// with the exact message "The rewriter has been stopped.".
//
// The writer is created with a nil destination; only the returned errors are
// asserted.
//
// NOTE(review): the input literal passed to Write contains only a raw line
// break in this listing; the original markup was presumably stripped during
// extraction — confirm against the upstream test.
466 | func TestElement_StopRewriting(t *testing.T) {
467 | w, err := lolhtml.NewWriter(
468 | nil,
469 | &lolhtml.Handlers{
470 | ElementContentHandler: []lolhtml.ElementContentHandler{
471 | {
472 | Selector: "span",
473 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
474 | return lolhtml.Stop
475 | },
476 | },
477 | },
478 | },
479 | )
      // t.Error records the failure but does not stop the test, so w is still
      // used below even when construction failed.
480 | if err != nil {
481 | t.Error(err)
482 | }
483 |
      // Write must return an error; a nil error aborts the test before the
      // err.Error() call below.
484 | _, err = w.Write([]byte("
"))
485 | if err == nil {
486 | t.FailNow()
487 | }
488 | if err.Error() != "The rewriter has been stopped." {
489 | t.Error(err)
490 | }
      // Close is expected to fail with the same message as Write.
491 | err = w.Close()
492 | if err == nil {
493 | t.FailNow()
494 | }
495 | if err.Error() != "The rewriter has been stopped." {
496 | t.Error(err)
497 | }
498 | }
499 |
--------------------------------------------------------------------------------
/examples/web-scraper/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Web Scraper · By Adam Schwartz · Powered by Cloudflare Workers®
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
238 |
239 |
240 |
241 |
242 |
246 |
247 |
291 |
292 |
293 |
294 |
Update preview Scrape
295 |
Permalink
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
About
307 |
308 |
Web Scraper makes it effortless to scrape websites. Just provide a URL and CSS selector and it will return JSON containing the text contents of the matching elements.
309 |
Built by Adam Schwartz using Cloudflare Workers, open-source and available on GitHub.
310 |
311 |
312 |
313 | Done
314 | View example
315 |
316 |
317 |
322 |
323 |
324 |
325 |
458 |
459 |
--------------------------------------------------------------------------------