"); err != nil {
161 | t.Error(err)
162 | }
163 | if err := c.InsertAfterAsText("
"); err != nil {
164 | t.Error(err)
165 | }
166 | return lolhtml.Continue
167 | },
168 | },
169 | },
170 | },
171 | )
172 | if err != nil {
173 | t.Error(err)
174 | }
175 |
176 | if _, err := w.Write([]byte("")); err != nil {
177 | t.Error(err)
178 | }
179 | if err := w.Close(); err != nil {
180 | t.Error(err)
181 | }
182 | wantedText := "</div>"
183 | if finalText := buf.String(); finalText != wantedText {
184 | t.Errorf("want %s got %s \n", wantedText, finalText)
185 | }
186 | }
187 |
// TestComment_StopRewriting registers a document-level CommentHandler that returns
// lolhtml.Stop and expects Write to fail with "The rewriter has been stopped.".
//
// NOTE(review): the literal passed to Write spans a line break and looks damaged in this
// copy (markup apparently stripped) — restore it from the upstream test before relying on it.
func TestComment_StopRewriting(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			DocumentContentHandler: []lolhtml.DocumentContentHandler{
				{
					CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective {
						return lolhtml.Stop
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	_, err = w.Write([]byte("
"))
	// A nil error means the Stop directive was not honoured.
	if err == nil {
		t.FailNow()
	}
	if err.Error() != "The rewriter has been stopped." {
		t.Error(err)
	}
}
214 |
// TestComment_StopRewritingWithSelector registers a CommentHandler under the "*" selector
// that returns lolhtml.Stop and expects Write to fail with "The rewriter has been stopped.".
//
// NOTE(review): the literal passed to Write spans a line break and looks damaged in this
// copy (markup apparently stripped) — restore it from the upstream test before relying on it.
func TestComment_StopRewritingWithSelector(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			ElementContentHandler: []lolhtml.ElementContentHandler{
				{
					Selector: "*",
					CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective {
						return lolhtml.Stop
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	_, err = w.Write([]byte("
"))
	// A nil error means the Stop directive was not honoured.
	if err == nil {
		t.FailNow()
	}
	if err.Error() != "The rewriter has been stopped." {
		t.Error(err)
	}
}
242 |
--------------------------------------------------------------------------------
/config.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include "lol_html.h"
5 | */
6 | import "C"
7 | import (
8 | "unsafe"
9 | )
10 |
// Config defines settings for the rewriter. The default for each field, as produced by
// newDefaultConfig, is noted on the field.
type Config struct {
	// Encoding defaults to "utf-8".
	Encoding string
	// Memory defaults to PreallocatedParsingBufferSize: 1024, MaxAllowedMemoryUsage: 1<<63 - 1.
	Memory *MemorySettings
	// Sink defaults to func([]byte) {}. In other words, output is discarded entirely.
	Sink OutputSink
	// Strict defaults to true. If true, bail out for security reasons when ambiguous.
	Strict bool
}
22 |
23 | func newDefaultConfig() Config {
24 | return Config{
25 | Encoding: "utf-8",
26 | Memory: &MemorySettings{
27 | PreallocatedParsingBufferSize: 1024,
28 | MaxAllowedMemoryUsage: 1<<63 - 1,
29 | },
30 | Sink: func([]byte) {},
31 | Strict: true,
32 | }
33 | }
34 |
// MemorySettings sets the memory limitations for the rewriter.
type MemorySettings struct {
	// PreallocatedParsingBufferSize defaults to 1024.
	PreallocatedParsingBufferSize int
	// MaxAllowedMemoryUsage defaults to 1<<63 - 1.
	MaxAllowedMemoryUsage int
}
40 |
// OutputSink is a callback function that output is written to. It is called with a byte
// slice each time, representing one chunk of output.
//
// Exported for special usages which require each output chunk to be identified and processed
// individually. For most common uses, NewWriter would be more convenient.
type OutputSink func([]byte)
47 |
// DocumentContentHandler is a group of handlers that would be applied to the whole HTML document.
// It holds one optional callback per kind of document-level content.
type DocumentContentHandler struct {
	DoctypeHandler     DoctypeHandlerFunc     // called with a *Doctype
	CommentHandler     CommentHandlerFunc     // called with a *Comment
	TextChunkHandler   TextChunkHandlerFunc   // called with a *TextChunk
	DocumentEndHandler DocumentEndHandlerFunc // called with a *DocumentEnd
}
55 |
// ElementContentHandler is a group of handlers that would be applied to the content matched by
// the given selector.
type ElementContentHandler struct {
	// Selector picks the content the handlers below apply to (for example "*" or "div").
	Selector         string
	ElementHandler   ElementHandlerFunc   // called with a *Element
	CommentHandler   CommentHandlerFunc   // called with a *Comment
	TextChunkHandler TextChunkHandlerFunc // called with a *TextChunk
}
64 |
// Handlers contains DocumentContentHandlers and ElementContentHandlers. It can hold an
// arbitrary number of each, including zero (nil slice).
type Handlers struct {
	DocumentContentHandler []DocumentContentHandler
	ElementContentHandler  []ElementContentHandler
}
71 |
72 | //export callbackSink
73 | func callbackSink(chunk *C.char, chunkLen C.size_t, userData unsafe.Pointer) {
74 | c := C.GoBytes(unsafe.Pointer(chunk), C.int(chunkLen))
75 | cb := restorePointer(userData).(OutputSink)
76 | cb(c)
77 | }
78 |
79 | //export callbackDoctype
80 | func callbackDoctype(doctype *Doctype, userData unsafe.Pointer) RewriterDirective {
81 | cb := restorePointer(userData).(DoctypeHandlerFunc)
82 | return cb(doctype)
83 | }
84 |
85 | //export callbackComment
86 | func callbackComment(comment *Comment, userData unsafe.Pointer) RewriterDirective {
87 | cb := restorePointer(userData).(CommentHandlerFunc)
88 | return cb(comment)
89 | }
90 |
91 | //export callbackTextChunk
92 | func callbackTextChunk(textChunk *TextChunk, userData unsafe.Pointer) RewriterDirective {
93 | cb := restorePointer(userData).(TextChunkHandlerFunc)
94 | return cb(textChunk)
95 | }
96 |
97 | //export callbackElement
98 | func callbackElement(element *Element, userData unsafe.Pointer) RewriterDirective {
99 | cb := restorePointer(userData).(ElementHandlerFunc)
100 | return cb(element)
101 | }
102 |
103 | //export callbackDocumentEnd
104 | func callbackDocumentEnd(documentEnd *DocumentEnd, userData unsafe.Pointer) RewriterDirective {
105 | cb := restorePointer(userData).(DocumentEndHandlerFunc)
106 | return cb(documentEnd)
107 | }
108 |
--------------------------------------------------------------------------------
/const.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | import "C"
4 |
// RewriterDirective is a "status code" that should be returned by callback handlers, to inform the
// rewriter to continue or stop parsing.
type RewriterDirective int
8 |
// Directive values a handler can return. Continue is 0 and Stop is 1 (assigned by iota).
const (
	// Continue lets the normal parsing process continue.
	Continue RewriterDirective = iota

	// Stop stops the rewriter immediately. Content currently buffered is discarded, and an error is returned.
	// After stopping, the Writer should not be used anymore except for Close().
	Stop
)
17 |
--------------------------------------------------------------------------------
/doctype.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include "lol_html.h"
5 | */
6 | import "C"
7 |
// Doctype represents the document's doctype. It is defined on the C type lol_html_doctype_t,
// so a *Doctype converts directly to the pointer the C API expects.
type Doctype C.lol_html_doctype_t
10 |
// DoctypeHandlerFunc is a callback handler function to do something with a Doctype.
type DoctypeHandlerFunc func(*Doctype) RewriterDirective
13 |
14 | // Name returns doctype name.
15 | func (d *Doctype) Name() string {
16 | nameC := (*str)(C.lol_html_doctype_name_get((*C.lol_html_doctype_t)(d)))
17 | defer nameC.Free()
18 | return nameC.String()
19 | }
20 |
21 | // PublicID returns doctype public ID.
22 | func (d *Doctype) PublicID() string {
23 | nameC := (*str)(C.lol_html_doctype_public_id_get((*C.lol_html_doctype_t)(d)))
24 | defer nameC.Free()
25 | return nameC.String()
26 | }
27 |
28 | // SystemID returns doctype system ID.
29 | func (d *Doctype) SystemID() string {
30 | nameC := (*str)(C.lol_html_doctype_system_id_get((*C.lol_html_doctype_t)(d)))
31 | defer nameC.Free()
32 | return nameC.String()
33 | }
34 |
--------------------------------------------------------------------------------
/doctype_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/coolspring8/go-lolhtml"
7 | )
8 |
9 | func TestDoctype_GetDoctypeFields(t *testing.T) {
10 | w, err := lolhtml.NewWriter(
11 | nil,
12 | &lolhtml.Handlers{
13 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
14 | {
15 | DoctypeHandler: func(doctype *lolhtml.Doctype) lolhtml.RewriterDirective {
16 | if name := doctype.Name(); name != "math" {
17 | t.Errorf("wrong doctype name %s\n", name)
18 | }
19 | if publicId := doctype.PublicID(); publicId != "" {
20 | t.Errorf("wrong doctype name %s\n", publicId)
21 | }
22 | if systemId := doctype.SystemID(); systemId != "http://www.w3.org/Math/DTD/mathml1/mathml.dtd" {
23 | t.Errorf("wrong doctype name %s\n", systemId)
24 | }
25 | return lolhtml.Continue
26 | },
27 | },
28 | },
29 | },
30 | )
31 | if err != nil {
32 | t.Error(err)
33 | }
34 |
35 | _, err = w.Write([]byte(``))
36 | if err != nil {
37 | t.Error(err)
38 | }
39 | err = w.Close()
40 | if err != nil {
41 | t.Error(err)
42 | }
43 | }
44 |
45 | func TestDoctype_StopRewriting(t *testing.T) {
46 | w, err := lolhtml.NewWriter(
47 | nil,
48 | &lolhtml.Handlers{
49 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
50 | {
51 | DoctypeHandler: func(d *lolhtml.Doctype) lolhtml.RewriterDirective {
52 | return lolhtml.Stop
53 | },
54 | },
55 | },
56 | },
57 | )
58 | if err != nil {
59 | t.Error(err)
60 | }
61 |
62 | _, err = w.Write([]byte(""))
63 | if err == nil {
64 | t.FailNow()
65 | }
66 | if err.Error() != "The rewriter has been stopped." {
67 | t.Error(err)
68 | }
69 | err = w.Close()
70 | if err == nil {
71 | t.FailNow()
72 | }
73 | if err.Error() != "The rewriter has been stopped." {
74 | t.Error(err)
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/documentend.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include <stdlib.h>
5 | #include "lol_html.h"
6 | */
7 | import "C"
8 | import "unsafe"
9 |
// DocumentEnd represents the end of the document. It is defined on the C type
// lol_html_doc_end_t, so a *DocumentEnd converts directly to the pointer the C API expects.
type DocumentEnd C.lol_html_doc_end_t
12 |
// DocumentEndHandlerFunc is a callback handler function to do something with a DocumentEnd.
// Its return value tells the rewriter whether to continue or stop.
type DocumentEndHandlerFunc func(*DocumentEnd) RewriterDirective
15 |
16 | // AppendAsText appends the given content at the end of the document.
17 | //
18 | // The rewriter will HTML-escape the content before appending:
19 | //
20 | // `<` will be replaced with `<`
21 | //
22 | // `>` will be replaced with `>`
23 | //
24 | // `&` will be replaced with `&`
25 | func (d *DocumentEnd) AppendAsText(content string) error {
26 | contentC := C.CString(content)
27 | defer C.free(unsafe.Pointer(contentC))
28 | contentLen := len(content)
29 | errCode := C.lol_html_doc_end_append((*C.lol_html_doc_end_t)(d), contentC, C.size_t(contentLen), false)
30 | if errCode == 0 {
31 | return nil
32 | }
33 | return getError()
34 | }
35 |
36 | // AppendAsHTML appends the given content at the end of the document.
37 | // The content is appended as is.
38 | func (d *DocumentEnd) AppendAsHTML(content string) error {
39 | contentC := C.CString(content)
40 | defer C.free(unsafe.Pointer(contentC))
41 | contentLen := len(content)
42 | errCode := C.lol_html_doc_end_append((*C.lol_html_doc_end_t)(d), contentC, C.size_t(contentLen), true)
43 | if errCode == 0 {
44 | return nil
45 | }
46 | return getError()
47 | }
48 |
--------------------------------------------------------------------------------
/documentend_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml_test
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/coolspring8/go-lolhtml"
8 | )
9 |
10 | func TestDocumentEnd_AppendToEmptyDoc(t *testing.T) {
11 | var buf bytes.Buffer
12 | w, err := lolhtml.NewWriter(
13 | &buf,
14 | &lolhtml.Handlers{
15 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
16 | {
17 | DocumentEndHandler: func(docEnd *lolhtml.DocumentEnd) lolhtml.RewriterDirective {
18 | if err := docEnd.AppendAsHTML(""); err != nil {
19 | t.Error(err)
20 | }
21 | if err := docEnd.AppendAsText("hello & world"); err != nil {
22 | t.Error(err)
23 | }
24 | return lolhtml.Continue
25 | },
26 | },
27 | },
28 | },
29 | )
30 | if err != nil {
31 | t.Error(err)
32 | }
33 |
34 | if _, err = w.Write([]byte("")); err != nil {
35 | t.Error(err)
36 | }
37 | if err = w.Close(); err != nil {
38 | t.Error(err)
39 | }
40 | wantedText := "hello & world"
41 | if finalText := buf.String(); finalText != wantedText {
42 | t.Errorf("want %s got %s \n", wantedText, finalText)
43 | }
44 | }
45 |
// TestDocumentEnd_AppendAtEnd writes a non-empty document and checks that content appended
// by a DocumentEndHandler ends up after it in the output.
//
// NOTE(review): the Write input and wantedText literals span line breaks and look damaged in
// this copy (markup apparently stripped) — restore them from the upstream test.
func TestDocumentEnd_AppendAtEnd(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			DocumentContentHandler: []lolhtml.DocumentContentHandler{
				{
					DocumentEndHandler: func(docEnd *lolhtml.DocumentEnd) lolhtml.RewriterDirective {
						if err := docEnd.AppendAsHTML(""); err != nil {
							t.Error(err)
						}
						if err := docEnd.AppendAsText("hello & world"); err != nil {
							t.Error(err)
						}
						return lolhtml.Continue
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	if _, err = w.Write([]byte("Hello
")); err != nil {
		t.Error(err)
	}
	if err = w.Close(); err != nil {
		t.Error(err)
	}
	wantedText := "Hello
hello & world"
	if finalText := buf.String(); finalText != wantedText {
		t.Errorf("want %s got %s \n", wantedText, finalText)
	}
}
81 |
--------------------------------------------------------------------------------
/element.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include <stdlib.h>
5 | #include "lol_html.h"
6 | */
7 | import "C"
8 | import (
9 | "errors"
10 | "unsafe"
11 | )
12 |
// Element represents an HTML element. It is defined on the C type lol_html_element_t,
// so a *Element converts directly to the pointer the C API expects.
type Element C.lol_html_element_t
15 |
// ElementHandlerFunc is a callback handler function to do something with an Element.
// Its return value tells the rewriter whether to continue or stop.
type ElementHandlerFunc func(*Element) RewriterDirective
18 |
19 | // TagName gets the element's tag name.
20 | func (e *Element) TagName() string {
21 | tagNameC := (str)(C.lol_html_element_tag_name_get((*C.lol_html_element_t)(e)))
22 | defer tagNameC.Free()
23 | return tagNameC.String()
24 | }
25 |
26 | // SetTagName sets the element's tag name.
27 | func (e *Element) SetTagName(name string) error {
28 | nameC := C.CString(name)
29 | defer C.free(unsafe.Pointer(nameC))
30 | nameLen := len(name)
31 | errCode := C.lol_html_element_tag_name_set((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen))
32 | if errCode == 0 {
33 | return nil
34 | }
35 | return getError()
36 | }
37 |
38 | // NamespaceURI gets the element's namespace URI.
39 | func (e *Element) NamespaceURI() string {
40 | // don't need to be freed
41 | namespaceURIC := C.lol_html_element_namespace_uri_get((*C.lol_html_element_t)(e))
42 | return C.GoString(namespaceURIC)
43 | }
44 |
45 | // AttributeIterator returns a pointer to an AttributeIterator. Can be used to iterate
46 | // over all attributes of the element.
47 | func (e *Element) AttributeIterator() *AttributeIterator {
48 | return (*AttributeIterator)(C.lol_html_attributes_iterator_get((*C.lol_html_element_t)(e)))
49 | }
50 |
51 | // AttributeValue returns the value of the attribute on this element.
52 | func (e *Element) AttributeValue(name string) (string, error) {
53 | nameC := C.CString(name)
54 | defer C.free(unsafe.Pointer(nameC))
55 | nameLen := len(name)
56 | valueC := (*str)(C.lol_html_element_get_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen)))
57 | defer valueC.Free()
58 | // always check error, so not using getError()
59 | errC := (*str)(C.lol_html_take_last_error())
60 | defer errC.Free()
61 | errMsg := errC.String()
62 | if errMsg != "" {
63 | return "", errors.New(errMsg)
64 | }
65 | return valueC.String(), nil
66 | }
67 |
68 | // HasAttribute returns whether the element has the attribute of this name or not.
69 | func (e *Element) HasAttribute(name string) (bool, error) {
70 | nameC := C.CString(name)
71 | defer C.free(unsafe.Pointer(nameC))
72 | nameLen := len(name)
73 | codeC := C.lol_html_element_has_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen))
74 | if codeC == 1 {
75 | return true, nil
76 | } else if codeC == 0 {
77 | return false, nil
78 | }
79 | return false, getError()
80 | }
81 |
82 | // SetAttribute updates or creates the attribute with name and value on the element.
83 | func (e *Element) SetAttribute(name string, value string) error {
84 | nameC := C.CString(name)
85 | defer C.free(unsafe.Pointer(nameC))
86 | nameLen := len(name)
87 | valueC := C.CString(value)
88 | defer C.free(unsafe.Pointer(valueC))
89 | valueLen := len(value)
90 | errCode := C.lol_html_element_set_attribute(
91 | (*C.lol_html_element_t)(e),
92 | nameC,
93 | C.size_t(nameLen),
94 | valueC,
95 | C.size_t(valueLen),
96 | )
97 | if errCode == 0 {
98 | return nil
99 | }
100 | return getError()
101 | }
102 |
103 | // RemoveAttribute removes the attribute with the name from the element.
104 | func (e *Element) RemoveAttribute(name string) error {
105 | nameC := C.CString(name)
106 | defer C.free(unsafe.Pointer(nameC))
107 | nameLen := len(name)
108 | errCode := C.lol_html_element_remove_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen))
109 | if errCode == 0 {
110 | return nil
111 | }
112 | return getError()
113 | }
114 |
// elementAlter selects which content-altering C call (*Element).alter performs.
type elementAlter int

const (
	elementInsertBeforeStartTag elementAlter = iota // lol_html_element_before
	elementInsertAfterStartTag                      // lol_html_element_prepend
	elementInsertBeforeEndTag                       // lol_html_element_append
	elementInsertAfterEndTag                        // lol_html_element_after
	elementSetInnerContent                          // lol_html_element_set_inner_content
	elementReplace                                  // lol_html_element_replace
)
125 |
126 | func (e *Element) alter(content string, alter elementAlter, isHTML bool) error {
127 | contentC := C.CString(content)
128 | defer C.free(unsafe.Pointer(contentC))
129 | contentLen := len(content)
130 | var errCode C.int
131 | switch alter {
132 | case elementInsertBeforeStartTag:
133 | errCode = C.lol_html_element_before((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
134 | case elementInsertAfterStartTag:
135 | errCode = C.lol_html_element_prepend((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
136 | case elementInsertBeforeEndTag:
137 | errCode = C.lol_html_element_append((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
138 | case elementInsertAfterEndTag:
139 | errCode = C.lol_html_element_after((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
140 | case elementSetInnerContent:
141 | errCode = C.lol_html_element_set_inner_content((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
142 | case elementReplace:
143 | errCode = C.lol_html_element_replace((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML))
144 | default:
145 | panic("not implemented")
146 | }
147 | if errCode == 0 {
148 | return nil
149 | }
150 | return getError()
151 | }
152 |
// InsertBeforeStartTagAsText inserts the given content before the element's start tag.
//
// The rewriter will HTML-escape the content before insertion:
//
//	`<` will be replaced with `&lt;`
//
//	`>` will be replaced with `&gt;`
//
//	`&` will be replaced with `&amp;`
func (e *Element) InsertBeforeStartTagAsText(content string) error {
	return e.alter(content, elementInsertBeforeStartTag, false)
}
165 |
// InsertBeforeStartTagAsHTML inserts the given content before the element's start tag.
// The content is inserted as is, without escaping.
func (e *Element) InsertBeforeStartTagAsHTML(content string) error {
	return e.alter(content, elementInsertBeforeStartTag, true)
}
171 |
// InsertAfterStartTagAsText inserts (prepends) the given content after the element's start tag.
//
// The rewriter will HTML-escape the content before insertion:
//
//	`<` will be replaced with `&lt;`
//
//	`>` will be replaced with `&gt;`
//
//	`&` will be replaced with `&amp;`
func (e *Element) InsertAfterStartTagAsText(content string) error {
	return e.alter(content, elementInsertAfterStartTag, false)
}
184 |
// InsertAfterStartTagAsHTML inserts (prepends) the given content after the element's start tag.
// The content is inserted as is, without escaping.
func (e *Element) InsertAfterStartTagAsHTML(content string) error {
	return e.alter(content, elementInsertAfterStartTag, true)
}
190 |
// InsertBeforeEndTagAsText inserts (appends) the given content before the element's end tag.
//
// The rewriter will HTML-escape the content before insertion:
//
//	`<` will be replaced with `&lt;`
//
//	`>` will be replaced with `&gt;`
//
//	`&` will be replaced with `&amp;`
func (e *Element) InsertBeforeEndTagAsText(content string) error {
	return e.alter(content, elementInsertBeforeEndTag, false)
}
203 |
// InsertBeforeEndTagAsHTML inserts (appends) the given content before the element's end tag.
// The content is inserted as is, without escaping.
func (e *Element) InsertBeforeEndTagAsHTML(content string) error {
	return e.alter(content, elementInsertBeforeEndTag, true)
}
209 |
// InsertAfterEndTagAsText inserts the given content after the element's end tag.
//
// The rewriter will HTML-escape the content before insertion:
//
//	`<` will be replaced with `&lt;`
//
//	`>` will be replaced with `&gt;`
//
//	`&` will be replaced with `&amp;`
func (e *Element) InsertAfterEndTagAsText(content string) error {
	return e.alter(content, elementInsertAfterEndTag, false)
}
222 |
// InsertAfterEndTagAsHTML inserts the given content after the element's end tag.
// The content is inserted as is, without escaping.
func (e *Element) InsertAfterEndTagAsHTML(content string) error {
	return e.alter(content, elementInsertAfterEndTag, true)
}
228 |
// SetInnerContentAsText overwrites the element's inner content.
//
// The rewriter will HTML-escape the content:
//
//	`<` will be replaced with `&lt;`
//
//	`>` will be replaced with `&gt;`
//
//	`&` will be replaced with `&amp;`
func (e *Element) SetInnerContentAsText(content string) error {
	return e.alter(content, elementSetInnerContent, false)
}
241 |
// SetInnerContentAsHTML overwrites the element's inner content.
// The content is kept as is, without escaping.
func (e *Element) SetInnerContentAsHTML(content string) error {
	return e.alter(content, elementSetInnerContent, true)
}
247 |
// ReplaceAsText replaces the whole element with the supplied content.
//
// The rewriter will HTML-escape the content:
//
//	`<` will be replaced with `&lt;`
//
//	`>` will be replaced with `&gt;`
//
//	`&` will be replaced with `&amp;`
func (e *Element) ReplaceAsText(content string) error {
	return e.alter(content, elementReplace, false)
}
260 |
// ReplaceAsHTML replaces the whole element with the supplied content.
// The content is kept as is, without escaping.
func (e *Element) ReplaceAsHTML(content string) error {
	return e.alter(content, elementReplace, true)
}
266 |
267 | // Remove completely removes the element.
268 | func (e *Element) Remove() {
269 | C.lol_html_element_remove((*C.lol_html_element_t)(e))
270 | }
271 |
272 | // RemoveAndKeepContent removes the element but keeps the inner content.
273 | func (e *Element) RemoveAndKeepContent() {
274 | C.lol_html_element_remove_and_keep_content((*C.lol_html_element_t)(e))
275 | }
276 |
277 | // IsRemoved returns whether the element is removed or not.
278 | func (e *Element) IsRemoved() bool {
279 | return (bool)(C.lol_html_element_is_removed((*C.lol_html_element_t)(e)))
280 | }
281 |
--------------------------------------------------------------------------------
/element_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml_test
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/coolspring8/go-lolhtml"
8 | )
9 |
// TestElement_ModifyTagName checks TagName, that SetTagName rejects an empty name with
// "Tag name can't be empty.", and that SetTagName("span") succeeds.
//
// NOTE(review): the wantedText literal spans a line break and the Write input carries no
// markup in this copy (apparently stripped) — restore both from the upstream test.
func TestElement_ModifyTagName(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			ElementContentHandler: []lolhtml.ElementContentHandler{
				{
					Selector: "*",
					ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
						wantName := "div"
						if name := e.TagName(); name != wantName {
							t.Errorf("got %s want %s\n", name, wantName)
						}
						// An empty tag name must be rejected.
						err := e.SetTagName("")
						if err == nil {
							t.FailNow()
						}
						if err.Error() != "Tag name can't be empty." {
							t.Error(err)
						}
						if err = e.SetTagName("span"); err != nil {
							t.Error(err)
						}
						return lolhtml.Continue
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	if _, err = w.Write([]byte("Hi ")); err != nil {
		t.Error(err)
	}
	if err = w.Close(); err != nil {
		t.Error(err)
	}
	wantedText := "Hi
"
	if finalText := buf.String(); finalText != wantedText {
		t.Errorf("want %s got %s \n", wantedText, finalText)
	}
}
54 |
55 | func TestElement_ModifyAttributes(t *testing.T) {
56 | var buf bytes.Buffer
57 | w, err := lolhtml.NewWriter(
58 | &buf,
59 | &lolhtml.Handlers{
60 | ElementContentHandler: []lolhtml.ElementContentHandler{
61 | {
62 | Selector: "*",
63 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
64 | has, err := e.HasAttribute("foo")
65 | if err != nil {
66 | t.Error(err)
67 | }
68 | if !has {
69 | t.FailNow()
70 | }
71 | has, err = e.HasAttribute("Bar")
72 | if err != nil {
73 | t.Error(err)
74 | }
75 | if has {
76 | t.FailNow()
77 | }
78 |
79 | a, err := e.AttributeValue("foo")
80 | if err != nil {
81 | t.Error(err)
82 | }
83 | wantValue := "42"
84 | if a != wantValue {
85 | t.Errorf("got %s; want %s", a, wantValue)
86 | }
87 | a, err = e.AttributeValue("Bar")
88 | if err != nil {
89 | t.Error(err)
90 | }
91 | if a != "" {
92 | t.Errorf("got %s; want empty", a)
93 | }
94 |
95 | if err := e.SetAttribute("Bar", "hey"); err != nil {
96 | t.Error(err)
97 | }
98 |
99 | if err := e.RemoveAttribute("foo"); err != nil {
100 | t.Error(err)
101 | }
102 |
103 | return lolhtml.Continue
104 | },
105 | },
106 | },
107 | },
108 | )
109 | if err != nil {
110 | t.Error(err)
111 | }
112 |
113 | if _, err = w.Write([]byte("")); err != nil {
114 | t.Error(err)
115 | }
116 | if err = w.Close(); err != nil {
117 | t.Error(err)
118 | }
119 | wantedText := ""
120 | if finalText := buf.String(); finalText != wantedText {
121 | t.Errorf("want %s got %s \n", wantedText, finalText)
122 | }
123 | }
124 |
// TestElement_InsertContentAroundElement inserts content before the start tag, after the
// start tag, before the end tag and after the end tag of every matched element, then
// compares the rewritten output.
//
// NOTE(review): the Write input and wantedText literals span line breaks and the HTML
// arguments are empty in this copy (markup apparently stripped) — restore them from the
// upstream test.
func TestElement_InsertContentAroundElement(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			ElementContentHandler: []lolhtml.ElementContentHandler{
				{
					Selector: "*",
					ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
						if err := e.InsertBeforeStartTagAsText("&before"); err != nil {
							t.Error(err)
						}
						if err := e.InsertAfterStartTagAsHTML(""); err != nil {
							t.Error(err)
						}
						if err := e.InsertBeforeEndTagAsHTML(""); err != nil {
							t.Error(err)
						}
						if err := e.InsertAfterEndTagAsText("&after"); err != nil {
							t.Error(err)
						}
						return lolhtml.Continue
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	if _, err = w.Write([]byte("Hi
")); err != nil {
		t.Error(err)
	}
	if err = w.Close(); err != nil {
		t.Error(err)
	}
	wantedText := "&beforeHi
&after"
	if finalText := buf.String(); finalText != wantedText {
		t.Errorf("want %s got %s \n", wantedText, finalText)
	}
}
167 |
// TestElement_SetInnerContent replaces the inner content of every "div" with text via
// SetInnerContentAsText and compares the rewritten output.
//
// NOTE(review): the Write input and wantedText literals span line breaks in this copy
// (markup apparently stripped) — restore them from the upstream test.
func TestElement_SetInnerContent(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			ElementContentHandler: []lolhtml.ElementContentHandler{
				{
					Selector: "div",
					ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
						if err := e.SetInnerContentAsText("hey & ya"); err != nil {
							t.Error(err)
						}
						return lolhtml.Continue
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	if _, err = w.Write([]byte("42
")); err != nil {
		t.Error(err)
	}
	if err = w.Close(); err != nil {
		t.Error(err)
	}
	wantedText := "hey & ya
"
	if finalText := buf.String(); finalText != wantedText {
		t.Errorf("want %s got %s \n", wantedText, finalText)
	}
}
201 |
// TestElement_Replace replaces every "div" with HTML content via ReplaceAsHTML and compares
// the rewritten output.
//
// NOTE(review): the Write input literal spans several lines in this copy (markup apparently
// stripped) — restore it from the upstream test.
func TestElement_Replace(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			ElementContentHandler: []lolhtml.ElementContentHandler{
				{
					Selector: "div",
					ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
						if err := e.ReplaceAsHTML("hey & ya"); err != nil {
							t.Error(err)
						}
						return lolhtml.Continue
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	if _, err = w.Write([]byte("42
Hello good bye
Hello2 ")); err != nil {
		t.Error(err)
	}
	if err = w.Close(); err != nil {
		t.Error(err)
	}
	wantedText := "hey & yaHellohey & ya Hello2 "
	if finalText := buf.String(); finalText != wantedText {
		t.Errorf("want %s got %s \n", wantedText, finalText)
	}
}
235 |
// TestElement_Remove removes every "h1" element, checking IsRemoved before and after the
// call, and compares the rewritten output.
//
// NOTE(review): the Write input and wantedText literals span line breaks in this copy
// (markup apparently stripped) — restore them from the upstream test.
func TestElement_Remove(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			ElementContentHandler: []lolhtml.ElementContentHandler{
				{
					Selector: "h1",
					ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
						// Must not be flagged as removed before Remove is called.
						if e.IsRemoved() {
							t.FailNow()
						}
						e.Remove()
						if !e.IsRemoved() {
							t.FailNow()
						}
						return lolhtml.Continue
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	if _, err = w.Write([]byte("42
Hello Hello2 ")); err != nil {
		t.Error(err)
	}
	if err = w.Close(); err != nil {
		t.Error(err)
	}
	wantedText := "42
Hello2 "
	if finalText := buf.String(); finalText != wantedText {
		t.Errorf("want %s got %s \n", wantedText, finalText)
	}
}
273 |
// TestElement_RemoveElementAndKeepContent calls RemoveAndKeepContent on every "h2" element,
// checking IsRemoved before and after the call, and compares the rewritten output.
//
// NOTE(review): the Write input and wantedText literals span line breaks in this copy
// (markup apparently stripped) — restore them from the upstream test.
func TestElement_RemoveElementAndKeepContent(t *testing.T) {
	var buf bytes.Buffer
	w, err := lolhtml.NewWriter(
		&buf,
		&lolhtml.Handlers{
			ElementContentHandler: []lolhtml.ElementContentHandler{
				{
					Selector: "h2",
					ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
						// Must not be flagged as removed before RemoveAndKeepContent is called.
						if e.IsRemoved() {
							t.FailNow()
						}
						e.RemoveAndKeepContent()
						if !e.IsRemoved() {
							t.FailNow()
						}
						return lolhtml.Continue
					},
				},
			},
		},
	)
	if err != nil {
		t.Error(err)
	}

	if _, err = w.Write([]byte("42Hello1
Hello Hello2 ")); err != nil {
		t.Error(err)
	}
	if err = w.Close(); err != nil {
		t.Error(err)
	}
	wantedText := "42Hello1
Hello Hello2"
	if finalText := buf.String(); finalText != wantedText {
		t.Errorf("want %s got %s \n", wantedText, finalText)
	}
}
311 |
312 | func TestElement_GetEmptyElementAttribute(t *testing.T) { // checks that attribute "foo" on a "span" is reported as present with an empty value
313 | var buf bytes.Buffer
314 | w, err := lolhtml.NewWriter(
315 | &buf,
316 | &lolhtml.Handlers{
317 | ElementContentHandler: []lolhtml.ElementContentHandler{
318 | {
319 | Selector: "span",
320 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
321 | has, err := e.HasAttribute("foo")
322 | if err != nil {
323 | t.Error(err)
324 | }
325 | if !has { // the attribute must exist even though its value is empty
326 | t.FailNow()
327 | }
328 | value, err := e.AttributeValue("foo")
329 | if err != nil {
330 | t.Error(err)
331 | }
332 | if value != "" { // an existing attribute without a value must read back as ""
333 | t.Errorf("got %s; want empty", value)
334 | }
335 | return lolhtml.Continue
336 | },
337 | },
338 | },
339 | },
340 | )
341 | if err != nil {
342 | t.Error(err)
343 | }
344 |
345 | if _, err = w.Write([]byte("")); err != nil {
346 | t.Error(err)
347 | }
348 | if err = w.Close(); err != nil {
349 | t.Error(err)
350 | }
351 | wantedText := ""
352 | if finalText := buf.String(); finalText != wantedText { // the handler changes nothing, so output is compared against wantedText as is
353 | t.Errorf("want %s got %s \n", wantedText, finalText)
354 | }
355 | }
356 |
357 | func TestElement_IterateAttributes(t *testing.T) { // walks an element's attributes in order: foo=42, bar=1337, then end of iteration
358 | w, err := lolhtml.NewWriter(
359 | nil,
360 | &lolhtml.Handlers{
361 | ElementContentHandler: []lolhtml.ElementContentHandler{
362 | {
363 | Selector: "*",
364 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
365 | ai := e.AttributeIterator()
366 |
367 | a := ai.Next() // first attribute: foo="42"
368 | if name := a.Name(); name != "foo" {
369 | t.Errorf("got %s; want foo", name)
370 | }
371 | if value := a.Value(); value != "42" {
372 | t.Errorf("got %s; want 42", value) // message now names the value actually expected
373 | }
374 |
375 | a = ai.Next() // second attribute: bar="1337"
376 | if name := a.Name(); name != "bar" {
377 | t.Errorf("got %s; want bar", name)
378 | }
379 | if value := a.Value(); value != "1337" {
380 | t.Errorf("got %s; want 1337", value)
381 | }
382 |
383 | a = ai.Next() // an exhausted iterator must yield nil
384 | if a != nil {
385 | t.FailNow()
386 | }
387 |
388 | return lolhtml.Continue
389 | },
390 | },
391 | },
392 | },
393 | )
394 | if err != nil {
395 | t.Error(err)
396 | }
397 |
398 | if _, err = w.Write([]byte("")); err != nil {
399 | t.Error(err)
400 | }
401 | if err = w.Close(); err != nil {
402 | t.Error(err)
403 | }
404 | }
405 |
406 | func TestElement_AssertNsIsHtml(t *testing.T) { // expects the matched "script" element to report the XHTML namespace URI
407 | w, err := lolhtml.NewWriter(
408 | nil,
409 | &lolhtml.Handlers{
410 | ElementContentHandler: []lolhtml.ElementContentHandler{
411 | {
412 | Selector: "script",
413 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
414 | wantedText := "http://www.w3.org/1999/xhtml"
415 | if ns := e.NamespaceURI(); ns != wantedText { // any other namespace is a failure
416 | t.Errorf("got %s; want %s", ns, wantedText)
417 | }
418 | return lolhtml.Continue
419 | },
420 | },
421 | },
422 | },
423 | )
424 | if err != nil {
425 | t.Error(err)
426 | }
427 |
428 | if _, err = w.Write([]byte("")); err != nil {
429 | t.Error(err)
430 | }
431 | if err = w.Close(); err != nil {
432 | t.Error(err)
433 | }
434 | }
435 |
436 | func TestElement_AssertNsIsSvg(t *testing.T) { // expects the matched "script" element to report the SVG namespace URI
437 | w, err := lolhtml.NewWriter(
438 | nil,
439 | &lolhtml.Handlers{
440 | ElementContentHandler: []lolhtml.ElementContentHandler{
441 | {
442 | Selector: "script",
443 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
444 | wantedText := "http://www.w3.org/2000/svg"
445 | if ns := e.NamespaceURI(); ns != wantedText { // any other namespace is a failure
446 | t.Errorf("got %s; want %s", ns, wantedText)
447 | }
448 | return lolhtml.Continue
449 | },
450 | },
451 | },
452 | },
453 | )
454 | if err != nil {
455 | t.Error(err)
456 | }
457 |
458 | if _, err = w.Write([]byte("
")); err != nil {
459 | t.Error(err)
460 | }
461 | if err = w.Close(); err != nil {
462 | t.Error(err)
463 | }
464 | }
465 |
466 | func TestElement_StopRewriting(t *testing.T) { // a handler returning lolhtml.Stop must make both Write and Close fail
467 | w, err := lolhtml.NewWriter(
468 | nil,
469 | &lolhtml.Handlers{
470 | ElementContentHandler: []lolhtml.ElementContentHandler{
471 | {
472 | Selector: "span",
473 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
474 | return lolhtml.Stop
475 | },
476 | },
477 | },
478 | },
479 | )
480 | if err != nil {
481 | t.Error(err)
482 | }
483 |
484 | _, err = w.Write([]byte("
"))
485 | if err == nil { // the write must be rejected
486 | t.FailNow()
487 | }
488 | if err.Error() != "The rewriter has been stopped." {
489 | t.Error(err)
490 | }
491 | err = w.Close()
492 | if err == nil { // closing afterwards must report the same condition
493 | t.FailNow()
494 | }
495 | if err.Error() != "The rewriter has been stopped." {
496 | t.Error(err)
497 | }
498 | }
499 |
--------------------------------------------------------------------------------
/error.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include "lol_html.h"
5 | */
6 | import "C"
7 | import "errors"
8 |
9 | // ErrCannotGetErrorMessage indicates that an error was signalled by lol_html, but the
10 | // concrete error message could not be retrieved.
11 | var ErrCannotGetErrorMessage = errors.New("cannot get error message from underlying lol_html lib")
12 |
13 | // getError takes the last error recorded by lol_html and turns its message into a Go error.
14 | // Call it only after a lol_html function has reported a failure: when no message can be
15 | // obtained (NULL or empty), ErrCannotGetErrorMessage is returned instead.
16 | func getError() error {
17 | lastErr := (*str)(C.lol_html_take_last_error())
18 | defer lastErr.Free()
19 | if msg := lastErr.String(); len(msg) > 0 {
20 | return errors.New(msg)
21 | }
22 | return ErrCannotGetErrorMessage
23 | }
24 |
--------------------------------------------------------------------------------
/error_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml_test
2 |
3 | import (
4 | "errors"
5 | "testing"
6 |
7 | "github.com/coolspring8/go-lolhtml"
8 | )
9 |
10 | // TestNullErrorStr covers the handling of a null lol_html_str_t: it calls GetError while no error
11 | // is pending and expects ErrCannotGetErrorMessage in return.
12 | func TestNullErrorStr(t *testing.T) {
13 | err := lolhtml.GetError()
14 | if !errors.Is(err, lolhtml.ErrCannotGetErrorMessage) {
15 | t.Error(err)
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
1 | // This file is for demonstration in godoc. For more examples, see the /examples directory.
2 | package lolhtml_test
3 |
4 | import (
5 | "bytes"
6 | "fmt"
7 | "io"
8 | "log"
9 | "os"
10 | "strings"
11 |
12 | "github.com/coolspring8/go-lolhtml"
13 | )
14 |
15 | func ExampleNewWriter() { // streams a byte chunk through a Writer that rewrites "span" elements and prints to stdout
16 | chunk := []byte("Hello, World !")
17 | r := bytes.NewReader(chunk)
18 | w, err := lolhtml.NewWriter(
19 | // rewritten output goes to stdout
20 | os.Stdout,
21 | &lolhtml.Handlers{
22 | ElementContentHandler: []lolhtml.ElementContentHandler{
23 | {
24 | Selector: "span",
25 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
26 | err := e.SetInnerContentAsText("LOL-HTML")
27 | if err != nil {
28 | log.Fatal(err)
29 | }
30 | return lolhtml.Continue
31 | },
32 | },
33 | },
34 | },
35 | )
36 | if err != nil {
37 | log.Fatal(err)
38 | }
39 |
40 | // feed the input to the lolhtml writer by copying from the bytes reader
41 | _, err = io.Copy(w, r)
42 | if err != nil {
43 | log.Fatal(err)
44 | }
45 |
46 | // close the writer explicitly so that the remaining content is flushed
47 | err = w.Close()
48 | if err != nil {
49 | log.Fatal(err)
50 | }
51 | // Output: Hello, LOL-HTML !
52 | }
53 |
54 | func ExampleRewriteString() { // rewrites the href of every "a[href]" element, replacing "http:" with "https:"
55 | output, err := lolhtml.RewriteString(
56 | ``,
57 | &lolhtml.Handlers{
58 | ElementContentHandler: []lolhtml.ElementContentHandler{
59 | {
60 | Selector: "a[href]",
61 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
62 | href, err := e.AttributeValue("href")
63 | if err != nil {
64 | log.Fatal(err)
65 | }
66 | href = strings.ReplaceAll(href, "http:", "https:")
67 |
68 | err = e.SetAttribute("href", href) // write the upgraded URL back to the same attribute
69 | if err != nil {
70 | log.Fatal(err)
71 | }
72 |
73 | return lolhtml.Continue
74 | },
75 | },
76 | },
77 | },
78 | )
79 | if err != nil {
80 | log.Fatal(err)
81 | }
82 |
83 | fmt.Println(output)
84 | // Output:
85 | }
86 |
--------------------------------------------------------------------------------
/examples/defer-scripts/main.go:
--------------------------------------------------------------------------------
1 | // Usage: curl -NL https://git.io/JeOSZ | go run main.go
2 | package main
3 |
4 | import (
5 | "io"
6 | "log"
7 | "os"
8 |
9 | "github.com/coolspring8/go-lolhtml"
10 | )
11 |
12 | func main() { // pipes stdin to stdout, adding an empty "defer" attribute to every script matched by the selector below
13 | w, err := lolhtml.NewWriter(
14 | os.Stdout,
15 | &lolhtml.Handlers{
16 | ElementContentHandler: []lolhtml.ElementContentHandler{
17 | {
18 | Selector: "script[src]:not([async]):not([defer])",
19 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
20 | err := e.SetAttribute("defer", "")
21 | if err != nil {
22 | log.Fatal(err)
23 | }
24 | return lolhtml.Continue
25 | },
26 | },
27 | },
28 | },
29 | )
30 | if err != nil {
31 | log.Fatal(err)
32 | }
33 |
34 | _, err = io.Copy(w, os.Stdin) // stream the whole input through the rewriter
35 | if err != nil {
36 | log.Fatal(err)
37 | }
38 |
39 | err = w.Close() // Close is the last step; any error it reports is fatal as well
40 | if err != nil {
41 | log.Fatal(err)
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/examples/mixed-content-rewriter/main.go:
--------------------------------------------------------------------------------
1 | // Usage: curl -NL https://git.io/JeOSZ | go run main.go
2 | package main
3 |
4 | import (
5 | "io"
6 | "log"
7 | "os"
8 | "strings"
9 |
10 | "github.com/coolspring8/go-lolhtml"
11 | )
12 |
13 | func main() { // pipes stdin to stdout, rewriting "http://" to "https://" in href and src attributes of the selected elements
14 | w, err := lolhtml.NewWriter(
15 | os.Stdout,
16 | &lolhtml.Handlers{
17 | ElementContentHandler: []lolhtml.ElementContentHandler{
18 | {
19 | Selector: "a[href], link[rel=stylesheet][href]",
20 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
21 | rewriteUrlInAttribute(e, "href")
22 | return lolhtml.Continue
23 | },
24 | },
25 | {
26 | Selector: "script[src], iframe[src], img[src], audio[src], video[src]",
27 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
28 | rewriteUrlInAttribute(e, "src")
29 | return lolhtml.Continue
30 | },
31 | },
32 | },
33 | },
34 | )
35 | if err != nil {
36 | log.Fatal(err)
37 | }
38 |
39 | _, err = io.Copy(w, os.Stdin) // stream the whole input through the rewriter
40 | if err != nil {
41 | log.Fatal(err)
42 | }
43 |
44 | err = w.Close() // Close is the last step; any error it reports is fatal as well
45 | if err != nil {
46 | log.Fatal(err)
47 | }
48 | }
49 |
50 | func rewriteUrlInAttribute(e *lolhtml.Element, attributeName string) {
51 | // read the current value; a lookup failure aborts the program
52 | value, err := e.AttributeValue(attributeName)
53 | if err != nil {
54 | log.Fatal(err)
55 | }
56 | // upgrade plain-HTTP URLs and store the result in the same attribute
57 | secured := strings.ReplaceAll(value, "http://", "https://")
58 | if err := e.SetAttribute(attributeName, secured); err != nil {
59 | log.Fatal(err)
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/examples/web-scraper/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Web Scraper · By Adam Schwartz · Powered by Cloudflare Workers®
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
238 |
239 |
240 |
241 |
242 |
246 |
247 |
291 |
292 |
293 |
294 |
Update preview Scrape
295 |
Permalink
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
About
307 |
308 |
Web Scraper makes it effortless to scrape websites. Just provide a URL and CSS selector and it will return JSON containing the text contents of the matching elements.
309 |
Built by Adam Schwartz using Cloudflare Workers , open-source and available on Github .
310 |
311 |
312 |
313 | Done
314 | View example
315 |
316 |
317 |
322 |
323 |
324 |
325 |
458 |
459 |
--------------------------------------------------------------------------------
/examples/web-scraper/main.go:
--------------------------------------------------------------------------------
1 | // This is a ported Go version of https://web.scraper.workers.dev/, whose source code is
2 | // available at https://github.com/adamschwartz/web.scraper.workers.dev licensed under MIT.
3 | //
4 | // This translation is for demonstration purpose only, so many parts of the code are suboptimal.
5 | //
6 | // Sometimes you may get a "different" result, as Go's encoding/json package always sorts the
7 | // keys of a map (when using multiple selectors), and encodes a nil slice as the null JSON value.
8 | package main
9 |
10 | import (
11 | "encoding/json"
12 | "fmt"
13 | "io"
14 | "log"
15 | "net/http"
16 | "regexp"
17 | "strings"
18 |
19 | "github.com/coolspring8/go-lolhtml"
20 | )
21 |
22 | var (
23 | debug = true
24 | listenAddress = ":80"
25 | mainPageFileName = "index.html"
26 | )
27 |
28 | var (
29 | urlHasPrefix = regexp.MustCompile(`^[a-zA-Z]+://`)
30 | unifyWhitespace = regexp.MustCompile(`\s{2,}`)
31 | )
32 |
33 | // used to separate texts in different elements.
34 | var textSeparator = "TEXT_SEPARATOR_TEXT_SEPARATOR"
35 |
36 | func main() { // registers handler for "/" on the default mux and serves HTTP on listenAddress
37 | log.Printf("Server started at %s", listenAddress)
38 | http.HandleFunc("/", handler)
39 | log.Fatal(http.ListenAndServe(listenAddress, nil)) // ListenAndServe only returns on failure, which is fatal here
40 | }
41 |
42 | func handler(w http.ResponseWriter, req *http.Request) { // serves the home page, or scrapes ?url= with ?selector= (optionally ?attr=) and replies with JSON
43 | log.Println(req.URL)
44 |
45 | // only the root path is served; everything else is a 404
46 | if req.URL.Path != "/" {
47 | w.WriteHeader(http.StatusNotFound)
48 | _, _ = w.Write([]byte("Not found"))
49 | return
50 | }
51 |
52 | q := req.URL.Query()
53 |
54 | url := q.Get("url") // a URL given without a scheme is assumed to be http://
55 | if url != "" && !urlHasPrefix.MatchString(url) {
56 | url = "http://" + url
57 | }
58 |
59 | selector := q.Get("selector")
60 |
61 | attr := q.Get("attr")
62 |
63 | var spaced bool
64 | _spaced := q.Get("spaced")
65 | if _spaced != "" {
66 | spaced = true
67 | } else {
68 | spaced = false
69 | }
70 |
71 | var pretty bool
72 | _pretty := q.Get("pretty")
73 | if _pretty != "" {
74 | pretty = true
75 | } else {
76 | pretty = false
77 | }
78 |
79 | // no url and no selector: serve the home page
80 | if url == "" && selector == "" {
81 | http.ServeFile(w, req, mainPageFileName)
82 | return
83 | }
84 |
85 | // part 1/2: build the handlers, collecting either text (attr == "") or one attribute value
86 | handlers := lolhtml.Handlers{}
87 | // matches and selectors are used by the text scraper
88 | matches := make(map[string][]string)
89 | var selectors []string
90 | _selectors := strings.Split(selector, ",")
91 | for _, s := range _selectors {
92 | selectors = append(selectors, strings.TrimSpace(s))
93 | }
94 | // attrValue is used by the attribute scraper
95 | var attrValue string
96 | if attr == "" {
97 | nextText := make(map[string]string)
98 |
99 | for _, s := range selectors {
100 | s := s
101 | handlers.ElementContentHandler = append(
102 | handlers.ElementContentHandler,
103 | lolhtml.ElementContentHandler{
104 | Selector: s,
105 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
106 | matches[s] = append(matches[s], textSeparator)
107 | nextText[s] = ""
108 | return lolhtml.Continue
109 | },
110 | TextChunkHandler: func(t *lolhtml.TextChunk) lolhtml.RewriterDirective {
111 | nextText[s] += t.Content()
112 | if t.IsLastInTextNode() {
113 | if spaced {
114 | nextText[s] += " "
115 | }
116 | matches[s] = append(matches[s], nextText[s])
117 | nextText[s] = ""
118 | }
119 | return lolhtml.Continue
120 | },
121 | },
122 | )
123 | }
124 | } else {
125 | handlers = lolhtml.Handlers{
126 | ElementContentHandler: []lolhtml.ElementContentHandler{
127 | {
128 | Selector: selector,
129 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective {
130 | attrValue, _ = e.AttributeValue(attr)
131 | return lolhtml.Stop
132 | },
133 | },
134 | },
135 | }
136 | }
137 |
138 | lolWriter, err := lolhtml.NewWriter(
139 | nil,
140 | &handlers,
141 | )
142 | if err != nil {
143 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
144 | return
145 | }
146 |
147 | // fetch target page content
148 | resp, err := http.Get(url)
149 | if err != nil {
150 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
151 | return
152 | }
153 | defer resp.Body.Close() // registered before the status check so the non-200 return below also closes the body
154 | if resp.StatusCode != http.StatusOK {
155 | sendError(w, http.StatusBadGateway, fmt.Sprintf("Status %d requesting %s", resp.StatusCode, url), pretty)
156 | return
157 | }
158 |
159 | // a handler returning lolhtml.Stop makes the copy fail with the message below; that is expected, not an error
160 | _, err = io.Copy(lolWriter, resp.Body)
161 | if err != nil && err.Error() != "The rewriter has been stopped." {
162 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
163 | return
164 | }
165 | if err == nil || err.Error() != "The rewriter has been stopped." {
166 | err = lolWriter.Close()
167 | if err != nil {
168 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
169 | return
170 | }
171 | }
172 |
173 | // part 2/2: for the text scraper, merge the collected pieces into one cleaned string per matched element
174 | if attr == "" {
175 | for _, s := range selectors {
176 | var nodeCompleteTexts []string
177 | nextText := ""
178 |
179 | for _, text := range matches[s] {
180 | if text == textSeparator {
181 | if strings.TrimSpace(nextText) != "" {
182 | nodeCompleteTexts = append(nodeCompleteTexts, cleanText(nextText))
183 | nextText = ""
184 | }
185 | } else {
186 | nextText += text
187 | }
188 | }
189 |
190 | lastText := cleanText(nextText)
191 | if lastText != "" {
192 | nodeCompleteTexts = append(nodeCompleteTexts, lastText)
193 | }
194 | matches[s] = nodeCompleteTexts
195 | }
196 | }
197 |
198 | w.WriteHeader(http.StatusOK)
199 |
200 | enc := json.NewEncoder(w)
201 | enc.SetEscapeHTML(false)
202 | if pretty {
203 | enc.SetIndent("", " ")
204 | }
205 |
206 | if attr == "" {
207 | err = enc.Encode(Response{Result: matches})
208 | } else {
209 | err = enc.Encode(Response{Result: attrValue})
210 | }
211 | if err != nil {
212 | sendError(w, http.StatusInternalServerError, err.Error(), pretty)
213 | return
214 | }
215 | }
216 |
217 | type Response struct { // Response is the JSON payload written by handler and sendError
218 | Result interface{} `json:"result,omitempty"` // scraped texts keyed by selector, or a single attribute value
219 | Error string `json:"error,omitempty"` // error text; only set by sendError
220 | }
221 |
222 | func sendError(w http.ResponseWriter, statusCode int, errorText string, pretty bool) {
223 | // redact concrete error message if debug != true
224 | if statusCode == http.StatusInternalServerError && !debug {
225 | errorText = "Internal server error"
226 | }
227 |
228 | w.WriteHeader(statusCode)
229 | encoder := json.NewEncoder(w)
230 | encoder.SetEscapeHTML(false)
231 | if pretty {
232 | encoder.SetIndent("", " ")
233 | }
234 |
235 | // fall back to the plain text when the JSON payload cannot be written
236 | if err := encoder.Encode(Response{Error: errorText}); err != nil {
237 | _, _ = w.Write([]byte(errorText))
238 | }
239 | }
241 |
242 | func cleanText(s string) string { // trims s, then replaces every match of unifyWhitespace with a single space
243 | return unifyWhitespace.ReplaceAllString(strings.TrimSpace(s), " ")
244 | }
245 |
--------------------------------------------------------------------------------
/export_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | // just export some internal functions for tests
4 |
5 | var GetError = getError
6 | var NewSelector = newSelector
7 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/coolspring8/go-lolhtml
2 |
3 | go 1.15
4 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoolSpring8/go-lolhtml/2cb4478586ff392fe240b42831045f1ac74232c1/go.sum
--------------------------------------------------------------------------------
/lolhtml.go:
--------------------------------------------------------------------------------
1 | // Package lolhtml provides the ability to parse and rewrite HTML on the fly,
2 | // with a CSS-selector based API.
3 | //
4 | // It is a binding for the Rust crate lol_html.
5 | // https://github.com/cloudflare/lol-html
6 | //
7 | // Please see /examples subdirectory for more detailed examples.
8 | package lolhtml
9 |
10 | /*
11 | #cgo CFLAGS:-I${SRCDIR}/build/include
12 | #cgo LDFLAGS:-llolhtml
13 | #cgo !windows LDFLAGS:-lm
14 | #cgo linux,amd64 LDFLAGS:-L${SRCDIR}/build/linux-x86_64
15 | #cgo darwin,amd64 LDFLAGS:-L${SRCDIR}/build/macos-x86_64
16 | #cgo windows,amd64 LDFLAGS:-L${SRCDIR}/build/windows-x86_64
17 | #include
18 | #include "lol_html.h"
19 | */
20 | import "C"
21 |
--------------------------------------------------------------------------------
/pointer.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | // Credit to https://github.com/mattn/go-pointer.
4 |
5 | // #include <stdlib.h>
6 | import "C"
7 | import (
8 | "sync"
9 | "unsafe"
10 | )
11 |
12 | // sync.Map documentation states that it is optimized for "when the entry for a given key is only
13 | // ever written once but read many times, as in caches that only grow". My benchmarks show that sync.Map
14 | // version rewriter is slower in single-goroutine calls, but faster when used in multiple goroutines
15 | // (and personally I think the latter is more important).
16 | var store sync.Map
17 |
18 | func savePointer(v interface{}) unsafe.Pointer { // stores v in the package-level store and returns the C pointer used as its key
19 | if v == nil {
20 | return nil
21 | }
22 |
23 | ptr := C.malloc(C.size_t(1)) // one byte of C memory; its address serves as the key for v
24 | if ptr == nil {
25 | panic(`can't allocate "cgo-pointer hack index pointer": ptr == nil`)
26 | }
27 |
28 | store.Store(ptr, v)
29 |
30 | return ptr
31 | }
32 |
33 | func restorePointer(ptr unsafe.Pointer) (v interface{}) {
34 | // a nil key never refers to a stored value
35 | if ptr == nil {
36 | return nil
37 | }
38 |
39 | // Load yields a nil value for keys that are not in the store
40 | saved, _ := store.Load(ptr)
41 | return saved
42 | }
43 |
44 | func unrefPointer(ptr unsafe.Pointer) { // removes ptr's entry from the store and frees the C memory behind ptr
45 | if ptr == nil {
46 | return
47 | }
48 |
49 | store.Delete(ptr)
50 |
51 | C.free(ptr) // ptr must not be used as a key after this point
52 | }
53 |
54 | func unrefPointers(ptrs []unsafe.Pointer) { // calls unrefPointer on every element of ptrs
55 | for _, ptr := range ptrs {
56 | unrefPointer(ptr)
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/rewriter.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include <stdlib.h>
5 | #include "lol_html.h"
6 | */
7 | import "C"
8 | import (
9 | "unsafe"
10 | )
11 |
12 | // rewriter wraps a lol_html rewriter handle together with the saved pointers released by Free.
13 | // rewriterBuilder, rewriter and selector are kept private to keep the public API small.
14 | // If exporting them would be useful to you, please let me know.
15 | type rewriter struct {
16 | rewriter *C.lol_html_rewriter_t // underlying C rewriter handle
17 | pointers []unsafe.Pointer // passed to unrefPointers by Free
18 | // TODO: unrecoverable bool
19 | }
20 |
21 | func (r *rewriter) Write(p []byte) (n int, err error) {
22 | size := len(p)
23 | if size == 0 {
24 | // avoid 0-sized array: &p[0] below needs at least one element
25 | p = []byte("\x00")
26 | }
27 | // hand the chunk to lol_html; size is still the caller's length (0 for an empty chunk)
28 | status := C.lol_html_rewriter_write(r.rewriter, (*C.char)(unsafe.Pointer(&p[0])), C.size_t(size))
29 | if status != 0 {
30 | return 0, getError()
31 | }
32 | return size, nil
33 | }
34 |
35 | func (r *rewriter) WriteString(chunk string) (n int, err error) {
36 | size := len(chunk)
37 | // C.CString copies the chunk into C memory, which has to be freed by hand
38 | data := C.CString(chunk)
39 | defer C.free(unsafe.Pointer(data))
40 | if C.lol_html_rewriter_write(r.rewriter, data, C.size_t(size)) != 0 {
41 | return 0, getError()
42 | }
43 | return size, nil
44 | }
45 |
46 | func (r *rewriter) End() error {
47 | // a non-zero status is turned into a Go error via getError
48 | if C.lol_html_rewriter_end(r.rewriter) != 0 {
49 | return getError()
50 | }
51 | return nil
52 | }
53 |
54 | func (r *rewriter) Free() {
55 | if r == nil {
56 | return
57 | }
58 | C.lol_html_rewriter_free(r.rewriter)
59 | unrefPointers(r.pointers)
60 | }
--------------------------------------------------------------------------------
/rewriter_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/coolspring8/go-lolhtml"
7 | )
8 |
9 | func TestRewriter_NonAsciiEncoding(t *testing.T) { // NewWriter must reject the "UTF-16" encoding and return no writer
10 | w, err := lolhtml.NewWriter(
11 | nil,
12 | nil,
13 | lolhtml.Config{
14 | Encoding: "UTF-16",
15 | Memory: &lolhtml.MemorySettings{
16 | PreallocatedParsingBufferSize: 0,
17 | MaxAllowedMemoryUsage: 16,
18 | },
19 | Strict: true,
20 | })
21 | if w != nil || err == nil { // both a nil writer and a non-nil error are required
22 | t.FailNow()
23 | }
24 | if err.Error() != "Expected ASCII-compatible encoding." {
25 | t.Error(err)
26 | }
27 | err = w.Close() // w is nil at this point; Close on it is still expected to succeed
28 | if err != nil {
29 | t.Error(err)
30 | }
31 | }
32 |
33 | func TestRewriter_MemoryLimiting(t *testing.T) {
34 | w, err := lolhtml.NewWriter(
35 | nil,
36 | &lolhtml.Handlers{
37 | ElementContentHandler: []lolhtml.ElementContentHandler{
38 | {
39 | "span",
40 | nil,
41 | nil,
42 | nil,
43 | },
44 | },
45 | },
46 | lolhtml.Config{
47 | Encoding: "utf-8",
48 | Memory: &lolhtml.MemorySettings{
49 | PreallocatedParsingBufferSize: 0,
50 | MaxAllowedMemoryUsage: 5,
51 | },
52 | Strict: true,
53 | },
54 | )
55 | if err != nil {
56 | t.Error(err)
57 | }
58 | _, err = w.Write([]byte("len from size_t (uint) to int (int32) on 32-bit machines?
25 | func (s *str) String() string { // copies the data/len pair into a Go string; a nil receiver yields ""
26 | if s == nil {
27 | return ""
28 | }
29 | return C.GoStringN(s.data, C.int(s.len)) // NOTE(review): s.len is narrowed to C.int here — confirm very long strings cannot occur
30 | }
31 |
32 | func (s *textChunkContent) String() string { // copies the data/len pair into a Go string; a nil receiver yields ""
33 | //var nullTextChunkContent textChunkContent
34 | //if s == nullTextChunkContent {
35 | // return ""
36 | //}
37 | if s == nil {
38 | return ""
39 | }
40 | return C.GoStringN(s.data, C.int(s.len)) // NOTE(review): s.len is narrowed to C.int here — confirm very long chunks cannot occur
41 | }
42 |
--------------------------------------------------------------------------------
/textchunk.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include <stdlib.h>
5 | #include "lol_html.h"
6 | */
7 | import "C"
8 | import "unsafe"
9 |
10 | // TextChunk represents a text chunk.
11 | type TextChunk C.lol_html_text_chunk_t
12 |
13 | // TextChunkHandlerFunc is a callback handler function to do something with a TextChunk.
14 | type TextChunkHandlerFunc func(*TextChunk) RewriterDirective
15 |
16 | // Content returns the text chunk's content as a Go string.
17 | func (t *TextChunk) Content() string {
18 | text := (textChunkContent)(C.lol_html_text_chunk_content_get((*C.lol_html_text_chunk_t)(t)))
19 | return text.String() // String has a pointer receiver, hence the addressable local above
20 | }
21 |
22 | // IsLastInTextNode returns whether the text chunk is the last in the text node.
23 | func (t *TextChunk) IsLastInTextNode() bool {
24 | return (bool)(C.lol_html_text_chunk_is_last_in_text_node((*C.lol_html_text_chunk_t)(t)))
25 | }
26 |
27 | type textChunkAlter int
28 |
29 | const (
30 | textChunkInsertBefore textChunkAlter = iota
31 | textChunkInsertAfter
32 | textChunkReplace
33 | )
34 |
35 | func (t *TextChunk) alter(content string, alter textChunkAlter, isHTML bool) error {
36 | chunkC := (*C.lol_html_text_chunk_t)(t)
37 | contentC, contentLen := C.CString(content), C.size_t(len(content))
38 | defer C.free(unsafe.Pointer(contentC))
39 | var status C.int
40 | switch alter {
41 | case textChunkInsertBefore:
42 | status = C.lol_html_text_chunk_before(chunkC, contentC, contentLen, C.bool(isHTML))
43 | case textChunkInsertAfter:
44 | status = C.lol_html_text_chunk_after(chunkC, contentC, contentLen, C.bool(isHTML))
45 | case textChunkReplace:
46 | status = C.lol_html_text_chunk_replace(chunkC, contentC, contentLen, C.bool(isHTML))
47 | default:
48 | panic("not implemented")
49 | }
50 | if status != 0 {
51 | return getError()
52 | }
53 | return nil
54 | }
55 |
56 | // InsertBeforeAsText inserts the given content before the text chunk.
57 | //
58 | // The rewriter will HTML-escape the content before insertion:
59 | //
60 | // `<` will be replaced with `&lt;`
61 | //
62 | // `>` will be replaced with `&gt;`
63 | //
64 | // `&` will be replaced with `&amp;`
65 | func (t *TextChunk) InsertBeforeAsText(content string) error {
66 | return t.alter(content, textChunkInsertBefore, false)
67 | }
68 |
69 | // InsertBeforeAsHTML inserts the given content before the text chunk.
70 | // The content is inserted as is.
71 | func (t *TextChunk) InsertBeforeAsHTML(content string) error {
72 | return t.alter(content, textChunkInsertBefore, true)
73 | }
74 |
75 | // InsertAfterAsText inserts the given content after the text chunk.
76 | //
77 | // The rewriter will HTML-escape the content before insertion:
78 | //
79 | // `<` will be replaced with `&lt;`
80 | //
81 | // `>` will be replaced with `&gt;`
82 | //
83 | // `&` will be replaced with `&amp;`
84 | func (t *TextChunk) InsertAfterAsText(content string) error {
85 | return t.alter(content, textChunkInsertAfter, false)
86 | }
87 |
88 | // InsertAfterAsHTML inserts the given content after the text chunk.
89 | // The content is inserted as is.
90 | func (t *TextChunk) InsertAfterAsHTML(content string) error {
91 | return t.alter(content, textChunkInsertAfter, true)
92 | }
93 |
94 | // ReplaceAsText replaces the text chunk with the supplied content.
95 | //
96 | // The rewriter will HTML-escape the content:
97 | //
98 | // `<` will be replaced with `&lt;`
99 | //
100 | // `>` will be replaced with `&gt;`
101 | //
102 | // `&` will be replaced with `&amp;`
103 | func (t *TextChunk) ReplaceAsText(content string) error {
104 | return t.alter(content, textChunkReplace, false)
105 | }
106 |
107 | // ReplaceAsHTML replaces the text chunk with the supplied content.
108 | // The content is kept as is.
109 | func (t *TextChunk) ReplaceAsHTML(content string) error {
110 | return t.alter(content, textChunkReplace, true)
111 | }
112 |
113 | // Remove removes the text chunk.
114 | func (t *TextChunk) Remove() {
115 | C.lol_html_text_chunk_remove((*C.lol_html_text_chunk_t)(t))
116 | }
117 |
118 | // IsRemoved returns whether the text chunk is removed or not.
119 | func (t *TextChunk) IsRemoved() bool {
120 | return (bool)(C.lol_html_text_chunk_is_removed((*C.lol_html_text_chunk_t)(t)))
121 | }
122 |
--------------------------------------------------------------------------------
/textchunk_test.go:
--------------------------------------------------------------------------------
1 | package lolhtml_test
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/coolspring8/go-lolhtml"
8 | )
9 |
10 | func TestTextChunk_InsertBeforeAndAfter(t *testing.T) { // inserts content before (as HTML) and after (as text) the non-empty text chunk
11 | var buf bytes.Buffer
12 | w, err := lolhtml.NewWriter(
13 | &buf,
14 | &lolhtml.Handlers{
15 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
16 | {
17 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective {
18 | content := tc.Content()
19 | if len(content) > 0 { // the chunk that carries the actual text
20 | if content != "Hey 42" {
21 | t.Errorf("got %s, want Hey 42", content)
22 | }
23 | if tc.IsLastInTextNode() {
24 | t.Error("text chunk last in text node flag incorrect, expected false, got true")
25 | }
26 | if tc.IsRemoved() {
27 | t.Error("text chunk removed flag incorrect, expected false, got true")
28 | }
29 | if err := tc.InsertBeforeAsHTML(""); err != nil {
30 | t.Error(err)
31 | }
32 | if err := tc.InsertAfterAsText("
"); err != nil {
33 | t.Error(err)
34 | }
35 | } else { // an empty chunk is expected to be the one that ends the text node
36 | if !tc.IsLastInTextNode() {
37 | t.Error("text chunk last in text node flag incorrect, expected true, got false")
38 | }
39 | }
40 | return lolhtml.Continue
41 | },
42 | },
43 | },
44 | },
45 | )
46 | if err != nil {
47 | t.Error(err)
48 | }
49 |
50 | if _, err := w.Write([]byte("Hey 42")); err != nil {
51 | t.Error(err)
52 | }
53 | if err := w.Close(); err != nil {
54 | t.Error(err)
55 | }
56 | wantedText := "Hey 42</div>"
57 | if finalText := buf.String(); finalText != wantedText { // compare everything the rewriter wrote into buf
58 | t.Errorf("want %s got %s \n", wantedText, finalText)
59 | }
60 | }
61 |
62 | func TestTextChunk_Replace(t *testing.T) { // replaces every non-empty text chunk via ReplaceAsHTML
63 | var buf bytes.Buffer
64 | w, err := lolhtml.NewWriter(
65 | &buf,
66 | &lolhtml.Handlers{
67 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
68 | {
69 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective {
70 | if len(tc.Content()) > 0 {
71 | if err := tc.ReplaceAsHTML("
"); err != nil {
72 | t.Error(err)
73 | }
74 | if !tc.IsRemoved() { // a replaced chunk must be reported as removed
75 | t.FailNow()
76 | }
77 | }
78 | return lolhtml.Continue
79 | },
80 | },
81 | },
82 | },
83 | )
84 | if err != nil {
85 | t.Error(err)
86 | }
87 |
88 | if _, err := w.Write([]byte("Hello
")); err != nil {
89 | t.Error(err)
90 | }
91 | if err := w.Close(); err != nil {
92 | t.Error(err)
93 | }
94 | wantedText := "
"
95 | if finalText := buf.String(); finalText != wantedText { // compare everything the rewriter wrote into buf
96 | t.Errorf("want %s got %s \n", wantedText, finalText)
97 | }
98 | }
99 |
100 | func TestTextChunk_InsertAfter(t *testing.T) { // calls InsertAfterAsHTML on every non-empty text chunk
101 | var buf bytes.Buffer
102 | w, err := lolhtml.NewWriter(
103 | &buf,
104 | &lolhtml.Handlers{
105 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
106 | {
107 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective {
108 | if len(tc.Content()) > 0 { // empty chunks are left untouched
109 | if err := tc.InsertAfterAsHTML(""); err != nil {
110 | t.Error(err)
111 | }
112 | }
113 | return lolhtml.Continue
114 | },
115 | },
116 | },
117 | },
118 | )
119 | if err != nil {
120 | t.Error(err)
121 | }
122 |
123 | if _, err := w.Write([]byte("hello
")); err != nil {
124 | t.Error(err)
125 | }
126 | if err := w.Close(); err != nil {
127 | t.Error(err)
128 | }
129 | wantedText := ""
130 | if finalText := buf.String(); finalText != wantedText { // compare everything the rewriter wrote into buf
131 | t.Errorf("want %s got %s \n", wantedText, finalText)
132 | }
133 | }
134 |
135 | func TestTextChunk_Remove(t *testing.T) {
136 | var buf bytes.Buffer
137 | w, err := lolhtml.NewWriter(
138 | &buf,
139 | &lolhtml.Handlers{
140 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
141 | {
142 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective {
143 | if tc.IsRemoved() {
144 | t.FailNow()
145 | }
146 | tc.Remove()
147 | if !tc.IsRemoved() {
148 | t.FailNow()
149 | }
150 | return lolhtml.Continue
151 | },
152 | },
153 | },
154 | },
155 | )
156 | if err != nil {
157 | t.Error(err)
158 | }
159 |
160 | if _, err := w.Write([]byte("0_0 ")); err != nil {
161 | t.Error(err)
162 | }
163 | if err := w.Close(); err != nil {
164 | t.Error(err)
165 | }
166 | wantedText := " "
167 | if finalText := buf.String(); finalText != wantedText {
168 | t.Errorf("want %s got %s \n", wantedText, finalText)
169 | }
170 | }
171 |
172 | func TestTextChunk_StopRewriting(t *testing.T) {
173 | var buf bytes.Buffer
174 | w, err := lolhtml.NewWriter(
175 | &buf,
176 | &lolhtml.Handlers{
177 | DocumentContentHandler: []lolhtml.DocumentContentHandler{
178 | {
179 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective {
180 | return lolhtml.Stop
181 | },
182 | },
183 | },
184 | },
185 | )
186 | if err != nil {
187 | t.Error(err)
188 | }
189 |
190 | _, err = w.Write([]byte("42"))
191 | if err == nil {
192 | t.FailNow()
193 | }
194 | if err.Error() != "The rewriter has been stopped." {
195 | t.Error(err)
196 | }
197 | }
198 |
// TestTextChunk_StopRewritingWithSelector registers an element content handler
// for the "*" selector whose text chunk handler returns lolhtml.Stop, and
// expects Write to fail with the "The rewriter has been stopped." error.
//
// NOTE(review): the input literal passed to w.Write is split across two lines,
// which is not valid inside a Go interpreted string. It looks like it lost HTML
// markup when this listing was produced; restore it from the upstream test.
199 | func TestTextChunk_StopRewritingWithSelector(t *testing.T) {
200 | var buf bytes.Buffer
201 | w, err := lolhtml.NewWriter(
202 | &buf,
203 | &lolhtml.Handlers{
204 | ElementContentHandler: []lolhtml.ElementContentHandler{
205 | {
206 | Selector: "*",
207 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective {
208 | return lolhtml.Stop
209 | },
210 | },
211 | },
212 | },
213 | )
// NOTE(review): t.Error does not stop the test; NewWriter returns a nil Writer
// together with a non-nil error, so the call on w below would run on nil.
214 | if err != nil {
215 | t.Error(err)
216 | }
217 |
218 | _, err = w.Write([]byte("42
"))
// A nil error means the Stop directive was not honoured; abort before err is dereferenced.
219 | if err == nil {
220 | t.FailNow()
221 | }
222 | if err.Error() != "The rewriter has been stopped." {
223 | t.Error(err)
224 | }
225 | }
226 |
--------------------------------------------------------------------------------
/writer.go:
--------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | import (
4 | "bytes"
5 | "io"
6 | )
7 |
8 | // Writer takes data written to it and writes the rewritten form of that data to an
9 | // underlying writer (see NewWriter).
10 | type Writer struct {
11 | w io.Writer
12 | rewriter *rewriter
13 | err error
14 | closed bool
15 | }
16 |
17 | // NewWriter returns a new Writer with Handlers and an optional Config configured.
18 | // Writes to the returned Writer are rewritten and written to w.
19 | //
20 | // It is the caller's responsibility to call Close on the Writer when done.
21 | // Writes may be buffered and not flushed until Close. There is no Flush method,
22 | // so before using the content written by w, it is necessary to call Close
23 | // to ensure w has finished writing.
24 | func NewWriter(w io.Writer, handlers *Handlers, config ...Config) (*Writer, error) {
25 | var c Config
26 | var sink OutputSink
27 | if config != nil {
28 | c = config[0]
29 | if c.Sink != nil {
30 | sink = c.Sink
31 | } else if w == nil {
32 | sink = func([]byte) {}
33 | } else {
34 | sink = func(p []byte) {
35 | _, _ = w.Write(p)
36 | }
37 | }
38 | } else {
39 | c = newDefaultConfig()
40 | if w == nil {
41 | sink = func([]byte) {}
42 | } else {
43 | sink = func(p []byte) {
44 | _, _ = w.Write(p)
45 | }
46 | }
47 | }
48 |
49 | rb := newRewriterBuilder()
50 | var selectors []*selector
51 | if handlers != nil {
52 | for _, dh := range handlers.DocumentContentHandler {
53 | rb.AddDocumentContentHandlers(
54 | dh.DoctypeHandler,
55 | dh.CommentHandler,
56 | dh.TextChunkHandler,
57 | dh.DocumentEndHandler,
58 | )
59 | }
60 | for _, eh := range handlers.ElementContentHandler {
61 | s, err := newSelector(eh.Selector)
62 | if err != nil {
63 | return nil, err
64 | }
65 | selectors = append(selectors, s)
66 | rb.AddElementContentHandlers(
67 | s,
68 | eh.ElementHandler,
69 | eh.CommentHandler,
70 | eh.TextChunkHandler,
71 | )
72 | }
73 | }
74 | r, err := rb.Build(sink, c)
75 | if err != nil {
76 | return nil, err
77 | }
78 | rb.Free()
79 | for _, s := range selectors {
80 | s.Free()
81 | }
82 |
83 | return &Writer{w: w, rewriter: r}, nil
84 | }
85 |
86 | func (w *Writer) Write(p []byte) (n int, err error) {
87 | if w.err != nil {
88 | return 0, w.err
89 | }
90 | if len(p) == 0 {
91 | return 0, nil
92 | }
93 | n, err = w.rewriter.Write(p)
94 | if err != nil {
95 | w.err = err
96 | return
97 | }
98 | return
99 | }
100 |
101 | // WriteString writes a string to the Writer.
102 | func (w *Writer) WriteString(s string) (n int, err error) {
103 | if w.err != nil {
104 | return 0, w.err
105 | }
106 | if len(s) == 0 {
107 | return 0, nil
108 | }
109 | n, err = w.rewriter.WriteString(s)
110 | if err != nil {
111 | w.err = err
112 | return
113 | }
114 | return
115 | }
116 |
117 | // Close closes the Writer, flushing any unwritten data to the underlying io.Writer,
118 | // but does not close the underlying io.Writer.
119 | // Subsequent calls to Close is a no-op.
120 | func (w *Writer) Close() error {
121 | if w == nil || w.closed {
122 | return nil
123 | }
124 | w.closed = true
125 | if w.err == nil {
126 | w.err = w.rewriter.End()
127 | }
128 | w.rewriter.Free()
129 | return w.err
130 | }
131 |
132 | // RewriteString rewrites the given string with the provided Handlers and Config.
133 | func RewriteString(s string, handlers *Handlers, config ...Config) (string, error) {
134 | var buf bytes.Buffer
135 | var w *Writer
136 | var err error
137 | if config != nil {
138 | w, err = NewWriter(&buf, handlers, config[0])
139 | } else {
140 | w, err = NewWriter(&buf, handlers)
141 | }
142 | if err != nil {
143 | return "", err
144 | }
145 |
146 | _, err = w.WriteString(s)
147 | if err != nil {
148 | return "", err
149 | }
150 |
151 | err = w.Close()
152 | if err != nil {
153 | return "", err
154 | }
155 |
156 | return buf.String(), nil
157 | }
158 |
--------------------------------------------------------------------------------