├── go.sum ├── .gitmodules ├── go.mod ├── .gitattributes ├── export_test.go ├── .idea ├── codeStyles │ ├── codeStyleConfig.xml │ └── Project.xml ├── misc.xml ├── vcs.xml ├── .gitignore ├── modules.xml ├── inspectionProfiles │ └── Project_Default.xml └── watcherTasks.xml ├── selector_test.go ├── .github └── workflows │ ├── golangci-lint.yml │ ├── go.yml │ └── build_static_lib.yml ├── error_test.go ├── const.go ├── lolhtml.go ├── selector.go ├── error.go ├── examples ├── defer-scripts │ └── main.go ├── mixed-content-rewriter │ └── main.go └── web-scraper │ ├── main.go │ └── index.html ├── doctype.go ├── string.go ├── pointer.go ├── rewriter.go ├── documentend.go ├── attribute.go ├── LICENSE ├── rewriter_test.go ├── callback.go ├── doctype_test.go ├── example_test.go ├── documentend_test.go ├── .gitignore ├── config.go ├── writer.go ├── textchunk.go ├── comment.go ├── README.md ├── textchunk_test.go ├── comment_test.go ├── benchmark_test.go ├── element.go └── element_test.go /go.sum: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/coolspring8/go-lolhtml 2 | 3 | go 1.15 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | build/* linguist-vendored 2 | examples/web-scraper/index.html linguist-vendored 3 | -------------------------------------------------------------------------------- /export_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | // just export some internal 
functions for tests 4 | 5 | var GetError = getError 6 | var NewSelector = newSelector 7 | -------------------------------------------------------------------------------- /.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /../../../../../../../:\Users\cools\Documents\Projects\hello-cgo\.idea/dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /selector_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/coolspring8/go-lolhtml" 7 | ) 8 | 9 | func TestNewSelector_UnsupportedSelector(t *testing.T) { 10 | s, err := lolhtml.NewSelector("p:last-child") 11 | if s != nil || err == nil { 12 | t.FailNow() 13 | } 14 | if err.Error() != "Unsupported pseudo-class or pseudo-element in selector." 
{ 15 | t.Error(err) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/golangci-lint.yml: -------------------------------------------------------------------------------- 1 | name: golangci-lint 2 | on: 3 | push: 4 | tags: 5 | - v* 6 | branches: 7 | - main 8 | pull_request: 9 | jobs: 10 | golangci: 11 | name: lint 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: golangci-lint 16 | uses: golangci/golangci-lint-action@v2 17 | with: 18 | version: v1.32 19 | -------------------------------------------------------------------------------- /error_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | // TestNullErrorStr tests internal functions for handling a null lol_html_str_t, by calling lol_html_take_last_error() 11 | // when there is no error. 12 | func TestNullErrorStr(t *testing.T) { 13 | err := lolhtml.GetError() 14 | if !errors.Is(err, lolhtml.ErrCannotGetErrorMessage) { 15 | t.Error(err) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /const.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | import "C" 4 | 5 | // RewriterDirective is a "status code" that should be returned by callback handlers, to inform the 6 | // rewriter to continue or stop parsing. 7 | type RewriterDirective int 8 | 9 | const ( 10 | // Continue lets the normal parsing process continue. 11 | Continue RewriterDirective = iota 12 | 13 | // Stop stops the rewriter immediately. Content currently buffered is discarded, and an error is returned. 14 | // After stopping, the Writer should not be used anymore except for Close().
15 | Stop 16 | ) 17 | -------------------------------------------------------------------------------- /lolhtml.go: -------------------------------------------------------------------------------- 1 | // Package lolhtml provides the ability to parse and rewrite HTML on the fly, 2 | // with a CSS-selector based API. 3 | // 4 | // It is a binding for the Rust crate lol_html. 5 | // https://github.com/cloudflare/lol-html 6 | // 7 | // Please see /examples subdirectory for more detailed examples. 8 | package lolhtml 9 | 10 | /* 11 | #cgo CFLAGS:-I${SRCDIR}/build/include 12 | #cgo LDFLAGS:-llolhtml 13 | #cgo !windows LDFLAGS:-lm 14 | #cgo linux,amd64 LDFLAGS:-L${SRCDIR}/build/linux-x86_64 15 | #cgo darwin,amd64 LDFLAGS:-L${SRCDIR}/build/macos-x86_64 16 | #cgo windows,amd64 LDFLAGS:-L${SRCDIR}/build/windows-x86_64 17 | #include 18 | #include "lol_html.h" 19 | */ 20 | import "C" 21 | -------------------------------------------------------------------------------- /selector.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import "unsafe" 9 | 10 | // selector represents a parsed CSS selector. 
11 | type selector C.lol_html_selector_t 12 | 13 | func newSelector(cssSelector string) (*selector, error) { 14 | selectorC := C.CString(cssSelector) 15 | defer C.free(unsafe.Pointer(selectorC)) 16 | selectorLen := len(cssSelector) 17 | s := (*selector)(C.lol_html_selector_parse(selectorC, C.size_t(selectorLen))) 18 | if s != nil { 19 | return s, nil 20 | } 21 | return nil, getError() 22 | } 23 | 24 | func (s *selector) Free() { 25 | if s != nil { 26 | C.lol_html_selector_free((*C.lol_html_selector_t)(s)) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /error.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include "lol_html.h" 5 | */ 6 | import "C" 7 | import "errors" 8 | 9 | // ErrCannotGetErrorMessage indicates getting error code from lol_html, but unable to acquire the concrete 10 | // error message. 11 | var ErrCannotGetErrorMessage = errors.New("cannot get error message from underlying lol_html lib") 12 | 13 | // getError is a helper function that gets error message for the last function call. 14 | // You should make sure there is an error when calling this, or the function interprets 15 | // the NULL error message obtained as ErrCannotGetErrorMessage. 
16 | func getError() error { 17 | errC := (*str)(C.lol_html_take_last_error()) 18 | defer errC.Free() 19 | if errMsg := errC.String(); errMsg != "" { 20 | return errors.New(errMsg) 21 | } 22 | return ErrCannotGetErrorMessage 23 | } 24 | -------------------------------------------------------------------------------- /examples/defer-scripts/main.go: -------------------------------------------------------------------------------- 1 | // Usage: curl -NL https://git.io/JeOSZ | go run main.go 2 | package main 3 | 4 | import ( 5 | "io" 6 | "log" 7 | "os" 8 | 9 | "github.com/coolspring8/go-lolhtml" 10 | ) 11 | 12 | func main() { 13 | w, err := lolhtml.NewWriter( 14 | os.Stdout, 15 | &lolhtml.Handlers{ 16 | ElementContentHandler: []lolhtml.ElementContentHandler{ 17 | { 18 | Selector: "script[src]:not([async]):not([defer])", 19 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 20 | err := e.SetAttribute("defer", "") 21 | if err != nil { 22 | log.Fatal(err) 23 | } 24 | return lolhtml.Continue 25 | }, 26 | }, 27 | }, 28 | }, 29 | ) 30 | if err != nil { 31 | log.Fatal(err) 32 | } 33 | 34 | _, err = io.Copy(w, os.Stdin) 35 | if err != nil { 36 | log.Fatal(err) 37 | } 38 | 39 | err = w.Close() 40 | if err != nil { 41 | log.Fatal(err) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /doctype.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include "lol_html.h" 5 | */ 6 | import "C" 7 | 8 | // Doctype represents the document's doctype. 9 | type Doctype C.lol_html_doctype_t 10 | 11 | // DoctypeHandlerFunc is a callback handler function to do something with a Doctype. 12 | type DoctypeHandlerFunc func(*Doctype) RewriterDirective 13 | 14 | // Name returns doctype name.
15 | func (d *Doctype) Name() string { 16 | nameC := (*str)(C.lol_html_doctype_name_get((*C.lol_html_doctype_t)(d))) 17 | defer nameC.Free() 18 | return nameC.String() 19 | } 20 | 21 | // PublicID returns doctype public ID. 22 | func (d *Doctype) PublicID() string { 23 | nameC := (*str)(C.lol_html_doctype_public_id_get((*C.lol_html_doctype_t)(d))) 24 | defer nameC.Free() 25 | return nameC.String() 26 | } 27 | 28 | // SystemID returns doctype system ID. 29 | func (d *Doctype) SystemID() string { 30 | nameC := (*str)(C.lol_html_doctype_system_id_get((*C.lol_html_doctype_t)(d))) 31 | defer nameC.Free() 32 | return nameC.String() 33 | } 34 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | 11 | build: 12 | name: Build 13 | runs-on: ubuntu-latest 14 | steps: 15 | 16 | - name: Set up Go 1.x 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: ^1.13 20 | id: go 21 | 22 | - name: Check out code into the Go module directory 23 | uses: actions/checkout@v2 24 | 25 | - name: Get dependencies 26 | run: | 27 | go get -v -t -d ./... 28 | if [ -f Gopkg.toml ]; then 29 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh 30 | dep ensure 31 | fi 32 | 33 | - name: Build 34 | run: go build -v . 35 | 36 | - name: Test 37 | run: go test -v . 
-race -coverprofile coverage.txt --covermode atomic 38 | 39 | - name: Upload reports to Codecov 40 | run: bash <(curl -s https://codecov.io/bash) 41 | -------------------------------------------------------------------------------- /.idea/codeStyles/Project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 11 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /string.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | // some string types passed by c api, and their helper functions 4 | 5 | /* 6 | #include "lol_html.h" 7 | */ 8 | import "C" 9 | 10 | type str C.lol_html_str_t 11 | 12 | // textChunkContent does not need to be de-allocated manually. 13 | type textChunkContent C.lol_html_text_chunk_content_t 14 | 15 | func (s *str) Free() { 16 | if s != nil { 17 | C.lol_html_str_free(*(*C.lol_html_str_t)(s)) 18 | } 19 | } 20 | 21 | // String is a helper function that translates the underlying-library-defined lol_html_str_t data to Go string. 22 | // It is the caller's responsibility to arrange for lol_html_str_t to be freed, 23 | // by calling str.Free() or lol_html_str_free(). 24 | // Potential issue: lol_html_str_t->len from size_t (uint) to int (int32) on 32-bit machines? 
25 | func (s *str) String() string { 26 | if s == nil { 27 | return "" 28 | } 29 | return C.GoStringN(s.data, C.int(s.len)) 30 | } 31 | 32 | func (s *textChunkContent) String() string { 33 | //var nullTextChunkContent textChunkContent 34 | //if s == nullTextChunkContent { 35 | // return "" 36 | //} 37 | if s == nil { 38 | return "" 39 | } 40 | return C.GoStringN(s.data, C.int(s.len)) 41 | } 42 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 20 | -------------------------------------------------------------------------------- /pointer.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | // Credit to https://github.com/mattn/go-pointer. 4 | 5 | // #include 6 | import "C" 7 | import ( 8 | "sync" 9 | "unsafe" 10 | ) 11 | 12 | // sync.Map documentation states that it is optimized for "when the entry for a given key is only 13 | // ever written once but read many times, as in caches that only grow". My benchmarks show that sync.Map 14 | // version rewriter is slower in single-goroutine calls, but faster when used in multiple goroutines 15 | // (and personally I think the latter is more important). 
16 | var store sync.Map 17 | 18 | func savePointer(v interface{}) unsafe.Pointer { 19 | if v == nil { 20 | return nil 21 | } 22 | 23 | ptr := C.malloc(C.size_t(1)) 24 | if ptr == nil { 25 | panic(`can't allocate "cgo-pointer hack index pointer": ptr == nil`) 26 | } 27 | 28 | store.Store(ptr, v) 29 | 30 | return ptr 31 | } 32 | 33 | func restorePointer(ptr unsafe.Pointer) (v interface{}) { 34 | if ptr == nil { 35 | return nil 36 | } 37 | 38 | if v, ok := store.Load(ptr); ok { 39 | return v 40 | } 41 | return nil 42 | } 43 | 44 | func unrefPointer(ptr unsafe.Pointer) { 45 | if ptr == nil { 46 | return 47 | } 48 | 49 | store.Delete(ptr) 50 | 51 | C.free(ptr) 52 | } 53 | 54 | func unrefPointers(ptrs []unsafe.Pointer) { 55 | for _, ptr := range ptrs { 56 | unrefPointer(ptr) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /examples/mixed-content-rewriter/main.go: -------------------------------------------------------------------------------- 1 | // Usage: curl -NL https://git.io/JeOSZ | go run main.go 2 | package main 3 | 4 | import ( 5 | "io" 6 | "log" 7 | "os" 8 | "strings" 9 | 10 | "github.com/coolspring8/go-lolhtml" 11 | ) 12 | 13 | func main() { 14 | w, err := lolhtml.NewWriter( 15 | os.Stdout, 16 | &lolhtml.Handlers{ 17 | ElementContentHandler: []lolhtml.ElementContentHandler{ 18 | { 19 | Selector: "a[href], link[rel=stylesheet][href]", 20 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 21 | rewriteUrlInAttribute(e, "href") 22 | return lolhtml.Continue 23 | }, 24 | }, 25 | { 26 | Selector: "script[src], iframe[src], img[src], audio[src], video[src]", 27 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 28 | rewriteUrlInAttribute(e, "src") 29 | return lolhtml.Continue 30 | }, 31 | }, 32 | }, 33 | }, 34 | ) 35 | if err != nil { 36 | log.Fatal(err) 37 | } 38 | 39 | _, err = io.Copy(w, os.Stdin) 40 | if err != nil { 41 | log.Fatal(err) 42 | } 43 | 44 | err = w.Close() 45 | 
if err != nil { 46 | log.Fatal(err) 47 | } 48 | } 49 | 50 | func rewriteUrlInAttribute(e *lolhtml.Element, attributeName string) { 51 | attr, err := e.AttributeValue(attributeName) 52 | if err != nil { 53 | log.Fatal(err) 54 | } 55 | attr = strings.ReplaceAll(attr, "http://", "https://") 56 | 57 | err = e.SetAttribute(attributeName, attr) 58 | if err != nil { 59 | log.Fatal(err) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /rewriter.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // rewriter represents an actual HTML rewriter. 13 | // rewriterBuilder, rewriter and selector are kept private to simplify public API. 14 | // If you find it useful to use them publicly, please inform me. 15 | type rewriter struct { 16 | rewriter *C.lol_html_rewriter_t 17 | pointers []unsafe.Pointer 18 | // TODO: unrecoverable bool 19 | } 20 | 21 | func (r *rewriter) Write(p []byte) (n int, err error) { 22 | pLen := len(p) 23 | // avoid 0-sized array 24 | if pLen == 0 { 25 | p = []byte("\x00") 26 | } 27 | pC := (*C.char)(unsafe.Pointer(&p[0])) 28 | errCode := C.lol_html_rewriter_write(r.rewriter, pC, C.size_t(pLen)) 29 | if errCode == 0 { 30 | return pLen, nil 31 | } 32 | return 0, getError() 33 | } 34 | 35 | func (r *rewriter) WriteString(chunk string) (n int, err error) { 36 | chunkC := C.CString(chunk) 37 | defer C.free(unsafe.Pointer(chunkC)) 38 | chunkLen := len(chunk) 39 | errCode := C.lol_html_rewriter_write(r.rewriter, chunkC, C.size_t(chunkLen)) 40 | if errCode == 0 { 41 | return chunkLen, nil 42 | } 43 | return 0, getError() 44 | } 45 | 46 | func (r *rewriter) End() error { 47 | errCode := C.lol_html_rewriter_end(r.rewriter) 48 | if errCode == 0 { 49 | return nil 50 | } 51 | return getError() 52 | } 53 | 54 | func (r *rewriter) Free() { 55 | if r != nil { 
56 | C.lol_html_rewriter_free(r.rewriter) 57 | unrefPointers(r.pointers) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /documentend.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import "unsafe" 9 | 10 | // DocumentEnd represents the end of the document. 11 | type DocumentEnd C.lol_html_doc_end_t 12 | 13 | // DocumentEndHandlerFunc is a callback handler function to do something with a DocumentEnd. 14 | type DocumentEndHandlerFunc func(*DocumentEnd) RewriterDirective 15 | 16 | // AppendAsText appends the given content at the end of the document. 17 | // 18 | // The rewriter will HTML-escape the content before appending: 19 | // 20 | // `<` will be replaced with `&lt;` 21 | // 22 | // `>` will be replaced with `&gt;` 23 | // 24 | // `&` will be replaced with `&amp;` 25 | func (d *DocumentEnd) AppendAsText(content string) error { 26 | contentC := C.CString(content) 27 | defer C.free(unsafe.Pointer(contentC)) 28 | contentLen := len(content) 29 | errCode := C.lol_html_doc_end_append((*C.lol_html_doc_end_t)(d), contentC, C.size_t(contentLen), false) 30 | if errCode == 0 { 31 | return nil 32 | } 33 | return getError() 34 | } 35 | 36 | // AppendAsHTML appends the given content at the end of the document. 37 | // The content is appended as is.
38 | func (d *DocumentEnd) AppendAsHTML(content string) error { 39 | contentC := C.CString(content) 40 | defer C.free(unsafe.Pointer(contentC)) 41 | contentLen := len(content) 42 | errCode := C.lol_html_doc_end_append((*C.lol_html_doc_end_t)(d), contentC, C.size_t(contentLen), true) 43 | if errCode == 0 { 44 | return nil 45 | } 46 | return getError() 47 | } 48 | -------------------------------------------------------------------------------- /attribute.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | 9 | // AttributeIterator can be used to iterate over all attributes of an element. The only way to 10 | // get an AttributeIterator is by calling AttributeIterator() on an Element. Note the "range" syntax is not 11 | // applicable here, use AttributeIterator.Next() instead. 12 | type AttributeIterator C.lol_html_attributes_iterator_t 13 | 14 | // Attribute represents an HTML element attribute. Obtained by calling Next() on an AttributeIterator. 15 | type Attribute C.lol_html_attribute_t 16 | 17 | // Free frees the memory held by the AttributeIterator. 18 | func (ai *AttributeIterator) Free() { 19 | C.lol_html_attributes_iterator_free((*C.lol_html_attributes_iterator_t)(ai)) 20 | } 21 | 22 | // Next advances the iterator and returns next attribute. 23 | // Returns nil if the iterator has been exhausted. 24 | func (ai *AttributeIterator) Next() *Attribute { 25 | return (*Attribute)(C.lol_html_attributes_iterator_next((*C.lol_html_attributes_iterator_t)(ai))) 26 | } 27 | 28 | // Name returns the name of the attribute. 29 | func (a *Attribute) Name() string { 30 | nameC := (str)(C.lol_html_attribute_name_get((*C.lol_html_attribute_t)(a))) 31 | defer nameC.Free() 32 | return nameC.String() 33 | } 34 | 35 | // Value returns the value of the attribute. 
36 | func (a *Attribute) Value() string { 37 | valueC := (str)(C.lol_html_attribute_value_get((*C.lol_html_attribute_t)(a))) 38 | defer valueC.Free() 39 | return valueC.String() 40 | } 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, CoolSpring8 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /rewriter_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/coolspring8/go-lolhtml" 7 | ) 8 | 9 | func TestRewriter_NonAsciiEncoding(t *testing.T) { 10 | w, err := lolhtml.NewWriter( 11 | nil, 12 | nil, 13 | lolhtml.Config{ 14 | Encoding: "UTF-16", 15 | Memory: &lolhtml.MemorySettings{ 16 | PreallocatedParsingBufferSize: 0, 17 | MaxAllowedMemoryUsage: 16, 18 | }, 19 | Strict: true, 20 | }) 21 | if w != nil || err == nil { 22 | t.FailNow() 23 | } 24 | if err.Error() != "Expected ASCII-compatible encoding." { 25 | t.Error(err) 26 | } 27 | err = w.Close() 28 | if err != nil { 29 | t.Error(err) 30 | } 31 | } 32 | 33 | func TestRewriter_MemoryLimiting(t *testing.T) { 34 | w, err := lolhtml.NewWriter( 35 | nil, 36 | &lolhtml.Handlers{ 37 | ElementContentHandler: []lolhtml.ElementContentHandler{ 38 | { 39 | "span", 40 | nil, 41 | nil, 42 | nil, 43 | }, 44 | }, 45 | }, 46 | lolhtml.Config{ 47 | Encoding: "utf-8", 48 | Memory: &lolhtml.MemorySettings{ 49 | PreallocatedParsingBufferSize: 0, 50 | MaxAllowedMemoryUsage: 5, 51 | }, 52 | Strict: true, 53 | }, 54 | ) 55 | if err != nil { 56 | t.Error(err) 57 | } 58 | _, err = w.Write([]byte("`)) 36 | if err != nil { 37 | t.Error(err) 38 | } 39 | err = w.Close() 40 | if err != nil { 41 | t.Error(err) 42 | } 43 | } 44 | 45 | func TestDoctype_StopRewriting(t *testing.T) { 46 | w, err := lolhtml.NewWriter( 47 | nil, 48 | &lolhtml.Handlers{ 49 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 50 | { 51 | DoctypeHandler: func(d *lolhtml.Doctype) lolhtml.RewriterDirective { 52 | return lolhtml.Stop 53 | }, 54 | }, 55 | }, 56 | }, 57 | ) 58 | if err != nil { 59 | t.Error(err) 60 | } 61 | 62 | _, err = w.Write([]byte("")) 63 | if err == nil { 64 | t.FailNow() 65 | } 66 | if err.Error() != "The rewriter has been stopped." 
{ 67 | t.Error(err) 68 | } 69 | err = w.Close() 70 | if err == nil { 71 | t.FailNow() 72 | } 73 | if err.Error() != "The rewriter has been stopped." { 74 | t.Error(err) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | // This file is for demonstration in godoc. For more examples, see the /examples directory. 2 | package lolhtml_test 3 | 4 | import ( 5 | "bytes" 6 | "fmt" 7 | "io" 8 | "log" 9 | "os" 10 | "strings" 11 | 12 | "github.com/coolspring8/go-lolhtml" 13 | ) 14 | 15 | func ExampleNewWriter() { 16 | chunk := []byte("Hello, World!") 17 | r := bytes.NewReader(chunk) 18 | w, err := lolhtml.NewWriter( 19 | // output to stdout 20 | os.Stdout, 21 | &lolhtml.Handlers{ 22 | ElementContentHandler: []lolhtml.ElementContentHandler{ 23 | { 24 | Selector: "span", 25 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 26 | err := e.SetInnerContentAsText("LOL-HTML") 27 | if err != nil { 28 | log.Fatal(err) 29 | } 30 | return lolhtml.Continue 31 | }, 32 | }, 33 | }, 34 | }, 35 | ) 36 | if err != nil { 37 | log.Fatal(err) 38 | } 39 | 40 | // copy from the bytes reader to lolhtml writer 41 | _, err = io.Copy(w, r) 42 | if err != nil { 43 | log.Fatal(err) 44 | } 45 | 46 | // explicitly close the writer and flush the remaining content 47 | err = w.Close() 48 | if err != nil { 49 | log.Fatal(err) 50 | } 51 | // Output: Hello, LOL-HTML! 52 | } 53 | 54 | func ExampleRewriteString() { 55 | output, err := lolhtml.RewriteString( 56 | `
`, 57 | &lolhtml.Handlers{ 58 | ElementContentHandler: []lolhtml.ElementContentHandler{ 59 | { 60 | Selector: "a[href]", 61 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 62 | href, err := e.AttributeValue("href") 63 | if err != nil { 64 | log.Fatal(err) 65 | } 66 | href = strings.ReplaceAll(href, "http:", "https:") 67 | 68 | err = e.SetAttribute("href", href) 69 | if err != nil { 70 | log.Fatal(err) 71 | } 72 | 73 | return lolhtml.Continue 74 | }, 75 | }, 76 | }, 77 | }, 78 | ) 79 | if err != nil { 80 | log.Fatal(err) 81 | } 82 | 83 | fmt.Println(output) 84 | // Output:
85 | } 86 | -------------------------------------------------------------------------------- /documentend_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | func TestDocumentEnd_AppendToEmptyDoc(t *testing.T) { 11 | var buf bytes.Buffer 12 | w, err := lolhtml.NewWriter( 13 | &buf, 14 | &lolhtml.Handlers{ 15 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 16 | { 17 | DocumentEndHandler: func(docEnd *lolhtml.DocumentEnd) lolhtml.RewriterDirective { 18 | if err := docEnd.AppendAsHTML(""); err != nil { 19 | t.Error(err) 20 | } 21 | if err := docEnd.AppendAsText("hello & world"); err != nil { 22 | t.Error(err) 23 | } 24 | return lolhtml.Continue 25 | }, 26 | }, 27 | }, 28 | }, 29 | ) 30 | if err != nil { 31 | t.Error(err) 32 | } 33 | 34 | if _, err = w.Write([]byte("")); err != nil { 35 | t.Error(err) 36 | } 37 | if err = w.Close(); err != nil { 38 | t.Error(err) 39 | } 40 | wantedText := "hello & world" 41 | if finalText := buf.String(); finalText != wantedText { 42 | t.Errorf("want %s got %s \n", wantedText, finalText) 43 | } 44 | } 45 | 46 | func TestDocumentEnd_AppendAtEnd(t *testing.T) { 47 | var buf bytes.Buffer 48 | w, err := lolhtml.NewWriter( 49 | &buf, 50 | &lolhtml.Handlers{ 51 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 52 | { 53 | DocumentEndHandler: func(docEnd *lolhtml.DocumentEnd) lolhtml.RewriterDirective { 54 | if err := docEnd.AppendAsHTML(""); err != nil { 55 | t.Error(err) 56 | } 57 | if err := docEnd.AppendAsText("hello & world"); err != nil { 58 | t.Error(err) 59 | } 60 | return lolhtml.Continue 61 | }, 62 | }, 63 | }, 64 | }, 65 | ) 66 | if err != nil { 67 | t.Error(err) 68 | } 69 | 70 | if _, err = w.Write([]byte("
Hello
")); err != nil { 71 | t.Error(err) 72 | } 73 | if err = w.Close(); err != nil { 74 | t.Error(err) 75 | } 76 | wantedText := "
Hello
hello & world" 77 | if finalText := buf.String(); finalText != wantedText { 78 | t.Errorf("want %s got %s \n", wantedText, finalText) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # JetBrains.gitignore 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 4 | 5 | # User-specific stuff 6 | .idea/**/workspace.xml 7 | .idea/**/tasks.xml 8 | .idea/**/usage.statistics.xml 9 | .idea/**/dictionaries 10 | .idea/**/shelf 11 | 12 | # Generated files 13 | .idea/**/contentModel.xml 14 | 15 | # Sensitive or high-churn files 16 | .idea/**/dataSources/ 17 | .idea/**/dataSources.ids 18 | .idea/**/dataSources.local.xml 19 | .idea/**/sqlDataSources.xml 20 | .idea/**/dynamic.xml 21 | .idea/**/uiDesigner.xml 22 | .idea/**/dbnavigator.xml 23 | 24 | # Gradle 25 | .idea/**/gradle.xml 26 | .idea/**/libraries 27 | 28 | # Gradle and Maven with auto-import 29 | # When using Gradle or Maven with auto-import, you should exclude module files, 30 | # since they will be recreated, and may cause churn. Uncomment if using 31 | # auto-import. 
32 | # .idea/artifacts 33 | # .idea/compiler.xml 34 | # .idea/jarRepositories.xml 35 | # .idea/modules.xml 36 | # .idea/*.iml 37 | # .idea/modules 38 | *.iml 39 | # *.ipr 40 | 41 | # CMake 42 | cmake-build-*/ 43 | 44 | # Mongo Explorer plugin 45 | .idea/**/mongoSettings.xml 46 | 47 | # File-based project format 48 | *.iws 49 | 50 | # IntelliJ 51 | out/ 52 | 53 | # mpeltonen/sbt-idea plugin 54 | .idea_modules/ 55 | 56 | # JIRA plugin 57 | atlassian-ide-plugin.xml 58 | 59 | # Cursive Clojure plugin 60 | .idea/replstate.xml 61 | 62 | # Crashlytics plugin (for Android Studio and IntelliJ) 63 | com_crashlytics_export_strings.xml 64 | crashlytics.properties 65 | crashlytics-build.properties 66 | fabric.properties 67 | 68 | # Editor-based Rest Client 69 | .idea/httpRequests 70 | 71 | # Android studio 3.1+ serialized cache file 72 | .idea/caches/build_file_checksums.ser 73 | 74 | 75 | # Go.gitignore 76 | # Binaries for programs and plugins 77 | *.exe 78 | *.exe~ 79 | *.dll 80 | *.so 81 | *.dylib 82 | 83 | # Test binary, built with `go test -c` 84 | *.test 85 | 86 | # Output of the go coverage tool, specifically when used with LiteIDE 87 | *.out 88 | 89 | # Dependency directories (remove the comment below to include it) 90 | # vendor/ 91 | 92 | 93 | # Other files 94 | release/ 95 | *.def 96 | .rustc_info.json 97 | -------------------------------------------------------------------------------- /.idea/watcherTasks.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 28 | 29 | 40 | 52 | 53 | 64 | 76 | 77 | -------------------------------------------------------------------------------- /config.go: --------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include "lol_html.h"
5 | */
6 | import "C"
7 | import (
8 | 	"unsafe"
9 | )
10 |
11 | // Config defines settings for the rewriter.
12 | type Config struct {
13 | 	// newDefaultConfig uses "utf-8".
14 | 	Encoding string
15 | 	// newDefaultConfig uses PreallocatedParsingBufferSize: 1024 and
16 | 	// MaxAllowedMemoryUsage: the largest int (1<<63 - 1 on 64-bit platforms).
17 | 	Memory *MemorySettings
18 | 	// newDefaultConfig uses func([]byte) {}. In other words, totally discard output.
19 | 	Sink OutputSink
20 | 	// If true, bail out for security reasons when ambiguous.
21 | 	// newDefaultConfig sets it to true; the Go zero value is false.
22 | 	Strict bool
23 | }
24 |
25 | // newDefaultConfig returns a Config populated with the package defaults.
26 | func newDefaultConfig() Config {
27 | 	return Config{
28 | 		Encoding: "utf-8",
29 | 		Memory: &MemorySettings{
30 | 			PreallocatedParsingBufferSize: 1024,
31 | 			// Largest value an int can hold. This equals 1<<63 - 1 where int
32 | 			// is 64 bits wide and, unlike that literal, is still a valid int
33 | 			// constant where int is 32 bits wide.
34 | 			MaxAllowedMemoryUsage: int(^uint(0) >> 1),
35 | 		},
36 | 		Sink:   func([]byte) {},
37 | 		Strict: true,
38 | 	}
39 | }
40 |
41 | // MemorySettings sets the memory limitations for the rewriter.
42 | type MemorySettings struct {
43 | 	PreallocatedParsingBufferSize int // newDefaultConfig uses 1024
44 | 	MaxAllowedMemoryUsage         int // newDefaultConfig uses the largest int
45 | }
46 |
47 | // OutputSink is a callback function where output is written to. A byte slice is passed each time,
48 | // representing a chunk of output.
49 | //
50 | // Exported for special usages which require each output chunk to be identified and processed
51 | // individually. For most common uses, NewWriter would be more convenient.
52 | type OutputSink func([]byte)
53 |
54 | // DocumentContentHandler is a group of handlers that would be applied to the whole HTML document.
55 | type DocumentContentHandler struct {
56 | 	DoctypeHandler     DoctypeHandlerFunc
57 | 	CommentHandler     CommentHandlerFunc
58 | 	TextChunkHandler   TextChunkHandlerFunc
59 | 	DocumentEndHandler DocumentEndHandlerFunc
60 | }
61 |
62 | // ElementContentHandler is a group of handlers that would be applied to the content matched by
63 | // the given selector.
64 | type ElementContentHandler struct {
65 | 	Selector         string
66 | 	ElementHandler   ElementHandlerFunc
67 | 	CommentHandler   CommentHandlerFunc
68 | 	TextChunkHandler TextChunkHandlerFunc
69 | }
70 |
71 | // Handlers contain DocumentContentHandlers and ElementContentHandlers. Can contain arbitrary numbers
72 | // of them, including zero (nil slice).
73 | type Handlers struct {
74 | 	DocumentContentHandler []DocumentContentHandler
75 | 	ElementContentHandler  []ElementContentHandler
76 | }
77 |
78 | // callbackSink is exported to C through cgo. It copies chunkLen bytes starting at
79 | // chunk into a Go byte slice, obtains the value restorePointer returns for
80 | // userData, asserts it to OutputSink (panicking on any other type) and calls it
81 | // with the copy.
82 | //
83 | //export callbackSink
84 | func callbackSink(chunk *C.char, chunkLen C.size_t, userData unsafe.Pointer) {
85 | 	c := C.GoBytes(unsafe.Pointer(chunk), C.int(chunkLen))
86 | 	cb := restorePointer(userData).(OutputSink)
87 | 	cb(c)
88 | }
89 |
90 | // callbackDoctype is exported to C through cgo. It asserts the value
91 | // restorePointer returns for userData to DoctypeHandlerFunc (panicking on any
92 | // other type) and returns that handler's directive for doctype.
93 | //
94 | //export callbackDoctype
95 | func callbackDoctype(doctype *Doctype, userData unsafe.Pointer) RewriterDirective {
96 | 	cb := restorePointer(userData).(DoctypeHandlerFunc)
97 | 	return cb(doctype)
98 | }
99 |
100 | // callbackComment is the CommentHandlerFunc counterpart of callbackDoctype.
101 | //
102 | //export callbackComment
103 | func callbackComment(comment *Comment, userData unsafe.Pointer) RewriterDirective {
104 | 	cb := restorePointer(userData).(CommentHandlerFunc)
105 | 	return cb(comment)
106 | }
107 |
108 | // callbackTextChunk is the TextChunkHandlerFunc counterpart of callbackDoctype.
109 | //
110 | //export callbackTextChunk
111 | func callbackTextChunk(textChunk *TextChunk, userData unsafe.Pointer) RewriterDirective {
112 | 	cb := restorePointer(userData).(TextChunkHandlerFunc)
113 | 	return cb(textChunk)
114 | }
115 |
116 | // callbackElement is the ElementHandlerFunc counterpart of callbackDoctype.
117 | //
118 | //export callbackElement
119 | func callbackElement(element *Element, userData unsafe.Pointer) RewriterDirective {
120 | 	cb := restorePointer(userData).(ElementHandlerFunc)
121 | 	return cb(element)
122 | }
123 |
124 | // callbackDocumentEnd is the DocumentEndHandlerFunc counterpart of callbackDoctype.
125 | //
126 | //export callbackDocumentEnd
127 | func callbackDocumentEnd(documentEnd *DocumentEnd, userData unsafe.Pointer) RewriterDirective {
128 | 	cb := restorePointer(userData).(DocumentEndHandlerFunc)
129 | 	return cb(documentEnd)
130 | }
131 |
-------------------------------------------------------------------------------- /writer.go: --------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | import (
4 | 	"bytes"
5 | 	"io"
6 | )
7 |
8 | // Writer takes data written to it and writes the rewritten form of that data to an
9 | // underlying writer (see NewWriter).
10 | type Writer struct {
11 | 	w        io.Writer // destination passed to NewWriter; may be nil
12 | 	rewriter *rewriter // freed by Close
13 | 	err      error     // first rewriter error; returned by every later call
14 | 	closed   bool      // set by Close
15 | }
16 |
17 | // NewWriter returns a new Writer with Handlers and an optional Config configured.
18 | // Writes to the returned Writer are rewritten and written to w.
19 | //
20 | // Only the first Config is used; without one, newDefaultConfig applies. Output is
21 | // passed to Config.Sink when that is set, otherwise to w, and is discarded when w
22 | // is nil as well. Errors returned by w.Write are ignored.
23 | //
24 | // It is the caller's responsibility to call Close on the Writer when done.
25 | // Writes may be buffered and not flushed until Close. There is no Flush method,
26 | // so before using the content written by w, it is necessary to call Close
27 | // to ensure w has finished writing.
28 | func NewWriter(w io.Writer, handlers *Handlers, config ...Config) (*Writer, error) {
29 | 	var c Config
30 | 	var sink OutputSink
31 | 	// Test the length rather than nil-ness: an empty, non-nil slice passed as
32 | 	// config... must not reach config[0].
33 | 	if len(config) > 0 {
34 | 		c = config[0]
35 | 		sink = c.Sink
36 | 	} else {
37 | 		c = newDefaultConfig()
38 | 	}
39 | 	if sink == nil {
40 | 		if w == nil {
41 | 			sink = func([]byte) {}
42 | 		} else {
43 | 			sink = func(p []byte) {
44 | 				_, _ = w.Write(p)
45 | 			}
46 | 		}
47 | 	}
48 |
49 | 	// The builder and the selectors are only needed until Build returns.
50 | 	// Deferring the frees also releases them on the error returns. Deferred
51 | 	// calls run last-in first-out, so the builder is freed ahead of the selectors.
52 | 	var selectors []*selector
53 | 	defer func() {
54 | 		for _, s := range selectors {
55 | 			s.Free()
56 | 		}
57 | 	}()
58 | 	rb := newRewriterBuilder()
59 | 	defer rb.Free()
60 |
61 | 	if handlers != nil {
62 | 		for _, dh := range handlers.DocumentContentHandler {
63 | 			rb.AddDocumentContentHandlers(
64 | 				dh.DoctypeHandler,
65 | 				dh.CommentHandler,
66 | 				dh.TextChunkHandler,
67 | 				dh.DocumentEndHandler,
68 | 			)
69 | 		}
70 | 		for _, eh := range handlers.ElementContentHandler {
71 | 			s, err := newSelector(eh.Selector)
72 | 			if err != nil {
73 | 				return nil, err
74 | 			}
75 | 			selectors = append(selectors, s)
76 | 			rb.AddElementContentHandlers(
77 | 				s,
78 | 				eh.ElementHandler,
79 | 				eh.CommentHandler,
80 | 				eh.TextChunkHandler,
81 | 			)
82 | 		}
83 | 	}
84 | 	r, err := rb.Build(sink, c)
85 | 	if err != nil {
86 | 		return nil, err
87 | 	}
88 |
89 | 	return &Writer{w: w, rewriter: r}, nil
90 | }
91 |
92 | // Write passes p to the rewriter and returns what the rewriter's Write reports.
93 | // An empty p is a no-op. Once a call has failed, the same error is returned by
94 | // every later call. After Close it returns io.ErrClosedPipe, because Close has
95 | // already freed the rewriter.
96 | func (w *Writer) Write(p []byte) (n int, err error) {
97 | 	if w.err != nil {
98 | 		return 0, w.err
99 | 	}
100 | 	if w.closed {
101 | 		return 0, io.ErrClosedPipe
102 | 	}
103 | 	if len(p) == 0 {
104 | 		return 0, nil
105 | 	}
106 | 	n, err = w.rewriter.Write(p)
107 | 	if err != nil {
108 | 		w.err = err
109 | 	}
110 | 	return n, err
111 | }
112 |
113 | // WriteString writes a string to the Writer. It follows the same rules as Write.
114 | func (w *Writer) WriteString(s string) (n int, err error) {
115 | 	if w.err != nil {
116 | 		return 0, w.err
117 | 	}
118 | 	if w.closed {
119 | 		return 0, io.ErrClosedPipe
120 | 	}
121 | 	if len(s) == 0 {
122 | 		return 0, nil
123 | 	}
124 | 	n, err = w.rewriter.WriteString(s)
125 | 	if err != nil {
126 | 		w.err = err
127 | 	}
128 | 	return n, err
129 | }
130 |
131 | // Close closes the Writer, flushing any unwritten data to the underlying io.Writer,
132 | // but does not close the underlying io.Writer.
133 | // Subsequent calls to Close are no-ops.
134 | func (w *Writer) Close() error {
135 | 	if w == nil || w.closed {
136 | 		return nil
137 | 	}
138 | 	w.closed = true
139 | 	// End is skipped when an earlier write already failed; the rewriter is
140 | 	// freed either way and the stored error is returned.
141 | 	if w.err == nil {
142 | 		w.err = w.rewriter.End()
143 | 	}
144 | 	w.rewriter.Free()
145 | 	return w.err
146 | }
147 |
148 | // RewriteString rewrites the given string with the provided Handlers and Config.
149 | // It returns the empty string together with the error if any step fails.
150 | func RewriteString(s string, handlers *Handlers, config ...Config) (string, error) {
151 | 	var buf bytes.Buffer
152 | 	w, err := NewWriter(&buf, handlers, config...)
153 | 	if err != nil {
154 | 		return "", err
155 | 	}
156 | 	// Close is a no-op the second time, so this only does work on the early
157 | 	// return below, where the rewriter would otherwise never be freed.
158 | 	defer func() { _ = w.Close() }()
159 |
160 | 	if _, err = w.WriteString(s); err != nil {
161 | 		return "", err
162 | 	}
163 | 	if err = w.Close(); err != nil {
164 | 		return "", err
165 | 	}
166 |
167 | 	return buf.String(), nil
168 | }
169 |
-------------------------------------------------------------------------------- /textchunk.go: --------------------------------------------------------------------------------
1 | package lolhtml
2 |
3 | /*
4 | #include <stdlib.h>
5 | #include "lol_html.h"
6 | */
7 | import "C"
8 | import "unsafe"
9 |
10 | // TextChunk represents a text chunk.
11 | type TextChunk C.lol_html_text_chunk_t 12 | 13 | // TextChunkHandlerFunc is a callback handler function to do something with a TextChunk. 14 | type TextChunkHandlerFunc func(*TextChunk) RewriterDirective 15 | 16 | // Content returns the text chunk's content. 17 | func (t *TextChunk) Content() string { 18 | text := (textChunkContent)(C.lol_html_text_chunk_content_get((*C.lol_html_text_chunk_t)(t))) 19 | return text.String() 20 | } 21 | 22 | // IsLastInTextNode returns whether the text chunk is the last in the text node. 23 | func (t *TextChunk) IsLastInTextNode() bool { 24 | return (bool)(C.lol_html_text_chunk_is_last_in_text_node((*C.lol_html_text_chunk_t)(t))) 25 | } 26 | 27 | type textChunkAlter int 28 | 29 | const ( 30 | textChunkInsertBefore textChunkAlter = iota 31 | textChunkInsertAfter 32 | textChunkReplace 33 | ) 34 | 35 | func (t *TextChunk) alter(content string, alter textChunkAlter, isHTML bool) error { 36 | contentC := C.CString(content) 37 | defer C.free(unsafe.Pointer(contentC)) 38 | contentLen := len(content) 39 | var errCode C.int 40 | switch alter { 41 | case textChunkInsertBefore: 42 | errCode = C.lol_html_text_chunk_before((*C.lol_html_text_chunk_t)(t), contentC, C.size_t(contentLen), C.bool(isHTML)) 43 | case textChunkInsertAfter: 44 | errCode = C.lol_html_text_chunk_after((*C.lol_html_text_chunk_t)(t), contentC, C.size_t(contentLen), C.bool(isHTML)) 45 | case textChunkReplace: 46 | errCode = C.lol_html_text_chunk_replace((*C.lol_html_text_chunk_t)(t), contentC, C.size_t(contentLen), C.bool(isHTML)) 47 | default: 48 | panic("not implemented") 49 | } 50 | if errCode == 0 { 51 | return nil 52 | } 53 | return getError() 54 | } 55 | 56 | // InsertBeforeAsText inserts the given content before the text chunk. 
57 | // 58 | // The rewriter will HTML-escape the content before insertion: 59 | // 60 | // `<` will be replaced with `<` 61 | // 62 | // `>` will be replaced with `>` 63 | // 64 | // `&` will be replaced with `&` 65 | func (t *TextChunk) InsertBeforeAsText(content string) error { 66 | return t.alter(content, textChunkInsertBefore, false) 67 | } 68 | 69 | // InsertBeforeAsHTML inserts the given content before the text chunk. 70 | // The content is inserted as is. 71 | func (t *TextChunk) InsertBeforeAsHTML(content string) error { 72 | return t.alter(content, textChunkInsertBefore, true) 73 | } 74 | 75 | // InsertAfterAsText inserts the given content after the text chunk. 76 | // 77 | // The rewriter will HTML-escape the content before insertion: 78 | // 79 | // `<` will be replaced with `<` 80 | // 81 | // `>` will be replaced with `>` 82 | // 83 | // `&` will be replaced with `&` 84 | func (t *TextChunk) InsertAfterAsText(content string) error { 85 | return t.alter(content, textChunkInsertAfter, false) 86 | } 87 | 88 | // InsertAfterAsHTML inserts the given content after the text chunk. 89 | // The content is inserted as is. 90 | func (t *TextChunk) InsertAfterAsHTML(content string) error { 91 | return t.alter(content, textChunkInsertAfter, true) 92 | } 93 | 94 | // ReplaceAsText replace the text chunk with the supplied content. 95 | // 96 | // The rewriter will HTML-escape the content: 97 | // 98 | // `<` will be replaced with `<` 99 | // 100 | // `>` will be replaced with `>` 101 | // 102 | // `&` will be replaced with `&` 103 | func (t *TextChunk) ReplaceAsText(content string) error { 104 | return t.alter(content, textChunkReplace, false) 105 | } 106 | 107 | // ReplaceAsHTML replace the text chunk with the supplied content. 108 | // The content is kept as is. 109 | func (t *TextChunk) ReplaceAsHTML(content string) error { 110 | return t.alter(content, textChunkReplace, true) 111 | } 112 | 113 | // Remove removes the text chunk. 
114 | func (t *TextChunk) Remove() { 115 | C.lol_html_text_chunk_remove((*C.lol_html_text_chunk_t)(t)) 116 | } 117 | 118 | // IsRemoved returns whether the text chunk is removed or not. 119 | func (t *TextChunk) IsRemoved() bool { 120 | return (bool)(C.lol_html_text_chunk_is_removed((*C.lol_html_text_chunk_t)(t))) 121 | } 122 | -------------------------------------------------------------------------------- /comment.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import "unsafe" 9 | 10 | // Comment represents an HTML comment. 11 | type Comment C.lol_html_comment_t 12 | 13 | // CommentHandlerFunc is a callback handler function to do something with a Comment. 14 | // Expected to return a RewriterDirective as instruction to continue or stop. 15 | type CommentHandlerFunc func(*Comment) RewriterDirective 16 | 17 | // Text returns the comment's text. 18 | func (c *Comment) Text() string { 19 | textC := (str)(C.lol_html_comment_text_get((*C.lol_html_comment_t)(c))) 20 | defer textC.Free() 21 | return textC.String() 22 | } 23 | 24 | // SetText sets the comment's text and returns an error if there is one. 
25 | func (c *Comment) SetText(text string) error {
26 | 	// C.CString allocates a C copy of text; it is freed when SetText returns.
27 | 	textC := C.CString(text)
28 | 	defer C.free(unsafe.Pointer(textC))
29 | 	textLen := len(text)
30 | 	errCode := C.lol_html_comment_text_set((*C.lol_html_comment_t)(c), textC, C.size_t(textLen))
31 | 	if errCode == 0 {
32 | 		return nil
33 | 	}
34 | 	return getError()
35 | }
36 |
37 | // commentAlter selects which lol_html_comment_* call alter makes.
38 | type commentAlter int
39 |
40 | const (
41 | 	commentInsertBefore commentAlter = iota
42 | 	commentInsertAfter
43 | 	commentReplace
44 | )
45 |
46 | // alter copies content into a C string (freed on return) and calls
47 | // lol_html_comment_before, _after or _replace according to alter, passing
48 | // isHTML through. It returns getError() when the call reports a non-zero code
49 | // and panics on an unknown alter value.
50 | func (c *Comment) alter(content string, alter commentAlter, isHTML bool) error {
51 | 	contentC := C.CString(content)
52 | 	defer C.free(unsafe.Pointer(contentC))
53 | 	contentLen := len(content)
54 | 	var errCode C.int
55 | 	switch alter {
56 | 	case commentInsertBefore:
57 | 		errCode = C.lol_html_comment_before((*C.lol_html_comment_t)(c), contentC, C.size_t(contentLen), C.bool(isHTML))
58 | 	case commentInsertAfter:
59 | 		errCode = C.lol_html_comment_after((*C.lol_html_comment_t)(c), contentC, C.size_t(contentLen), C.bool(isHTML))
60 | 	case commentReplace:
61 | 		errCode = C.lol_html_comment_replace((*C.lol_html_comment_t)(c), contentC, C.size_t(contentLen), C.bool(isHTML))
62 | 	default:
63 | 		panic("not implemented")
64 | 	}
65 | 	if errCode == 0 {
66 | 		return nil
67 | 	}
68 | 	return getError()
69 | }
70 |
71 | // InsertBeforeAsText inserts the given content before the comment.
72 | //
73 | // The rewriter will HTML-escape the content before insertion:
74 | //
75 | // 	`<` will be replaced with `&lt;`
76 | //
77 | // 	`>` will be replaced with `&gt;`
78 | //
79 | // 	`&` will be replaced with `&amp;`
80 | func (c *Comment) InsertBeforeAsText(content string) error {
81 | 	// Must be commentInsertBefore: commentInsertAfter here would place the
82 | 	// content after the comment, exactly like InsertAfterAsText.
83 | 	return c.alter(content, commentInsertBefore, false)
84 | }
85 |
86 | // InsertBeforeAsHTML inserts the given content before the comment.
87 | // The content is inserted as is.
88 | func (c *Comment) InsertBeforeAsHTML(content string) error {
89 | 	return c.alter(content, commentInsertBefore, true)
90 | }
91 |
92 | // InsertAfterAsText inserts the given content after the comment.
93 | //
94 | // The rewriter will HTML-escape the content before insertion:
95 | //
96 | // 	`<` will be replaced with `&lt;`
97 | //
98 | // 	`>` will be replaced with `&gt;`
99 | //
100 | // 	`&` will be replaced with `&amp;`
101 | func (c *Comment) InsertAfterAsText(content string) error {
102 | 	return c.alter(content, commentInsertAfter, false)
103 | }
104 |
105 | // InsertAfterAsHTML inserts the given content after the comment.
106 | // The content is inserted as is.
107 | func (c *Comment) InsertAfterAsHTML(content string) error {
108 | 	return c.alter(content, commentInsertAfter, true)
109 | }
110 |
111 | // ReplaceAsText replaces the comment with the supplied content.
112 | //
113 | // The rewriter will HTML-escape the content:
114 | //
115 | // 	`<` will be replaced with `&lt;`
116 | //
117 | // 	`>` will be replaced with `&gt;`
118 | //
119 | // 	`&` will be replaced with `&amp;`
120 | func (c *Comment) ReplaceAsText(content string) error {
121 | 	return c.alter(content, commentReplace, false)
122 | }
123 |
124 | // ReplaceAsHTML replaces the comment with the supplied content.
125 | // The content is kept as is.
126 | func (c *Comment) ReplaceAsHTML(content string) error {
127 | 	return c.alter(content, commentReplace, true)
128 | }
129 |
130 | // Remove removes the comment.
131 | func (c *Comment) Remove() {
132 | 	C.lol_html_comment_remove((*C.lol_html_comment_t)(c))
133 | }
134 |
135 | // IsRemoved returns whether the comment is removed or not.
136 | func (c *Comment) IsRemoved() bool {
137 | 	return (bool)(C.lol_html_comment_is_removed((*C.lol_html_comment_t)(c)))
138 | }
139 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-lolhtml 2 | 3 | ![GitHub Workflow Status](https://img.shields.io/github/workflow/status/coolspring8/go-lolhtml/Go) [![codecov](https://codecov.io/gh/CoolSpring8/go-lolhtml/branch/main/graph/badge.svg)](https://codecov.io/gh/CoolSpring8/go-lolhtml) [![Go Report Card](https://goreportcard.com/badge/github.com/coolspring8/go-lolhtml)](https://goreportcard.com/report/github.com/coolspring8/go-lolhtml) [![PkgGoDev](https://pkg.go.dev/badge/github.com/coolspring8/go-lolhtml)](https://pkg.go.dev/github.com/coolspring8/go-lolhtml) 4 | 5 | Go bindings for the Rust crate [cloudflare/lol-html](https://github.com/cloudflare/lol-html/), the *Low Output Latency streaming HTML rewriter/parser with CSS-selector based API*, talking via cgo. 6 | 7 | **Status:** 8 | 9 | **All abilities provided by lol_html's c-api are available**, except for customized user data in handlers. The original tests included in c-api package have also been translated to examine this binding's functionality. 10 | 11 | The code is at its early stage and **breaking changes might be introduced**. If you have any ideas on how the public API can be better structured, feel free to open a PR or an issue. 
12 | 13 | * [go-lolhtml](#go-lolhtml) 14 | * [Installation](#installation) 15 | * [Features](#features) 16 | * [Getting Started](#getting-started) 17 | * [Examples](#examples) 18 | * [Documentation](#documentation) 19 | * [Other Bindings](#other-bindings) 20 | * [Versioning](#versioning) 21 | * [Help Wanted!](#help-wanted) 22 | * [License](#license) 23 | * [Disclaimer](#disclaimer) 24 | 25 | ## Installation 26 | 27 | For Linux/macOS/Windows x86_64 platform users, installation is as simple as a single `go get` command: 28 | 29 | ```shell 30 | $ go get github.com/coolspring8/go-lolhtml 31 | ``` 32 | 33 | Installing Rust is not a necessary step. That's because lol-html could be prebuilt into static libraries, stored and shipped in `/build` folder, so that cgo can handle other compilation matters naturally and smoothly, without intervention. 34 | 35 | For other platforms, you will have to compile it yourself. 36 | 37 | ## Features 38 | 39 | - Fast: A Go (cgo) wrapper built around the highly-optimized Rust HTML parsing crate lol_html. 40 | - Easy to use: Utilizing Go's idiomatic I/O methods, [lolhtml.Writer](https://pkg.go.dev/github.com/coolspring8/go-lolhtml#Writer) implements [io.Writer](https://golang.org/pkg/io/#Writer) interface. 
41 | 42 | ## Getting Started 43 | 44 | Now let's initialize a project and create `main.go`: 45 | 46 | ```go 47 | package main 48 | 49 | import ( 50 | "bytes" 51 | "io" 52 | "log" 53 | "os" 54 | 55 | "github.com/coolspring8/go-lolhtml" 56 | ) 57 | 58 | func main() { 59 | chunk := []byte("Hello, <span>World</span>!") 60 | r := bytes.NewReader(chunk) 61 | w, err := lolhtml.NewWriter( 62 | // output to stdout 63 | os.Stdout, 64 | &lolhtml.Handlers{ 65 | ElementContentHandler: []lolhtml.ElementContentHandler{ 66 | { 67 | Selector: "span", 68 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 69 | err := e.SetInnerContentAsText("LOL-HTML") 70 | if err != nil { 71 | log.Fatal(err) 72 | } 73 | return lolhtml.Continue 74 | }, 75 | }, 76 | }, 77 | }, 78 | ) 79 | if err != nil { 80 | log.Fatal(err) 81 | } 82 | 83 | // copy from the bytes reader to lolhtml writer 84 | _, err = io.Copy(w, r) 85 | if err != nil { 86 | log.Fatal(err) 87 | } 88 | 89 | // explicitly close the writer and flush the remaining content 90 | err = w.Close() 91 | if err != nil { 92 | log.Fatal(err) 93 | } 94 | // Output: Hello, <span>LOL-HTML</span>! 95 | } 96 | ``` 97 | 98 | The above program creates a new Writer configured to rewrite the text inside every `span` tag to "LOL-HTML". It takes the chunk `Hello, <span>World</span>!` as input, and prints the result to standard output. 99 | 100 | And the result is `Hello, <span>LOL-HTML</span>!`. 101 | 102 | ## Examples 103 | 104 | example_test.go contains two examples. 105 | 106 | For more detailed examples, please visit the `/examples` subdirectory. 107 | 108 | - defer-scripts 109 | 110 | Usage: curl -NL https://git.io/JeOSZ | go run main.go 111 | 112 | - mixed-content-rewriter 113 | 114 | Usage: curl -NL https://git.io/JeOSZ | go run main.go 115 | 116 | - web-scraper 117 | 118 | A ported Go version of https://web.scraper.workers.dev/. 119 | 120 | ## Documentation 121 | 122 | Available at [pkg.go.dev](https://pkg.go.dev/github.com/coolspring8/go-lolhtml). 
123 | 124 | ## Other Bindings 125 | 126 | - Rust (native), C, JavaScript - [cloudflare/lol-html](https://github.com/cloudflare/lol-html/) 127 | - Lua - [jdesgats/lua-lolhtml](https://github.com/jdesgats/lua-lolhtml/) 128 | 129 | ## Versioning 130 | 131 | This package does not really follow [Semantic Versioning](https://semver.org/). The current strategy is to follow lol_html's major and minor version, and the patch version number is reserved for this binding's updates, so that Go Modules can upgrade correctly. 132 | 133 | ## Help Wanted! 134 | 135 | There are a few interesting things on the [Projects](https://github.com/coolspring8/go-lolhtml/projects/1) panel that I have considered but have not yet implemented. Other contributions and suggestions are also welcome! 136 | 137 | ## License 138 | 139 | BSD 3-Clause "New" or "Revised" License 140 | 141 | ## Disclaimer 142 | 143 | This is an unofficial binding. 144 | 145 | Cloudflare is a registered trademark of Cloudflare, Inc. Cloudflare names used in this project are for identification purposes only. The project is not associated in any way with Cloudflare Inc. 
-------------------------------------------------------------------------------- /textchunk_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | func TestTextChunk_InsertBeforeAndAfter(t *testing.T) { 11 | var buf bytes.Buffer 12 | w, err := lolhtml.NewWriter( 13 | &buf, 14 | &lolhtml.Handlers{ 15 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 16 | { 17 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 18 | content := tc.Content() 19 | if len(content) > 0 { 20 | if content != "Hey 42" { 21 | t.Errorf("got %s, want Hey 42", content) 22 | } 23 | if tc.IsLastInTextNode() { 24 | t.Error("text chunk last in text node flag incorrect, expected false, got true") 25 | } 26 | if tc.IsRemoved() { 27 | t.Error("text chunk removed flag incorrect, expected false, got true") 28 | } 29 | if err := tc.InsertBeforeAsHTML("
"); err != nil { 30 | t.Error(err) 31 | } 32 | if err := tc.InsertAfterAsText("
"); err != nil { 33 | t.Error(err) 34 | } 35 | } else { 36 | if !tc.IsLastInTextNode() { 37 | t.Error("text chunk last in text node flag incorrect, expected true, got false") 38 | } 39 | } 40 | return lolhtml.Continue 41 | }, 42 | }, 43 | }, 44 | }, 45 | ) 46 | if err != nil { 47 | t.Error(err) 48 | } 49 | 50 | if _, err := w.Write([]byte("Hey 42")); err != nil { 51 | t.Error(err) 52 | } 53 | if err := w.Close(); err != nil { 54 | t.Error(err) 55 | } 56 | wantedText := "
Hey 42</div>" 57 | if finalText := buf.String(); finalText != wantedText { 58 | t.Errorf("want %s got %s \n", wantedText, finalText) 59 | } 60 | } 61 | 62 | func TestTextChunk_Replace(t *testing.T) { 63 | var buf bytes.Buffer 64 | w, err := lolhtml.NewWriter( 65 | &buf, 66 | &lolhtml.Handlers{ 67 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 68 | { 69 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 70 | if len(tc.Content()) > 0 { 71 | if err := tc.ReplaceAsHTML(""); err != nil { 72 | t.Error(err) 73 | } 74 | if !tc.IsRemoved() { 75 | t.FailNow() 76 | } 77 | } 78 | return lolhtml.Continue 79 | }, 80 | }, 81 | }, 82 | }, 83 | ) 84 | if err != nil { 85 | t.Error(err) 86 | } 87 | 88 | if _, err := w.Write([]byte("
Hello
")); err != nil { 89 | t.Error(err) 90 | } 91 | if err := w.Close(); err != nil { 92 | t.Error(err) 93 | } 94 | wantedText := "
" 95 | if finalText := buf.String(); finalText != wantedText { 96 | t.Errorf("want %s got %s \n", wantedText, finalText) 97 | } 98 | } 99 | 100 | func TestTextChunk_InsertAfter(t *testing.T) { 101 | var buf bytes.Buffer 102 | w, err := lolhtml.NewWriter( 103 | &buf, 104 | &lolhtml.Handlers{ 105 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 106 | { 107 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 108 | if len(tc.Content()) > 0 { 109 | if err := tc.InsertAfterAsHTML(""); err != nil { 110 | t.Error(err) 111 | } 112 | } 113 | return lolhtml.Continue 114 | }, 115 | }, 116 | }, 117 | }, 118 | ) 119 | if err != nil { 120 | t.Error(err) 121 | } 122 | 123 | if _, err := w.Write([]byte("
hello
")); err != nil { 124 | t.Error(err) 125 | } 126 | if err := w.Close(); err != nil { 127 | t.Error(err) 128 | } 129 | wantedText := "
hello
" 130 | if finalText := buf.String(); finalText != wantedText { 131 | t.Errorf("want %s got %s \n", wantedText, finalText) 132 | } 133 | } 134 | 135 | func TestTextChunk_Remove(t *testing.T) { 136 | var buf bytes.Buffer 137 | w, err := lolhtml.NewWriter( 138 | &buf, 139 | &lolhtml.Handlers{ 140 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 141 | { 142 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 143 | if tc.IsRemoved() { 144 | t.FailNow() 145 | } 146 | tc.Remove() 147 | if !tc.IsRemoved() { 148 | t.FailNow() 149 | } 150 | return lolhtml.Continue 151 | }, 152 | }, 153 | }, 154 | }, 155 | ) 156 | if err != nil { 157 | t.Error(err) 158 | } 159 | 160 | if _, err := w.Write([]byte("0_0")); err != nil { 161 | t.Error(err) 162 | } 163 | if err := w.Close(); err != nil { 164 | t.Error(err) 165 | } 166 | wantedText := "" 167 | if finalText := buf.String(); finalText != wantedText { 168 | t.Errorf("want %s got %s \n", wantedText, finalText) 169 | } 170 | } 171 | 172 | func TestTextChunk_StopRewriting(t *testing.T) { 173 | var buf bytes.Buffer 174 | w, err := lolhtml.NewWriter( 175 | &buf, 176 | &lolhtml.Handlers{ 177 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 178 | { 179 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 180 | return lolhtml.Stop 181 | }, 182 | }, 183 | }, 184 | }, 185 | ) 186 | if err != nil { 187 | t.Error(err) 188 | } 189 | 190 | _, err = w.Write([]byte("42")) 191 | if err == nil { 192 | t.FailNow() 193 | } 194 | if err.Error() != "The rewriter has been stopped." 
{ 195 | t.Error(err) 196 | } 197 | } 198 | 199 | func TestTextChunk_StopRewritingWithSelector(t *testing.T) { 200 | var buf bytes.Buffer 201 | w, err := lolhtml.NewWriter( 202 | &buf, 203 | &lolhtml.Handlers{ 204 | ElementContentHandler: []lolhtml.ElementContentHandler{ 205 | { 206 | Selector: "*", 207 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 208 | return lolhtml.Stop 209 | }, 210 | }, 211 | }, 212 | }, 213 | ) 214 | if err != nil { 215 | t.Error(err) 216 | } 217 | 218 | _, err = w.Write([]byte("
42
")) 219 | if err == nil { 220 | t.FailNow() 221 | } 222 | if err.Error() != "The rewriter has been stopped." { 223 | t.Error(err) 224 | } 225 | } 226 | -------------------------------------------------------------------------------- /comment_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | func TestComment_GetSetText(t *testing.T) { 11 | var buf bytes.Buffer 12 | w, err := lolhtml.NewWriter( 13 | &buf, 14 | &lolhtml.Handlers{ 15 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 16 | { 17 | CommentHandler: func(comment *lolhtml.Comment) lolhtml.RewriterDirective { 18 | if text := comment.Text(); text != "Hey 42" { 19 | t.Errorf("wrong text %s\n", text) 20 | } 21 | if err := comment.SetText("Yo"); err != nil { 22 | t.Errorf("set text error %s\n", err) 23 | } 24 | return lolhtml.Continue 25 | }, 26 | }, 27 | }, 28 | }, 29 | ) 30 | if err != nil { 31 | t.Error(err) 32 | } 33 | 34 | if _, err = w.Write([]byte("")); err != nil { 35 | t.Error(err) 36 | } 37 | if err = w.Close(); err != nil { 38 | t.Error(err) 39 | } 40 | wantedText := "" 41 | if finalText := buf.String(); finalText != wantedText { 42 | t.Errorf("want %s got %s \n", wantedText, finalText) 43 | } 44 | } 45 | 46 | func TestComment_Replace(t *testing.T) { 47 | var buf bytes.Buffer 48 | w, err := lolhtml.NewWriter( 49 | &buf, 50 | &lolhtml.Handlers{ 51 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 52 | { 53 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 54 | if err := c.ReplaceAsHTML(""); err != nil { 55 | t.Error(err) 56 | } 57 | if !c.IsRemoved() { 58 | t.FailNow() 59 | } 60 | return lolhtml.Continue 61 | }, 62 | }, 63 | }, 64 | }, 65 | ) 66 | if err != nil { 67 | t.Error(err) 68 | } 69 | 70 | if _, err := w.Write([]byte("
")); err != nil { 71 | t.Error(err) 72 | } 73 | if err := w.Close(); err != nil { 74 | t.Error(err) 75 | } 76 | wantedText := "
" 77 | if finalText := buf.String(); finalText != wantedText { 78 | t.Errorf("want %s got %s \n", wantedText, finalText) 79 | } 80 | } 81 | 82 | func TestComment_InsertAfter(t *testing.T) { 83 | var buf bytes.Buffer 84 | w, err := lolhtml.NewWriter( 85 | &buf, 86 | &lolhtml.Handlers{ 87 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 88 | { 89 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 90 | if err := c.InsertAfterAsHTML(""); err != nil { 91 | t.Error(err) 92 | } 93 | return lolhtml.Continue 94 | }, 95 | }, 96 | }, 97 | }, 98 | ) 99 | if err != nil { 100 | t.Error(err) 101 | } 102 | 103 | if _, err := w.Write([]byte("
")); err != nil { 104 | t.Error(err) 105 | } 106 | if err := w.Close(); err != nil { 107 | t.Error(err) 108 | } 109 | wantedText := "
" 110 | if finalText := buf.String(); finalText != wantedText { 111 | t.Errorf("want %s got %s \n", wantedText, finalText) 112 | } 113 | } 114 | 115 | func TestComment_Remove(t *testing.T) { 116 | var buf bytes.Buffer 117 | w, err := lolhtml.NewWriter( 118 | &buf, 119 | &lolhtml.Handlers{ 120 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 121 | { 122 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 123 | if c.IsRemoved() { 124 | t.FailNow() 125 | } 126 | c.Remove() 127 | if !c.IsRemoved() { 128 | t.FailNow() 129 | } 130 | return lolhtml.Continue 131 | }, 132 | }, 133 | }, 134 | }, 135 | ) 136 | if err != nil { 137 | t.Error(err) 138 | } 139 | 140 | if _, err := w.Write([]byte("<>")); err != nil { 141 | t.Error(err) 142 | } 143 | if err := w.Close(); err != nil { 144 | t.Error(err) 145 | } 146 | wantedText := "<>" 147 | if finalText := buf.String(); finalText != wantedText { 148 | t.Errorf("want %s got %s \n", wantedText, finalText) 149 | } 150 | } 151 | 152 | func TestComment_InsertBeforeAndAfter(t *testing.T) { 153 | var buf bytes.Buffer 154 | w, err := lolhtml.NewWriter( 155 | &buf, 156 | &lolhtml.Handlers{ 157 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 158 | { 159 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 160 | if err := c.InsertBeforeAsHTML("
"); err != nil { 161 | t.Error(err) 162 | } 163 | if err := c.InsertAfterAsText("
"); err != nil { 164 | t.Error(err) 165 | } 166 | return lolhtml.Continue 167 | }, 168 | }, 169 | }, 170 | }, 171 | ) 172 | if err != nil { 173 | t.Error(err) 174 | } 175 | 176 | if _, err := w.Write([]byte("")); err != nil { 177 | t.Error(err) 178 | } 179 | if err := w.Close(); err != nil { 180 | t.Error(err) 181 | } 182 | wantedText := "
</div>" 183 | if finalText := buf.String(); finalText != wantedText { 184 | t.Errorf("want %s got %s \n", wantedText, finalText) 185 | } 186 | } 187 | 188 | func TestComment_StopRewriting(t *testing.T) { 189 | var buf bytes.Buffer 190 | w, err := lolhtml.NewWriter( 191 | &buf, 192 | &lolhtml.Handlers{ 193 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 194 | { 195 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 196 | return lolhtml.Stop 197 | }, 198 | }, 199 | }, 200 | }, 201 | ) 202 | if err != nil { 203 | t.Error(err) 204 | } 205 | 206 | _, err = w.Write([]byte("
")) 207 | if err == nil { 208 | t.FailNow() 209 | } 210 | if err.Error() != "The rewriter has been stopped." { 211 | t.Error(err) 212 | } 213 | } 214 | 215 | func TestComment_StopRewritingWithSelector(t *testing.T) { 216 | var buf bytes.Buffer 217 | w, err := lolhtml.NewWriter( 218 | &buf, 219 | &lolhtml.Handlers{ 220 | ElementContentHandler: []lolhtml.ElementContentHandler{ 221 | { 222 | Selector: "*", 223 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 224 | return lolhtml.Stop 225 | }, 226 | }, 227 | }, 228 | }, 229 | ) 230 | if err != nil { 231 | t.Error(err) 232 | } 233 | 234 | _, err = w.Write([]byte("
")) 235 | if err == nil { 236 | t.FailNow() 237 | } 238 | if err.Error() != "The rewriter has been stopped." { 239 | t.Error(err) 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /benchmark_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "path/filepath" 9 | "runtime" 10 | "testing" 11 | 12 | "github.com/coolspring8/go-lolhtml" 13 | ) 14 | 15 | const dataDir = "testdata" 16 | 17 | const ChunkSize = 1024 18 | 19 | func BenchmarkNewWriter(b *testing.B) { 20 | benchmarks := []struct { 21 | category string 22 | name string 23 | handlers *lolhtml.Handlers 24 | }{ 25 | { 26 | "Parsing", 27 | "TagScanner", 28 | nil, 29 | }, 30 | { 31 | "Parsing", 32 | "Lexer", 33 | &lolhtml.Handlers{ 34 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 35 | { 36 | DoctypeHandler: func(d *lolhtml.Doctype) lolhtml.RewriterDirective { 37 | return lolhtml.Continue 38 | }, 39 | }, 40 | }, 41 | }, 42 | }, 43 | { 44 | "Parsing", 45 | "TextRewritableUnitParsingAndDecoding", 46 | &lolhtml.Handlers{ 47 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 48 | { 49 | TextChunkHandler: func(c *lolhtml.TextChunk) lolhtml.RewriterDirective { 50 | return lolhtml.Continue 51 | }, 52 | }, 53 | }, 54 | }, 55 | }, 56 | { 57 | "Rewriting", 58 | "ModificationOfTagsOfAnElementWithLotsOfContent", 59 | &lolhtml.Handlers{ 60 | ElementContentHandler: []lolhtml.ElementContentHandler{ 61 | { 62 | Selector: "body", 63 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 64 | err := e.SetTagName("body1") 65 | if err != nil { 66 | b.Fatal(err) 67 | } 68 | err = e.InsertAfterEndTagAsText("test") 69 | if err != nil { 70 | b.Fatal(err) 71 | } 72 | return lolhtml.Continue 73 | }, 74 | }, 75 | }, 76 | }, 77 | }, 78 | { 79 | "Rewriting", 80 | "RemoveContentOfAnElement", 81 | &lolhtml.Handlers{ 82 | 
ElementContentHandler: []lolhtml.ElementContentHandler{ 83 | { 84 | Selector: "ul", 85 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 86 | err := e.SetInnerContentAsText("") 87 | if err != nil { 88 | b.Fatal(err) 89 | } 90 | return lolhtml.Continue 91 | }, 92 | }, 93 | }, 94 | }, 95 | }, 96 | { 97 | "SelectorMatching", 98 | "MatchAllSelector", 99 | &lolhtml.Handlers{ 100 | ElementContentHandler: []lolhtml.ElementContentHandler{ 101 | { 102 | Selector: "*", 103 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 104 | return lolhtml.Continue 105 | }, 106 | }, 107 | }, 108 | }, 109 | }, 110 | { 111 | "SelectorMatching", 112 | "TagNameSelector", 113 | &lolhtml.Handlers{ 114 | ElementContentHandler: []lolhtml.ElementContentHandler{ 115 | { 116 | Selector: "div", 117 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 118 | return lolhtml.Continue 119 | }, 120 | }, 121 | }, 122 | }, 123 | }, 124 | { 125 | "SelectorMatching", 126 | "ClassSelector", 127 | &lolhtml.Handlers{ 128 | ElementContentHandler: []lolhtml.ElementContentHandler{ 129 | { 130 | Selector: ".note", 131 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 132 | return lolhtml.Continue 133 | }, 134 | }, 135 | }, 136 | }, 137 | }, 138 | { 139 | "SelectorMatching", 140 | "AttributeSelector", 141 | &lolhtml.Handlers{ 142 | ElementContentHandler: []lolhtml.ElementContentHandler{ 143 | { 144 | Selector: "[href]", 145 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 146 | return lolhtml.Continue 147 | }, 148 | }, 149 | }, 150 | }, 151 | }, 152 | { 153 | "SelectorMatching", 154 | "MultipleSelectors", 155 | &lolhtml.Handlers{ 156 | ElementContentHandler: []lolhtml.ElementContentHandler{ 157 | { 158 | Selector: "ul", 159 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 160 | return lolhtml.Continue 161 | }, 162 | }, 163 | { 164 | Selector: "ul > li", 165 | ElementHandler: func(e 
*lolhtml.Element) lolhtml.RewriterDirective { 166 | return lolhtml.Continue 167 | }, 168 | }, 169 | { 170 | Selector: "table > tbody td dfn", 171 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 172 | return lolhtml.Continue 173 | }, 174 | }, 175 | { 176 | Selector: "body table > tbody tr", 177 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 178 | return lolhtml.Continue 179 | }, 180 | }, 181 | { 182 | Selector: "body [href]", 183 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 184 | return lolhtml.Continue 185 | }, 186 | }, 187 | { 188 | Selector: "div img", 189 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 190 | return lolhtml.Continue 191 | }, 192 | }, 193 | { 194 | Selector: "div.note span", 195 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 196 | return lolhtml.Continue 197 | }, 198 | }, 199 | }, 200 | }, 201 | }, 202 | } 203 | 204 | files, err := ioutil.ReadDir(dataDir) 205 | if err != nil { 206 | b.Fatal("benchmark data files not found", err) 207 | } 208 | 209 | for _, file := range files { 210 | data, err := ioutil.ReadFile(filepath.Join(dataDir, file.Name())) 211 | if err != nil { 212 | b.Fatal("cannot read benchmark data files", err) 213 | } 214 | 215 | for _, bm := range benchmarks { 216 | b.Run(fmt.Sprintf("%s-%s-%s", bm.category, bm.name, file.Name()), func(b *testing.B) { 217 | b.SetBytes(int64(len(data))) 218 | b.ReportAllocs() 219 | runtime.GC() 220 | b.ResetTimer() 221 | for i := 0; i < b.N; i++ { 222 | w, err := lolhtml.NewWriter(nil, bm.handlers) 223 | if err != nil { 224 | b.Fatal(err) 225 | } 226 | 227 | r := bytes.NewReader(data) 228 | copyBuf := make([]byte, ChunkSize) 229 | _, err = io.CopyBuffer(w, r, copyBuf) 230 | if err != nil { 231 | b.Fatal(err) 232 | } 233 | 234 | err = w.Close() 235 | if err != nil { 236 | b.Fatal(err) 237 | } 238 | } 239 | }) 240 | } 241 | } 242 | } 243 | 
-------------------------------------------------------------------------------- /examples/web-scraper/main.go: -------------------------------------------------------------------------------- 1 | // This is a ported Go version of https://web.scraper.workers.dev/, whose source code is 2 | // available at https://github.com/adamschwartz/web.scraper.workers.dev licensed under MIT. 3 | // 4 | // This translation is for demonstration purpose only, so many parts of the code are suboptimal. 5 | // 6 | // Sometimes you may get a "different" result, as Go's encoding/json package always sorts the 7 | // keys of a map (when using multiple selectors), and encodes a nil slice as the null JSON value. 8 | package main 9 | 10 | import ( 11 | "encoding/json" 12 | "fmt" 13 | "io" 14 | "log" 15 | "net/http" 16 | "regexp" 17 | "strings" 18 | 19 | "github.com/coolspring8/go-lolhtml" 20 | ) 21 | 22 | var ( 23 | debug = true 24 | listenAddress = ":80" 25 | mainPageFileName = "index.html" 26 | ) 27 | 28 | var ( 29 | urlHasPrefix = regexp.MustCompile(`^[a-zA-Z]+://`) 30 | unifyWhitespace = regexp.MustCompile(`\s{2,}`) 31 | ) 32 | 33 | // used to separate texts in different elements. 
34 | var textSeparator = "TEXT_SEPARATOR_TEXT_SEPARATOR" 35 | 36 | func main() { 37 | log.Printf("Server started at %s", listenAddress) 38 | http.HandleFunc("/", handler) 39 | log.Fatal(http.ListenAndServe(listenAddress, nil)) 40 | } 41 | 42 | func handler(w http.ResponseWriter, req *http.Request) { 43 | log.Println(req.URL) 44 | 45 | // 404 46 | if req.URL.Path != "/" { 47 | w.WriteHeader(http.StatusNotFound) 48 | _, _ = w.Write([]byte("Not found")) 49 | return 50 | } 51 | 52 | q := req.URL.Query() 53 | 54 | url := q.Get("url") 55 | if url != "" && !urlHasPrefix.MatchString(url) { 56 | url = "http://" + url 57 | } 58 | 59 | selector := q.Get("selector") 60 | 61 | attr := q.Get("attr") 62 | 63 | var spaced bool 64 | _spaced := q.Get("spaced") 65 | if _spaced != "" { 66 | spaced = true 67 | } else { 68 | spaced = false 69 | } 70 | 71 | var pretty bool 72 | _pretty := q.Get("pretty") 73 | if _pretty != "" { 74 | pretty = true 75 | } else { 76 | pretty = false 77 | } 78 | 79 | // home page 80 | if url == "" && selector == "" { 81 | http.ServeFile(w, req, mainPageFileName) 82 | return 83 | } 84 | 85 | // text or attr: get text, part 1/2 86 | handlers := lolhtml.Handlers{} 87 | // matches and selectors are used by text scraper 88 | matches := make(map[string][]string) 89 | var selectors []string 90 | _selectors := strings.Split(selector, ",") 91 | for _, s := range _selectors { 92 | selectors = append(selectors, strings.TrimSpace(s)) 93 | } 94 | // attrValue is used by attribute scraper 95 | var attrValue string 96 | if attr == "" { 97 | nextText := make(map[string]string) 98 | 99 | for _, s := range selectors { 100 | s := s 101 | handlers.ElementContentHandler = append( 102 | handlers.ElementContentHandler, 103 | lolhtml.ElementContentHandler{ 104 | Selector: s, 105 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 106 | matches[s] = append(matches[s], textSeparator) 107 | nextText[s] = "" 108 | return lolhtml.Continue 109 | }, 110 | 
TextChunkHandler: func(t *lolhtml.TextChunk) lolhtml.RewriterDirective { 111 | nextText[s] += t.Content() 112 | if t.IsLastInTextNode() { 113 | if spaced { 114 | nextText[s] += " " 115 | } 116 | matches[s] = append(matches[s], nextText[s]) 117 | nextText[s] = "" 118 | } 119 | return lolhtml.Continue 120 | }, 121 | }, 122 | ) 123 | } 124 | } else { 125 | handlers = lolhtml.Handlers{ 126 | ElementContentHandler: []lolhtml.ElementContentHandler{ 127 | { 128 | Selector: selector, 129 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 130 | attrValue, _ = e.AttributeValue(attr) 131 | return lolhtml.Stop 132 | }, 133 | }, 134 | }, 135 | } 136 | } 137 | 138 | lolWriter, err := lolhtml.NewWriter( 139 | nil, 140 | &handlers, 141 | ) 142 | if err != nil { 143 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 144 | return 145 | } 146 | 147 | // fetch target page content 148 | resp, err := http.Get(url) 149 | if err != nil { 150 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 151 | return 152 | } 153 | if resp.StatusCode != http.StatusOK { 154 | sendError(w, http.StatusBadGateway, fmt.Sprintf("Status %d requesting %s", resp.StatusCode, url), pretty) 155 | return 156 | } 157 | defer resp.Body.Close() 158 | 159 | // might be confusing 160 | _, err = io.Copy(lolWriter, resp.Body) 161 | if err != nil && err.Error() != "The rewriter has been stopped." { 162 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 163 | return 164 | } 165 | if err == nil || err.Error() != "The rewriter has been stopped." 
{ 166 | err = lolWriter.Close() 167 | if err != nil { 168 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 169 | return 170 | } 171 | } 172 | 173 | // text or attr: post-process texts, part 2/2 174 | if attr == "" { 175 | for _, s := range selectors { 176 | var nodeCompleteTexts []string 177 | nextText := "" 178 | 179 | for _, text := range matches[s] { 180 | if text == textSeparator { 181 | if strings.TrimSpace(nextText) != "" { 182 | nodeCompleteTexts = append(nodeCompleteTexts, cleanText(nextText)) 183 | nextText = "" 184 | } 185 | } else { 186 | nextText += text 187 | } 188 | } 189 | 190 | lastText := cleanText(nextText) 191 | if lastText != "" { 192 | nodeCompleteTexts = append(nodeCompleteTexts, lastText) 193 | } 194 | matches[s] = nodeCompleteTexts 195 | } 196 | } 197 | 198 | w.WriteHeader(http.StatusOK) 199 | 200 | enc := json.NewEncoder(w) 201 | enc.SetEscapeHTML(false) 202 | if pretty { 203 | enc.SetIndent("", " ") 204 | } 205 | 206 | if attr == "" { 207 | err = enc.Encode(Response{Result: matches}) 208 | } else { 209 | err = enc.Encode(Response{Result: attrValue}) 210 | } 211 | if err != nil { 212 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 213 | return 214 | } 215 | } 216 | 217 | type Response struct { 218 | Result interface{} `json:"result,omitempty"` 219 | Error string `json:"error,omitempty"` 220 | } 221 | 222 | func sendError(w http.ResponseWriter, statusCode int, errorText string, pretty bool) { 223 | w.WriteHeader(statusCode) 224 | 225 | enc := json.NewEncoder(w) 226 | enc.SetEscapeHTML(false) 227 | if pretty { 228 | enc.SetIndent("", " ") 229 | } 230 | 231 | // redact concrete error message if debug != true 232 | if !debug && statusCode == http.StatusInternalServerError { 233 | errorText = "Internal server error" 234 | } 235 | 236 | err := enc.Encode(Response{Error: errorText}) 237 | if err != nil { 238 | _, _ = w.Write([]byte(errorText)) 239 | } 240 | } 241 | 242 | func cleanText(s string) string { 243 
| return unifyWhitespace.ReplaceAllString(strings.TrimSpace(s), " ") 244 | } 245 | -------------------------------------------------------------------------------- /element.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import ( 9 | "errors" 10 | "unsafe" 11 | ) 12 | 13 | // Element represents an HTML element. 14 | type Element C.lol_html_element_t 15 | 16 | // ElementHandlerFunc is a callback handler function to do something with an Element. 17 | type ElementHandlerFunc func(*Element) RewriterDirective 18 | 19 | // TagName gets the element's tag name. 20 | func (e *Element) TagName() string { 21 | tagNameC := (str)(C.lol_html_element_tag_name_get((*C.lol_html_element_t)(e))) 22 | defer tagNameC.Free() 23 | return tagNameC.String() 24 | } 25 | 26 | // SetTagName sets the element's tag name. 27 | func (e *Element) SetTagName(name string) error { 28 | nameC := C.CString(name) 29 | defer C.free(unsafe.Pointer(nameC)) 30 | nameLen := len(name) 31 | errCode := C.lol_html_element_tag_name_set((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen)) 32 | if errCode == 0 { 33 | return nil 34 | } 35 | return getError() 36 | } 37 | 38 | // NamespaceURI gets the element's namespace URI. 39 | func (e *Element) NamespaceURI() string { 40 | // don't need to be freed 41 | namespaceURIC := C.lol_html_element_namespace_uri_get((*C.lol_html_element_t)(e)) 42 | return C.GoString(namespaceURIC) 43 | } 44 | 45 | // AttributeIterator returns a pointer to an AttributeIterator. Can be used to iterate 46 | // over all attributes of the element. 47 | func (e *Element) AttributeIterator() *AttributeIterator { 48 | return (*AttributeIterator)(C.lol_html_attributes_iterator_get((*C.lol_html_element_t)(e))) 49 | } 50 | 51 | // AttributeValue returns the value of the attribute on this element. 
52 | func (e *Element) AttributeValue(name string) (string, error) { 53 | nameC := C.CString(name) 54 | defer C.free(unsafe.Pointer(nameC)) 55 | nameLen := len(name) 56 | valueC := (*str)(C.lol_html_element_get_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen))) 57 | defer valueC.Free() 58 | // always check error, so not using getError() 59 | errC := (*str)(C.lol_html_take_last_error()) 60 | defer errC.Free() 61 | errMsg := errC.String() 62 | if errMsg != "" { 63 | return "", errors.New(errMsg) 64 | } 65 | return valueC.String(), nil 66 | } 67 | 68 | // HasAttribute returns whether the element has the attribute of this name or not. 69 | func (e *Element) HasAttribute(name string) (bool, error) { 70 | nameC := C.CString(name) 71 | defer C.free(unsafe.Pointer(nameC)) 72 | nameLen := len(name) 73 | codeC := C.lol_html_element_has_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen)) 74 | if codeC == 1 { 75 | return true, nil 76 | } else if codeC == 0 { 77 | return false, nil 78 | } 79 | return false, getError() 80 | } 81 | 82 | // SetAttribute updates or creates the attribute with name and value on the element. 83 | func (e *Element) SetAttribute(name string, value string) error { 84 | nameC := C.CString(name) 85 | defer C.free(unsafe.Pointer(nameC)) 86 | nameLen := len(name) 87 | valueC := C.CString(value) 88 | defer C.free(unsafe.Pointer(valueC)) 89 | valueLen := len(value) 90 | errCode := C.lol_html_element_set_attribute( 91 | (*C.lol_html_element_t)(e), 92 | nameC, 93 | C.size_t(nameLen), 94 | valueC, 95 | C.size_t(valueLen), 96 | ) 97 | if errCode == 0 { 98 | return nil 99 | } 100 | return getError() 101 | } 102 | 103 | // RemoveAttribute removes the attribute with the name from the element. 
104 | func (e *Element) RemoveAttribute(name string) error { 105 | nameC := C.CString(name) 106 | defer C.free(unsafe.Pointer(nameC)) 107 | nameLen := len(name) 108 | errCode := C.lol_html_element_remove_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen)) 109 | if errCode == 0 { 110 | return nil 111 | } 112 | return getError() 113 | } 114 | 115 | type elementAlter int 116 | 117 | const ( 118 | elementInsertBeforeStartTag elementAlter = iota 119 | elementInsertAfterStartTag 120 | elementInsertBeforeEndTag 121 | elementInsertAfterEndTag 122 | elementSetInnerContent 123 | elementReplace 124 | ) 125 | 126 | func (e *Element) alter(content string, alter elementAlter, isHTML bool) error { 127 | contentC := C.CString(content) 128 | defer C.free(unsafe.Pointer(contentC)) 129 | contentLen := len(content) 130 | var errCode C.int 131 | switch alter { 132 | case elementInsertBeforeStartTag: 133 | errCode = C.lol_html_element_before((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 134 | case elementInsertAfterStartTag: 135 | errCode = C.lol_html_element_prepend((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 136 | case elementInsertBeforeEndTag: 137 | errCode = C.lol_html_element_append((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 138 | case elementInsertAfterEndTag: 139 | errCode = C.lol_html_element_after((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 140 | case elementSetInnerContent: 141 | errCode = C.lol_html_element_set_inner_content((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 142 | case elementReplace: 143 | errCode = C.lol_html_element_replace((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 144 | default: 145 | panic("not implemented") 146 | } 147 | if errCode == 0 { 148 | return nil 149 | } 150 | return getError() 151 | } 152 | 153 | // InsertBeforeStartTagAsText inserts the given 
content before the element's start tag. 154 | // 155 | // The rewriter will HTML-escape the content before insertion: 156 | // 157 | // `<` will be replaced with `<` 158 | // 159 | // `>` will be replaced with `>` 160 | // 161 | // `&` will be replaced with `&` 162 | func (e *Element) InsertBeforeStartTagAsText(content string) error { 163 | return e.alter(content, elementInsertBeforeStartTag, false) 164 | } 165 | 166 | // InsertBeforeStartTagAsHTML inserts the given content before the element's start tag. 167 | // The content is inserted as is. 168 | func (e *Element) InsertBeforeStartTagAsHTML(content string) error { 169 | return e.alter(content, elementInsertBeforeStartTag, true) 170 | } 171 | 172 | // InsertAfterStartTagAsText inserts (prepend) the given content after the element's start tag. 173 | // 174 | // The rewriter will HTML-escape the content before insertion: 175 | // 176 | // `<` will be replaced with `<` 177 | // 178 | // `>` will be replaced with `>` 179 | // 180 | // `&` will be replaced with `&` 181 | func (e *Element) InsertAfterStartTagAsText(content string) error { 182 | return e.alter(content, elementInsertAfterStartTag, false) 183 | } 184 | 185 | // InsertAfterStartTagAsHTML inserts (prepend) the given content after the element's start tag. 186 | // The content is inserted as is. 187 | func (e *Element) InsertAfterStartTagAsHTML(content string) error { 188 | return e.alter(content, elementInsertAfterStartTag, true) 189 | } 190 | 191 | // InsertBeforeEndTagAsText inserts (append) the given content after the element's end tag. 
192 | // 193 | // The rewriter will HTML-escape the content before insertion: 194 | // 195 | // `<` will be replaced with `<` 196 | // 197 | // `>` will be replaced with `>` 198 | // 199 | // `&` will be replaced with `&` 200 | func (e *Element) InsertBeforeEndTagAsText(content string) error { 201 | return e.alter(content, elementInsertBeforeEndTag, false) 202 | } 203 | 204 | // InsertBeforeEndTagAsHTML inserts (append) the given content before the element's end tag. 205 | // The content is inserted as is. 206 | func (e *Element) InsertBeforeEndTagAsHTML(content string) error { 207 | return e.alter(content, elementInsertBeforeEndTag, true) 208 | } 209 | 210 | // InsertAfterEndTagAsText inserts the given content after the element's end tag. 211 | // 212 | // The rewriter will HTML-escape the content before insertion: 213 | // 214 | // `<` will be replaced with `<` 215 | // 216 | // `>` will be replaced with `>` 217 | // 218 | // `&` will be replaced with `&` 219 | func (e *Element) InsertAfterEndTagAsText(content string) error { 220 | return e.alter(content, elementInsertAfterEndTag, false) 221 | } 222 | 223 | // InsertAfterEndTagAsHTML inserts the given content after the element's end tag. 224 | // The content is inserted as is. 225 | func (e *Element) InsertAfterEndTagAsHTML(content string) error { 226 | return e.alter(content, elementInsertAfterEndTag, true) 227 | } 228 | 229 | // SetInnerContentAsText overwrites the element's inner content. 230 | // 231 | // The rewriter will HTML-escape the content: 232 | // 233 | // `<` will be replaced with `<` 234 | // 235 | // `>` will be replaced with `>` 236 | // 237 | // `&` will be replaced with `&` 238 | func (e *Element) SetInnerContentAsText(content string) error { 239 | return e.alter(content, elementSetInnerContent, false) 240 | } 241 | 242 | // SetInnerContentAsHTML overwrites the element's inner content. 243 | // The content is kept as is. 
244 | func (e *Element) SetInnerContentAsHTML(content string) error { 245 | return e.alter(content, elementSetInnerContent, true) 246 | } 247 | 248 | // ReplaceAsText replace the whole element with the supplied content. 249 | // 250 | // The rewriter will HTML-escape the content: 251 | // 252 | // `<` will be replaced with `<` 253 | // 254 | // `>` will be replaced with `>` 255 | // 256 | // `&` will be replaced with `&` 257 | func (e *Element) ReplaceAsText(content string) error { 258 | return e.alter(content, elementReplace, false) 259 | } 260 | 261 | // ReplaceAsHTML replace the whole element with the supplied content. 262 | // The content is kept as is. 263 | func (e *Element) ReplaceAsHTML(content string) error { 264 | return e.alter(content, elementReplace, true) 265 | } 266 | 267 | // Remove completely removes the element. 268 | func (e *Element) Remove() { 269 | C.lol_html_element_remove((*C.lol_html_element_t)(e)) 270 | } 271 | 272 | // RemoveAndKeepContent removes the element but keeps the inner content. 273 | func (e *Element) RemoveAndKeepContent() { 274 | C.lol_html_element_remove_and_keep_content((*C.lol_html_element_t)(e)) 275 | } 276 | 277 | // IsRemoved returns whether the element is removed or not. 
278 | func (e *Element) IsRemoved() bool { 279 | return (bool)(C.lol_html_element_is_removed((*C.lol_html_element_t)(e))) 280 | } 281 | -------------------------------------------------------------------------------- /element_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | func TestElement_ModifyTagName(t *testing.T) { 11 | var buf bytes.Buffer 12 | w, err := lolhtml.NewWriter( 13 | &buf, 14 | &lolhtml.Handlers{ 15 | ElementContentHandler: []lolhtml.ElementContentHandler{ 16 | { 17 | Selector: "*", 18 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 19 | wantName := "div" 20 | if name := e.TagName(); name != wantName { 21 | t.Errorf("got %s want %s\n", name, wantName) 22 | } 23 | err := e.SetTagName("") 24 | if err == nil { 25 | t.FailNow() 26 | } 27 | if err.Error() != "Tag name can't be empty." { 28 | t.Error(err) 29 | } 30 | if err = e.SetTagName("span"); err != nil { 31 | t.Error(err) 32 | } 33 | return lolhtml.Continue 34 | }, 35 | }, 36 | }, 37 | }, 38 | ) 39 | if err != nil { 40 | t.Error(err) 41 | } 42 | 43 | if _, err = w.Write([]byte("Hi
")); err != nil { 44 | t.Error(err) 45 | } 46 | if err = w.Close(); err != nil { 47 | t.Error(err) 48 | } 49 | wantedText := "Hi " 50 | if finalText := buf.String(); finalText != wantedText { 51 | t.Errorf("want %s got %s \n", wantedText, finalText) 52 | } 53 | } 54 | 55 | func TestElement_ModifyAttributes(t *testing.T) { 56 | var buf bytes.Buffer 57 | w, err := lolhtml.NewWriter( 58 | &buf, 59 | &lolhtml.Handlers{ 60 | ElementContentHandler: []lolhtml.ElementContentHandler{ 61 | { 62 | Selector: "*", 63 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 64 | has, err := e.HasAttribute("foo") 65 | if err != nil { 66 | t.Error(err) 67 | } 68 | if !has { 69 | t.FailNow() 70 | } 71 | has, err = e.HasAttribute("Bar") 72 | if err != nil { 73 | t.Error(err) 74 | } 75 | if has { 76 | t.FailNow() 77 | } 78 | 79 | a, err := e.AttributeValue("foo") 80 | if err != nil { 81 | t.Error(err) 82 | } 83 | wantValue := "42" 84 | if a != wantValue { 85 | t.Errorf("got %s; want %s", a, wantValue) 86 | } 87 | a, err = e.AttributeValue("Bar") 88 | if err != nil { 89 | t.Error(err) 90 | } 91 | if a != "" { 92 | t.Errorf("got %s; want empty", a) 93 | } 94 | 95 | if err := e.SetAttribute("Bar", "hey"); err != nil { 96 | t.Error(err) 97 | } 98 | 99 | if err := e.RemoveAttribute("foo"); err != nil { 100 | t.Error(err) 101 | } 102 | 103 | return lolhtml.Continue 104 | }, 105 | }, 106 | }, 107 | }, 108 | ) 109 | if err != nil { 110 | t.Error(err) 111 | } 112 | 113 | if _, err = w.Write([]byte("")); err != nil { 114 | t.Error(err) 115 | } 116 | if err = w.Close(); err != nil { 117 | t.Error(err) 118 | } 119 | wantedText := "" 120 | if finalText := buf.String(); finalText != wantedText { 121 | t.Errorf("want %s got %s \n", wantedText, finalText) 122 | } 123 | } 124 | 125 | func TestElement_InsertContentAroundElement(t *testing.T) { 126 | var buf bytes.Buffer 127 | w, err := lolhtml.NewWriter( 128 | &buf, 129 | &lolhtml.Handlers{ 130 | ElementContentHandler: 
[]lolhtml.ElementContentHandler{ 131 | { 132 | Selector: "*", 133 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 134 | if err := e.InsertBeforeStartTagAsText("&before"); err != nil { 135 | t.Error(err) 136 | } 137 | if err := e.InsertAfterStartTagAsHTML(""); err != nil { 138 | t.Error(err) 139 | } 140 | if err := e.InsertBeforeEndTagAsHTML(""); err != nil { 141 | t.Error(err) 142 | } 143 | if err := e.InsertAfterEndTagAsText("&after"); err != nil { 144 | t.Error(err) 145 | } 146 | return lolhtml.Continue 147 | }, 148 | }, 149 | }, 150 | }, 151 | ) 152 | if err != nil { 153 | t.Error(err) 154 | } 155 | 156 | if _, err = w.Write([]byte("
Hi
")); err != nil { 157 | t.Error(err) 158 | } 159 | if err = w.Close(); err != nil { 160 | t.Error(err) 161 | } 162 | wantedText := "&before
Hi
&after" 163 | if finalText := buf.String(); finalText != wantedText { 164 | t.Errorf("want %s got %s \n", wantedText, finalText) 165 | } 166 | } 167 | 168 | func TestElement_SetInnerContent(t *testing.T) { 169 | var buf bytes.Buffer 170 | w, err := lolhtml.NewWriter( 171 | &buf, 172 | &lolhtml.Handlers{ 173 | ElementContentHandler: []lolhtml.ElementContentHandler{ 174 | { 175 | Selector: "div", 176 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 177 | if err := e.SetInnerContentAsText("hey & ya"); err != nil { 178 | t.Error(err) 179 | } 180 | return lolhtml.Continue 181 | }, 182 | }, 183 | }, 184 | }, 185 | ) 186 | if err != nil { 187 | t.Error(err) 188 | } 189 | 190 | if _, err = w.Write([]byte("
42
")); err != nil { 191 | t.Error(err) 192 | } 193 | if err = w.Close(); err != nil { 194 | t.Error(err) 195 | } 196 | wantedText := "
hey & ya
" 197 | if finalText := buf.String(); finalText != wantedText { 198 | t.Errorf("want %s got %s \n", wantedText, finalText) 199 | } 200 | } 201 | 202 | func TestElement_Replace(t *testing.T) { 203 | var buf bytes.Buffer 204 | w, err := lolhtml.NewWriter( 205 | &buf, 206 | &lolhtml.Handlers{ 207 | ElementContentHandler: []lolhtml.ElementContentHandler{ 208 | { 209 | Selector: "div", 210 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 211 | if err := e.ReplaceAsHTML("hey & ya"); err != nil { 212 | t.Error(err) 213 | } 214 | return lolhtml.Continue 215 | }, 216 | }, 217 | }, 218 | }, 219 | ) 220 | if err != nil { 221 | t.Error(err) 222 | } 223 | 224 | if _, err = w.Write([]byte("
42

Hello
good bye

Hello2

")); err != nil { 225 | t.Error(err) 226 | } 227 | if err = w.Close(); err != nil { 228 | t.Error(err) 229 | } 230 | wantedText := "hey & ya

Hellohey & ya

Hello2

" 231 | if finalText := buf.String(); finalText != wantedText { 232 | t.Errorf("want %s got %s \n", wantedText, finalText) 233 | } 234 | } 235 | 236 | func TestElement_Remove(t *testing.T) { 237 | var buf bytes.Buffer 238 | w, err := lolhtml.NewWriter( 239 | &buf, 240 | &lolhtml.Handlers{ 241 | ElementContentHandler: []lolhtml.ElementContentHandler{ 242 | { 243 | Selector: "h1", 244 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 245 | if e.IsRemoved() { 246 | t.FailNow() 247 | } 248 | e.Remove() 249 | if !e.IsRemoved() { 250 | t.FailNow() 251 | } 252 | return lolhtml.Continue 253 | }, 254 | }, 255 | }, 256 | }, 257 | ) 258 | if err != nil { 259 | t.Error(err) 260 | } 261 | 262 | if _, err = w.Write([]byte("
42

Hello

Hello2

")); err != nil { 263 | t.Error(err) 264 | } 265 | if err = w.Close(); err != nil { 266 | t.Error(err) 267 | } 268 | wantedText := "
42

Hello2

" 269 | if finalText := buf.String(); finalText != wantedText { 270 | t.Errorf("want %s got %s \n", wantedText, finalText) 271 | } 272 | } 273 | 274 | func TestElement_RemoveElementAndKeepContent(t *testing.T) { 275 | var buf bytes.Buffer 276 | w, err := lolhtml.NewWriter( 277 | &buf, 278 | &lolhtml.Handlers{ 279 | ElementContentHandler: []lolhtml.ElementContentHandler{ 280 | { 281 | Selector: "h2", 282 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 283 | if e.IsRemoved() { 284 | t.FailNow() 285 | } 286 | e.RemoveAndKeepContent() 287 | if !e.IsRemoved() { 288 | t.FailNow() 289 | } 290 | return lolhtml.Continue 291 | }, 292 | }, 293 | }, 294 | }, 295 | ) 296 | if err != nil { 297 | t.Error(err) 298 | } 299 | 300 | if _, err = w.Write([]byte("
42

Hello1

Hello

Hello2

")); err != nil { 301 | t.Error(err) 302 | } 303 | if err = w.Close(); err != nil { 304 | t.Error(err) 305 | } 306 | wantedText := "
42Hello1

Hello

Hello2" 307 | if finalText := buf.String(); finalText != wantedText { 308 | t.Errorf("want %s got %s \n", wantedText, finalText) 309 | } 310 | } 311 | 312 | func TestElement_GetEmptyElementAttribute(t *testing.T) { 313 | var buf bytes.Buffer 314 | w, err := lolhtml.NewWriter( 315 | &buf, 316 | &lolhtml.Handlers{ 317 | ElementContentHandler: []lolhtml.ElementContentHandler{ 318 | { 319 | Selector: "span", 320 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 321 | has, err := e.HasAttribute("foo") 322 | if err != nil { 323 | t.Error(err) 324 | } 325 | if !has { 326 | t.FailNow() 327 | } 328 | value, err := e.AttributeValue("foo") 329 | if err != nil { 330 | t.Error(err) 331 | } 332 | if value != "" { 333 | t.Errorf("got %s; want empty", value) 334 | } 335 | return lolhtml.Continue 336 | }, 337 | }, 338 | }, 339 | }, 340 | ) 341 | if err != nil { 342 | t.Error(err) 343 | } 344 | 345 | if _, err = w.Write([]byte("")); err != nil { 346 | t.Error(err) 347 | } 348 | if err = w.Close(); err != nil { 349 | t.Error(err) 350 | } 351 | wantedText := "" 352 | if finalText := buf.String(); finalText != wantedText { 353 | t.Errorf("want %s got %s \n", wantedText, finalText) 354 | } 355 | } 356 | 357 | func TestElement_IterateAttributes(t *testing.T) { 358 | w, err := lolhtml.NewWriter( 359 | nil, 360 | &lolhtml.Handlers{ 361 | ElementContentHandler: []lolhtml.ElementContentHandler{ 362 | { 363 | Selector: "*", 364 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 365 | ai := e.AttributeIterator() 366 | 367 | a := ai.Next() 368 | if name := a.Name(); name != "foo" { 369 | t.Errorf("got %s; want foo", name) 370 | } 371 | if value := a.Value(); value != "42" { 372 | t.Errorf("got %s; want foo", value) 373 | } 374 | 375 | a = ai.Next() 376 | if name := a.Name(); name != "bar" { 377 | t.Errorf("got %s; want bar", name) 378 | } 379 | if value := a.Value(); value != "1337" { 380 | t.Errorf("got %s; want 1337", value) 381 | } 382 | 383 | a = 
ai.Next() 384 | if a != nil { 385 | t.FailNow() 386 | } 387 | 388 | return lolhtml.Continue 389 | }, 390 | }, 391 | }, 392 | }, 393 | ) 394 | if err != nil { 395 | t.Error(err) 396 | } 397 | 398 | if _, err = w.Write([]byte("
")); err != nil { 399 | t.Error(err) 400 | } 401 | if err = w.Close(); err != nil { 402 | t.Error(err) 403 | } 404 | } 405 | 406 | func TestElement_AssertNsIsHtml(t *testing.T) { 407 | w, err := lolhtml.NewWriter( 408 | nil, 409 | &lolhtml.Handlers{ 410 | ElementContentHandler: []lolhtml.ElementContentHandler{ 411 | { 412 | Selector: "script", 413 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 414 | wantedText := "http://www.w3.org/1999/xhtml" 415 | if ns := e.NamespaceURI(); ns != wantedText { 416 | t.Errorf("got %s; want %s", ns, wantedText) 417 | } 418 | return lolhtml.Continue 419 | }, 420 | }, 421 | }, 422 | }, 423 | ) 424 | if err != nil { 425 | t.Error(err) 426 | } 427 | 428 | if _, err = w.Write([]byte("")); err != nil { 429 | t.Error(err) 430 | } 431 | if err = w.Close(); err != nil { 432 | t.Error(err) 433 | } 434 | } 435 | 436 | func TestElement_AssertNsIsSvg(t *testing.T) { 437 | w, err := lolhtml.NewWriter( 438 | nil, 439 | &lolhtml.Handlers{ 440 | ElementContentHandler: []lolhtml.ElementContentHandler{ 441 | { 442 | Selector: "script", 443 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 444 | wantedText := "http://www.w3.org/2000/svg" 445 | if ns := e.NamespaceURI(); ns != wantedText { 446 | t.Errorf("got %s; want %s", ns, wantedText) 447 | } 448 | return lolhtml.Continue 449 | }, 450 | }, 451 | }, 452 | }, 453 | ) 454 | if err != nil { 455 | t.Error(err) 456 | } 457 | 458 | if _, err = w.Write([]byte("")); err != nil { 459 | t.Error(err) 460 | } 461 | if err = w.Close(); err != nil { 462 | t.Error(err) 463 | } 464 | } 465 | 466 | func TestElement_StopRewriting(t *testing.T) { 467 | w, err := lolhtml.NewWriter( 468 | nil, 469 | &lolhtml.Handlers{ 470 | ElementContentHandler: []lolhtml.ElementContentHandler{ 471 | { 472 | Selector: "span", 473 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 474 | return lolhtml.Stop 475 | }, 476 | }, 477 | }, 478 | }, 479 | ) 480 | if err != nil { 
481 | t.Error(err) 482 | } 483 | 484 | _, err = w.Write([]byte("")) 485 | if err == nil { 486 | t.FailNow() 487 | } 488 | if err.Error() != "The rewriter has been stopped." { 489 | t.Error(err) 490 | } 491 | err = w.Close() 492 | if err == nil { 493 | t.FailNow() 494 | } 495 | if err.Error() != "The rewriter has been stopped." { 496 | t.Error(err) 497 | } 498 | } 499 | -------------------------------------------------------------------------------- /examples/web-scraper/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Web Scraper · By Adam Schwartz · Powered by Cloudflare Workers® 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 238 | 239 | 240 |
241 |
242 |
243 |

244 | 245 |
246 |
247 |
248 |
249 |
250 | 251 |
252 | 253 |
254 |
255 |
256 | 257 |
258 | 259 |
260 |
261 |
262 |
263 | 264 | 265 |
266 |
267 | 268 | 269 |
270 |
271 |
272 |
273 |
274 | 275 | 276 |
277 |
278 |
279 |
280 | 281 |
282 | 283 |
284 |
285 |
286 | 287 | 288 |
289 |
290 |
291 |
292 |
293 |
294 | 295 | Permalink 296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 | 323 |
324 |
325 | 458 | 459 | --------------------------------------------------------------------------------