├── .gitattributes ├── .github └── workflows │ ├── build_static_lib.yml │ ├── go.yml │ └── golangci-lint.yml ├── .gitignore ├── .gitmodules ├── .idea ├── .gitignore ├── codeStyles │ ├── Project.xml │ └── codeStyleConfig.xml ├── inspectionProfiles │ └── Project_Default.xml ├── misc.xml ├── modules.xml ├── vcs.xml └── watcherTasks.xml ├── LICENSE ├── README.md ├── attribute.go ├── benchmark_test.go ├── build ├── LICENSE ├── include │ └── lol_html.h ├── linux-x86_64 │ └── liblolhtml.a ├── macos-x86_64 │ └── liblolhtml.a └── windows-x86_64 │ └── liblolhtml.a ├── builder.go ├── builder_test.go ├── callback.go ├── comment.go ├── comment_test.go ├── config.go ├── const.go ├── doctype.go ├── doctype_test.go ├── documentend.go ├── documentend_test.go ├── element.go ├── element_test.go ├── error.go ├── error_test.go ├── example_test.go ├── examples ├── defer-scripts │ └── main.go ├── mixed-content-rewriter │ └── main.go └── web-scraper │ ├── index.html │ └── main.go ├── export_test.go ├── go.mod ├── go.sum ├── lolhtml.go ├── pointer.go ├── rewriter.go ├── rewriter_test.go ├── selector.go ├── selector_test.go ├── string.go ├── testdata ├── cloudflare.com.html ├── ecma402-spec.html └── html-parsing-spec.html ├── textchunk.go ├── textchunk_test.go └── writer.go /.gitattributes: -------------------------------------------------------------------------------- 1 | build/* linguist-vendored 2 | examples/web-scraper/index.html linguist-vendored 3 | -------------------------------------------------------------------------------- /.github/workflows/build_static_lib.yml: -------------------------------------------------------------------------------- 1 | name: Build static library 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build: 8 | name: Build lol_html crate 9 | runs-on: ${{ matrix.os }} 10 | strategy: 11 | matrix: 12 | include: 13 | - build: linux-x86_64 14 | os: ubuntu-latest 15 | - build: macos-x86_64 16 | os: macos-latest 17 | - build: windows-x86_64 18 | os: 
windows-latest 19 | target: x86_64-pc-windows-gnu 20 | steps: 21 | - uses: actions/checkout@v2 22 | with: 23 | repository: "cloudflare/lol-html" 24 | 25 | - name: Configure Cargo target 26 | run: | 27 | echo CARGO_BUILD_TARGET=${{ matrix.target }} >> $GITHUB_ENV 28 | rustup target add ${{ matrix.target }} 29 | if: matrix.target != '' 30 | 31 | - run: cargo build --lib --release --manifest-path c-api/Cargo.toml 32 | if: matrix.os != 'windows-latest' 33 | - run: cargo build --lib --release --target ${{ matrix.target }} --manifest-path c-api/Cargo.toml 34 | if: matrix.os == 'windows-latest' 35 | 36 | - run: mkdir dist 37 | 38 | - run: cp c-api/target/release/liblolhtml.a dist 39 | if: matrix.os != 'windows-latest' 40 | - run: cp c-api/target/${{ matrix.target }}/release/liblolhtml.a dist 41 | if: matrix.build == 'windows-x86_64' 42 | 43 | - uses: actions/upload-artifact@v2 44 | with: 45 | name: ${{ matrix.build }} 46 | path: dist 47 | 48 | - uses: actions/upload-artifact@v2 49 | with: 50 | name: include 51 | path: c-api/include 52 | if: matrix.os == 'ubuntu-latest' 53 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | 11 | build: 12 | name: Build 13 | runs-on: ubuntu-latest 14 | steps: 15 | 16 | - name: Set up Go 1.x 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: ^1.13 20 | id: go 21 | 22 | - name: Check out code into the Go module directory 23 | uses: actions/checkout@v2 24 | 25 | - name: Get dependencies 26 | run: | 27 | go get -v -t -d ./... 28 | if [ -f Gopkg.toml ]; then 29 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh 30 | dep ensure 31 | fi 32 | 33 | - name: Build 34 | run: go build -v . 35 | 36 | - name: Test 37 | run: go test -v . 
-race -coverprofile coverage.txt --covermode atomic 38 | 39 | - name: Upload reports to Codecov 40 | run: bash <(curl -s https://codecov.io/bash) 41 | -------------------------------------------------------------------------------- /.github/workflows/golangci-lint.yml: -------------------------------------------------------------------------------- 1 | name: golangci-lint 2 | on: 3 | push: 4 | tags: 5 | - v* 6 | branches: 7 | - main 8 | pull_request: 9 | jobs: 10 | golangci: 11 | name: lint 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: golangci-lint 16 | uses: golangci/golangci-lint-action@v2 17 | with: 18 | version: v1.32 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # JetBrains.gitignore 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 4 | 5 | # User-specific stuff 6 | .idea/**/workspace.xml 7 | .idea/**/tasks.xml 8 | .idea/**/usage.statistics.xml 9 | .idea/**/dictionaries 10 | .idea/**/shelf 11 | 12 | # Generated files 13 | .idea/**/contentModel.xml 14 | 15 | # Sensitive or high-churn files 16 | .idea/**/dataSources/ 17 | .idea/**/dataSources.ids 18 | .idea/**/dataSources.local.xml 19 | .idea/**/sqlDataSources.xml 20 | .idea/**/dynamic.xml 21 | .idea/**/uiDesigner.xml 22 | .idea/**/dbnavigator.xml 23 | 24 | # Gradle 25 | .idea/**/gradle.xml 26 | .idea/**/libraries 27 | 28 | # Gradle and Maven with auto-import 29 | # When using Gradle or Maven with auto-import, you should exclude module files, 30 | # since they will be recreated, and may cause churn. Uncomment if using 31 | # auto-import. 
32 | # .idea/artifacts 33 | # .idea/compiler.xml 34 | # .idea/jarRepositories.xml 35 | # .idea/modules.xml 36 | # .idea/*.iml 37 | # .idea/modules 38 | *.iml 39 | # *.ipr 40 | 41 | # CMake 42 | cmake-build-*/ 43 | 44 | # Mongo Explorer plugin 45 | .idea/**/mongoSettings.xml 46 | 47 | # File-based project format 48 | *.iws 49 | 50 | # IntelliJ 51 | out/ 52 | 53 | # mpeltonen/sbt-idea plugin 54 | .idea_modules/ 55 | 56 | # JIRA plugin 57 | atlassian-ide-plugin.xml 58 | 59 | # Cursive Clojure plugin 60 | .idea/replstate.xml 61 | 62 | # Crashlytics plugin (for Android Studio and IntelliJ) 63 | com_crashlytics_export_strings.xml 64 | crashlytics.properties 65 | crashlytics-build.properties 66 | fabric.properties 67 | 68 | # Editor-based Rest Client 69 | .idea/httpRequests 70 | 71 | # Android studio 3.1+ serialized cache file 72 | .idea/caches/build_file_checksums.ser 73 | 74 | 75 | # Go.gitignore 76 | # Binaries for programs and plugins 77 | *.exe 78 | *.exe~ 79 | *.dll 80 | *.so 81 | *.dylib 82 | 83 | # Test binary, built with `go test -c` 84 | *.test 85 | 86 | # Output of the go coverage tool, specifically when used with LiteIDE 87 | *.out 88 | 89 | # Dependency directories (remove the comment below to include it) 90 | # vendor/ 91 | 92 | 93 | # Other files 94 | release/ 95 | *.def 96 | .rustc_info.json 97 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoolSpring8/go-lolhtml/2cb4478586ff392fe240b42831045f1ac74232c1/.gitmodules -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /../../../../../../../:\Users\cools\Documents\Projects\hello-cgo\.idea/dataSources/ 6 | 
/dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/codeStyles/Project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 11 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 20 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/watcherTasks.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 28 | 29 | 40 | 52 | 53 | 64 | 76 | 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, CoolSpring8 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-lolhtml 2 | 3 | ![GitHub Workflow Status](https://img.shields.io/github/workflow/status/coolspring8/go-lolhtml/Go) [![codecov](https://codecov.io/gh/CoolSpring8/go-lolhtml/branch/main/graph/badge.svg)](https://codecov.io/gh/CoolSpring8/go-lolhtml) [![Go Report Card](https://goreportcard.com/badge/github.com/coolspring8/go-lolhtml)](https://goreportcard.com/report/github.com/coolspring8/go-lolhtml) [![PkgGoDev](https://pkg.go.dev/badge/github.com/coolspring8/go-lolhtml)](https://pkg.go.dev/github.com/coolspring8/go-lolhtml) 4 | 5 | Go bindings for the Rust crate [cloudflare/lol-html](https://github.com/cloudflare/lol-html/), the *Low Output Latency streaming HTML rewriter/parser with CSS-selector based API*, talking via cgo. 6 | 7 | **Status:** 8 | 9 | **All abilities provided by lol_html's c-api are available**, except for customized user data in handlers. The original tests included in c-api package have also been translated to examine this binding's functionality. 10 | 11 | The code is at its early stage and **breaking changes might be introduced**. If you have any ideas on how the public API can be better structured, feel free to open a PR or an issue. 
12 | 13 | * [go-lolhtml](#go-lolhtml) 14 | * [Installation](#installation) 15 | * [Features](#features) 16 | * [Getting Started](#getting-started) 17 | * [Examples](#examples) 18 | * [Documentation](#documentation) 19 | * [Other Bindings](#other-bindings) 20 | * [Versioning](#versioning) 21 | * [Help Wanted!](#help-wanted) 22 | * [License](#license) 23 | * [Disclaimer](#disclaimer) 24 | 25 | ## Installation 26 | 27 | For Linux/macOS/Windows x86_64 platform users, installation is as simple as a single `go get` command: 28 | 29 | ```shell 30 | $ go get github.com/coolspring8/go-lolhtml 31 | ``` 32 | 33 | Installing Rust is not a necessary step. That's because lol-html could be prebuilt into static libraries, stored and shipped in `/build` folder, so that cgo can handle other compilation matters naturally and smoothly, without intervention. 34 | 35 | For other platforms, you will have to compile it yourself. 36 | 37 | ## Features 38 | 39 | - Fast: A Go (cgo) wrapper built around the highly-optimized Rust HTML parsing crate lol_html. 40 | - Easy to use: Utilizing Go's idiomatic I/O methods, [lolhtml.Writer](https://pkg.go.dev/github.com/coolspring8/go-lolhtml#Writer) implements [io.Writer](https://golang.org/pkg/io/#Writer) interface. 
41 | 42 | ## Getting Started 43 | 44 | Now let's initialize a project and create `main.go`: 45 | 46 | ```go 47 | package main 48 | 49 | import ( 50 | "bytes" 51 | "io" 52 | "log" 53 | "os" 54 | 55 | "github.com/coolspring8/go-lolhtml" 56 | ) 57 | 58 | func main() { 59 | chunk := []byte("Hello, <span>World</span>!") 60 | r := bytes.NewReader(chunk) 61 | w, err := lolhtml.NewWriter( 62 | // output to stdout 63 | os.Stdout, 64 | &lolhtml.Handlers{ 65 | ElementContentHandler: []lolhtml.ElementContentHandler{ 66 | { 67 | Selector: "span", 68 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 69 | err := e.SetInnerContentAsText("LOL-HTML") 70 | if err != nil { 71 | log.Fatal(err) 72 | } 73 | return lolhtml.Continue 74 | }, 75 | }, 76 | }, 77 | }, 78 | ) 79 | if err != nil { 80 | log.Fatal(err) 81 | } 82 | 83 | // copy from the bytes reader to lolhtml writer 84 | _, err = io.Copy(w, r) 85 | if err != nil { 86 | log.Fatal(err) 87 | } 88 | 89 | // explicitly close the writer and flush the remaining content 90 | err = w.Close() 91 | if err != nil { 92 | log.Fatal(err) 93 | } 94 | // Output: Hello, <span>LOL-HTML</span>! 95 | } 96 | ``` 97 | 98 | The above program creates a new Writer configured to rewrite all texts in `span` tags to "LOL-HTML". It takes the chunk `Hello, <span>World</span>!` as input, and prints the result to standard output. 99 | 100 | And the result is `Hello, <span>LOL-HTML</span>!` . 101 | 102 | ## Examples 103 | 104 | example_test.go contains two examples. 105 | 106 | For more detailed examples, please visit the `/examples` subdirectory. 107 | 108 | - defer-scripts 109 | 110 | Usage: curl -NL https://git.io/JeOSZ | go run main.go 111 | 112 | - mixed-content-rewriter 113 | 114 | Usage: curl -NL https://git.io/JeOSZ | go run main.go 115 | 116 | - web-scraper 117 | 118 | A ported Go version of https://web.scraper.workers.dev/. 119 | 120 | ## Documentation 121 | 122 | Available at [pkg.go.dev](https://pkg.go.dev/github.com/coolspring8/go-lolhtml). 
123 | 124 | ## Other Bindings 125 | 126 | - Rust (native), C, JavaScript - [cloudflare/lol-html](https://github.com/cloudflare/lol-html/) 127 | - Lua - [jdesgats/lua-lolhtml](https://github.com/jdesgats/lua-lolhtml/) 128 | 129 | ## Versioning 130 | 131 | This package does not really follow [Semantic Versioning](https://semver.org/). The current strategy is to follow lol_html's major and minor version, and the patch version number is reserved for this binding's updates, for Go Modules to upgrade correctly. 132 | 133 | ## Help Wanted! 134 | 135 | There are a few interesting things at [Projects](https://github.com/coolspring8/go-lolhtml/projects/1) panel that I have considered but are not yet implemented. Other contributions and suggestions are also welcome! 136 | 137 | ## License 138 | 139 | BSD 3-Clause "New" or "Revised" License 140 | 141 | ## Disclaimer 142 | 143 | This is an unofficial binding. 144 | 145 | Cloudflare is a registered trademark of Cloudflare, Inc. Cloudflare names used in this project are for identification purposes only. The project is not associated in any way with Cloudflare Inc. -------------------------------------------------------------------------------- /attribute.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | 9 | // AttributeIterator can be used to iterate over all attributes of an element. The only way to 10 | // get an AttributeIterator is by calling AttributeIterator() on an Element. Note the "range" syntax is not 11 | // applicable here, use AttributeIterator.Next() instead. 12 | type AttributeIterator C.lol_html_attributes_iterator_t 13 | 14 | // Attribute represents an HTML element attribute. Obtained by calling Next() on an AttributeIterator. 15 | type Attribute C.lol_html_attribute_t 16 | 17 | // Free frees the memory held by the AttributeIterator. 
18 | func (ai *AttributeIterator) Free() { 19 | C.lol_html_attributes_iterator_free((*C.lol_html_attributes_iterator_t)(ai)) 20 | } 21 | 22 | // Next advances the iterator and returns next attribute. 23 | // Returns nil if the iterator has been exhausted. 24 | func (ai *AttributeIterator) Next() *Attribute { 25 | return (*Attribute)(C.lol_html_attributes_iterator_next((*C.lol_html_attributes_iterator_t)(ai))) 26 | } 27 | 28 | // Name returns the name of the attribute. 29 | func (a *Attribute) Name() string { 30 | nameC := (str)(C.lol_html_attribute_name_get((*C.lol_html_attribute_t)(a))) 31 | defer nameC.Free() 32 | return nameC.String() 33 | } 34 | 35 | // Value returns the value of the attribute. 36 | func (a *Attribute) Value() string { 37 | valueC := (str)(C.lol_html_attribute_value_get((*C.lol_html_attribute_t)(a))) 38 | defer valueC.Free() 39 | return valueC.String() 40 | } 41 | -------------------------------------------------------------------------------- /benchmark_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "path/filepath" 9 | "runtime" 10 | "testing" 11 | 12 | "github.com/coolspring8/go-lolhtml" 13 | ) 14 | 15 | const dataDir = "testdata" 16 | 17 | const ChunkSize = 1024 18 | 19 | func BenchmarkNewWriter(b *testing.B) { 20 | benchmarks := []struct { 21 | category string 22 | name string 23 | handlers *lolhtml.Handlers 24 | }{ 25 | { 26 | "Parsing", 27 | "TagScanner", 28 | nil, 29 | }, 30 | { 31 | "Parsing", 32 | "Lexer", 33 | &lolhtml.Handlers{ 34 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 35 | { 36 | DoctypeHandler: func(d *lolhtml.Doctype) lolhtml.RewriterDirective { 37 | return lolhtml.Continue 38 | }, 39 | }, 40 | }, 41 | }, 42 | }, 43 | { 44 | "Parsing", 45 | "TextRewritableUnitParsingAndDecoding", 46 | &lolhtml.Handlers{ 47 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 48 | { 49 | 
TextChunkHandler: func(c *lolhtml.TextChunk) lolhtml.RewriterDirective { 50 | return lolhtml.Continue 51 | }, 52 | }, 53 | }, 54 | }, 55 | }, 56 | { 57 | "Rewriting", 58 | "ModificationOfTagsOfAnElementWithLotsOfContent", 59 | &lolhtml.Handlers{ 60 | ElementContentHandler: []lolhtml.ElementContentHandler{ 61 | { 62 | Selector: "body", 63 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 64 | err := e.SetTagName("body1") 65 | if err != nil { 66 | b.Fatal(err) 67 | } 68 | err = e.InsertAfterEndTagAsText("test") 69 | if err != nil { 70 | b.Fatal(err) 71 | } 72 | return lolhtml.Continue 73 | }, 74 | }, 75 | }, 76 | }, 77 | }, 78 | { 79 | "Rewriting", 80 | "RemoveContentOfAnElement", 81 | &lolhtml.Handlers{ 82 | ElementContentHandler: []lolhtml.ElementContentHandler{ 83 | { 84 | Selector: "ul", 85 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 86 | err := e.SetInnerContentAsText("") 87 | if err != nil { 88 | b.Fatal(err) 89 | } 90 | return lolhtml.Continue 91 | }, 92 | }, 93 | }, 94 | }, 95 | }, 96 | { 97 | "SelectorMatching", 98 | "MatchAllSelector", 99 | &lolhtml.Handlers{ 100 | ElementContentHandler: []lolhtml.ElementContentHandler{ 101 | { 102 | Selector: "*", 103 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 104 | return lolhtml.Continue 105 | }, 106 | }, 107 | }, 108 | }, 109 | }, 110 | { 111 | "SelectorMatching", 112 | "TagNameSelector", 113 | &lolhtml.Handlers{ 114 | ElementContentHandler: []lolhtml.ElementContentHandler{ 115 | { 116 | Selector: "div", 117 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 118 | return lolhtml.Continue 119 | }, 120 | }, 121 | }, 122 | }, 123 | }, 124 | { 125 | "SelectorMatching", 126 | "ClassSelector", 127 | &lolhtml.Handlers{ 128 | ElementContentHandler: []lolhtml.ElementContentHandler{ 129 | { 130 | Selector: ".note", 131 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 132 | return lolhtml.Continue 133 | }, 134 | }, 
135 | }, 136 | }, 137 | }, 138 | { 139 | "SelectorMatching", 140 | "AttributeSelector", 141 | &lolhtml.Handlers{ 142 | ElementContentHandler: []lolhtml.ElementContentHandler{ 143 | { 144 | Selector: "[href]", 145 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 146 | return lolhtml.Continue 147 | }, 148 | }, 149 | }, 150 | }, 151 | }, 152 | { 153 | "SelectorMatching", 154 | "MultipleSelectors", 155 | &lolhtml.Handlers{ 156 | ElementContentHandler: []lolhtml.ElementContentHandler{ 157 | { 158 | Selector: "ul", 159 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 160 | return lolhtml.Continue 161 | }, 162 | }, 163 | { 164 | Selector: "ul > li", 165 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 166 | return lolhtml.Continue 167 | }, 168 | }, 169 | { 170 | Selector: "table > tbody td dfn", 171 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 172 | return lolhtml.Continue 173 | }, 174 | }, 175 | { 176 | Selector: "body table > tbody tr", 177 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 178 | return lolhtml.Continue 179 | }, 180 | }, 181 | { 182 | Selector: "body [href]", 183 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 184 | return lolhtml.Continue 185 | }, 186 | }, 187 | { 188 | Selector: "div img", 189 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 190 | return lolhtml.Continue 191 | }, 192 | }, 193 | { 194 | Selector: "div.note span", 195 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 196 | return lolhtml.Continue 197 | }, 198 | }, 199 | }, 200 | }, 201 | }, 202 | } 203 | 204 | files, err := ioutil.ReadDir(dataDir) 205 | if err != nil { 206 | b.Fatal("benchmark data files not found", err) 207 | } 208 | 209 | for _, file := range files { 210 | data, err := ioutil.ReadFile(filepath.Join(dataDir, file.Name())) 211 | if err != nil { 212 | b.Fatal("cannot read benchmark data files", 
err) 213 | } 214 | 215 | for _, bm := range benchmarks { 216 | b.Run(fmt.Sprintf("%s-%s-%s", bm.category, bm.name, file.Name()), func(b *testing.B) { 217 | b.SetBytes(int64(len(data))) 218 | b.ReportAllocs() 219 | runtime.GC() 220 | b.ResetTimer() 221 | for i := 0; i < b.N; i++ { 222 | w, err := lolhtml.NewWriter(nil, bm.handlers) 223 | if err != nil { 224 | b.Fatal(err) 225 | } 226 | 227 | r := bytes.NewReader(data) 228 | copyBuf := make([]byte, ChunkSize) 229 | _, err = io.CopyBuffer(w, r, copyBuf) 230 | if err != nil { 231 | b.Fatal(err) 232 | } 233 | 234 | err = w.Close() 235 | if err != nil { 236 | b.Fatal(err) 237 | } 238 | } 239 | }) 240 | } 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /build/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2019, Cloudflare, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /build/include/lol_html.h: -------------------------------------------------------------------------------- 1 | #ifndef LOL_HTML_H 2 | #define LOL_HTML_H 3 | 4 | #if defined(__cplusplus) 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | // NOTE: all functions that accept pointers will panic abort the thread 12 | // if NULL pointer is passed (with an exception for the cases where 13 | // explicitly stated that function can accept NULL pointers). 14 | 15 | // NOTE: all UTF8-strings passed to the API functions allow interior '\0's 16 | // and their length determined by the corresponding length parameter only. 17 | 18 | // Opaque structures used by the rewriter. 19 | // WARNING: these structures should never be deallocated by the C code. 20 | // There are appropriate methods exposed that take care of these structures 21 | // deallocation. 
22 | typedef struct lol_html_HtmlRewriterBuilder lol_html_rewriter_builder_t; 23 | typedef struct lol_html_HtmlRewriter lol_html_rewriter_t; 24 | typedef struct lol_html_Doctype lol_html_doctype_t; 25 | typedef struct lol_html_DocumentEnd lol_html_doc_end_t; 26 | typedef struct lol_html_Comment lol_html_comment_t; 27 | typedef struct lol_html_TextChunk lol_html_text_chunk_t; 28 | typedef struct lol_html_Element lol_html_element_t; 29 | typedef struct lol_html_AttributesIterator lol_html_attributes_iterator_t; 30 | typedef struct lol_html_Attribute lol_html_attribute_t; 31 | typedef struct lol_html_Selector lol_html_selector_t; 32 | 33 | // Library-allocated UTF8 string fat pointer. 34 | // 35 | // The string is not NULL-terminated. 36 | // 37 | // Should NEVER be deallocated in the C code. Use special `lol_html_str_free` 38 | // function instead. 39 | typedef struct { 40 | // String data pointer. 41 | const char *data; 42 | 43 | // The length of the string in bytes. 44 | size_t len; 45 | } lol_html_str_t; 46 | 47 | // A fat pointer to text chunk content. 48 | // 49 | // The difference between this struct and `lol_html_str_t` is 50 | // that text chunk content shouldn't be deallocated manually via 51 | // `lol_html_str_free` method call. Instead the pointer becomes 52 | // invalid once the related `lol_html_text_chunk_t` struct goes out 53 | // of scope. 54 | typedef struct { 55 | // String data pointer. 56 | const char *data; 57 | 58 | // The length of the string in bytes. 59 | size_t len; 60 | } lol_html_text_chunk_content_t; 61 | 62 | // Utilities 63 | //--------------------------------------------------------------------- 64 | 65 | // Frees the memory held by the library-allocated string. 66 | void lol_html_str_free(lol_html_str_t str); 67 | 68 | // Returns the last error message and resets last error to NULL. 69 | // 70 | // Return NULL if there was no error. 71 | lol_html_str_t *lol_html_take_last_error(); 72 | 73 | // Creates new HTML rewriter builder. 
74 | lol_html_rewriter_builder_t *lol_html_rewriter_builder_new(); 75 | 76 | // Content handlers 77 | //--------------------------------------------------------------------- 78 | // Rewriter directive that should be returned from each content handler. 79 | // If LOL_HTML_STOP directive is returned then rewriting stops immediately 80 | // and `write()` or `end()` methods of the rewriter return an error code. 81 | typedef enum { 82 | LOL_HTML_CONTINUE, 83 | LOL_HTML_STOP 84 | } lol_html_rewriter_directive_t; 85 | 86 | typedef lol_html_rewriter_directive_t (*lol_html_doctype_handler_t)( 87 | lol_html_doctype_t *doctype, 88 | void *user_data 89 | ); 90 | 91 | typedef lol_html_rewriter_directive_t (*lol_html_comment_handler_t)( 92 | lol_html_comment_t *comment, 93 | void *user_data 94 | ); 95 | 96 | typedef lol_html_rewriter_directive_t (*lol_html_text_handler_handler_t)( 97 | lol_html_text_chunk_t *chunk, 98 | void *user_data 99 | ); 100 | 101 | typedef lol_html_rewriter_directive_t (*lol_html_element_handler_t)( 102 | lol_html_element_t *element, 103 | void *user_data 104 | ); 105 | 106 | typedef lol_html_rewriter_directive_t (*lol_html_doc_end_handler_t)( 107 | lol_html_doc_end_t *doc_end, 108 | void *user_data 109 | ); 110 | 111 | // Selector 112 | //--------------------------------------------------------------------- 113 | 114 | // Parses given CSS selector string. 115 | // 116 | // Returns NULL if parsing error occurs. The actual error message 117 | // can be obtained using `lol_html_take_last_error` function. 118 | // 119 | // WARNING: Selector SHOULD NOT be deallocated if there are any active rewriter 120 | // builders that accepted it as an argument to `lol_html_rewriter_builder_add_element_content_handlers()` 121 | // method. Deallocate all dependant rewriter builders first and then 122 | // use `lol_html_selector_free` function to free the selector. 
123 | lol_html_selector_t *lol_html_selector_parse( 124 | const char *selector, 125 | size_t selector_len 126 | ); 127 | 128 | // Frees the memory held by the parsed selector object. 129 | void lol_html_selector_free(lol_html_selector_t *selector); 130 | 131 | 132 | // Rewriter builder 133 | //--------------------------------------------------------------------- 134 | 135 | // Adds document-level content handlers to the builder. 136 | // 137 | // If a particular handler is not required then NULL can be passed 138 | // instead. Don't use stub handlers in this case as this affects 139 | // performance - rewriter skips parsing of the content that doesn't 140 | // need to be processed. 141 | // 142 | // Each handler can optionally have associated user data which will be 143 | // passed to the handler on each invocation along with the rewritable 144 | // unit argument. 145 | // 146 | // If any of handlers return LOL_HTML_STOP directive then rewriting 147 | // stops immediately and `write()` or `end()` of the rewriter methods 148 | // return an error code. 149 | // 150 | // WARNING: Pointers passed to handlers are valid only during the 151 | // handler execution. So they should never be leaked outside of handlers. 152 | void lol_html_rewriter_builder_add_document_content_handlers( 153 | lol_html_rewriter_builder_t *builder, 154 | lol_html_doctype_handler_t doctype_handler, 155 | void *doctype_handler_user_data, 156 | lol_html_comment_handler_t comment_handler, 157 | void *comment_handler_user_data, 158 | lol_html_text_handler_handler_t text_handler, 159 | void *text_handler_user_data, 160 | lol_html_doc_end_handler_t doc_end_handler, 161 | void *doc_end_user_data 162 | ); 163 | 164 | // Adds element content handlers to the builder for the 165 | // given CSS selector. 166 | // 167 | // Selector should be a valid UTF8-string. 168 | // 169 | // If a particular handler is not required then NULL can be passed 170 | // instead. 
Don't use stub handlers in this case as this affects 171 | // performance - rewriter skips parsing of the content that doesn't 172 | // need to be processed. 173 | // 174 | // Each handler can optionally have associated user data which will be 175 | // passed to the handler on each invocation along with the rewritable 176 | // unit argument. 177 | // 178 | // If any of handlers return LOL_HTML_STOP directive then rewriting 179 | // stops immediately and `write()` or `end()` of the rewriter methods 180 | // return an error code. 181 | // 182 | // Returns 0 in case of success and -1 otherwise. The actual error message 183 | // can be obtained using `lol_html_take_last_error` function. 184 | // 185 | // WARNING: Pointers passed to handlers are valid only during the 186 | // handler execution. So they should never be leaked outside of handlers. 187 | int lol_html_rewriter_builder_add_element_content_handlers( 188 | lol_html_rewriter_builder_t *builder, 189 | const lol_html_selector_t *selector, 190 | lol_html_element_handler_t element_handler, 191 | void *element_handler_user_data, 192 | lol_html_comment_handler_t comment_handler, 193 | void *comment_handler_user_data, 194 | lol_html_text_handler_handler_t text_handler, 195 | void *text_handler_user_data 196 | ); 197 | 198 | // Frees the memory held by the builder. 199 | // 200 | // Note that builder can be freed before any rewriters constructed from 201 | // it if it's not intended to be used anymore. 202 | void lol_html_rewriter_builder_free(lol_html_rewriter_builder_t *builder); 203 | 204 | 205 | // Rewriter 206 | //--------------------------------------------------------------------- 207 | 208 | // Memory management settings for the rewriter. 209 | typedef struct { 210 | // Preallocated size of the parsing buffer. 211 | // 212 | // Can be set to 0. In this case rewriter won't consume any memory initially, 213 | // though there might be a performance penalty due to later reallocations. 
214 | size_t preallocated_parsing_buffer_size; 215 | // Maximum amount of memory to be used by a rewriter. 216 | // 217 | // `lol_html_rewriter_write` and `lol_html_rewriter_end` will return an error 218 | // if this limit is exceeded. 219 | size_t max_allowed_memory_usage; 220 | } lol_html_memory_settings_t; 221 | 222 | // Builds HTML-rewriter out of the provided builder. Can be called 223 | // multiple times to construct different rewriters from the same 224 | // builder. 225 | // 226 | // `output_sink` receives a zero-length chunk on the end of the output. 227 | // 228 | // `output_sink` can optionally have associated user data that will 229 | // be passed to handler on each invocation along with other arguments. 230 | // 231 | // `strict` mode will bail out from tokenization process in cases when 232 | // there is no way to determine correct parsing context. Recommended 233 | // setting for safety reasons. 234 | // 235 | // In case of an error the function returns a NULL pointer. 236 | lol_html_rewriter_t *lol_html_rewriter_build( 237 | lol_html_rewriter_builder_t *builder, 238 | const char *encoding, 239 | size_t encoding_len, 240 | lol_html_memory_settings_t memory_settings, 241 | void (*output_sink)(const char *chunk, size_t chunk_len, void *user_data), 242 | void *output_sink_user_data, 243 | bool strict 244 | ); 245 | 246 | // Write HTML chunk to rewriter. 247 | // 248 | // Returns 0 in case of success and -1 otherwise. The actual error message 249 | // can be obtained using `lol_html_take_last_error` function. 250 | // 251 | // WARNING: if this function errors the rewriter gets into an unrecoverable state, 252 | // so any further attempts to use the rewriter will cause a thread panic. 253 | int lol_html_rewriter_write( 254 | lol_html_rewriter_t *rewriter, 255 | const char *chunk, 256 | size_t chunk_len 257 | ); 258 | 259 | // Completes rewriting and flushes the remaining output. 260 | // 261 | // Returns 0 in case of success and -1 otherwise.
The actual error message 262 | // can be obtained using `lol_html_take_last_error` function. 263 | // 264 | // WARNING: after calling this function, further attempts to use the rewriter 265 | // (other than `lol_html_rewriter_free`) will cause a thread panic. 266 | int lol_html_rewriter_end(lol_html_rewriter_t *rewriter); 267 | 268 | // Frees the memory held by the rewriter. 269 | void lol_html_rewriter_free(lol_html_rewriter_t *rewriter); 270 | 271 | // Doctype 272 | //--------------------------------------------------------------------- 273 | 274 | // Returns doctype's name. 275 | // 276 | // Returns NULL if the doctype doesn't have a name. 277 | lol_html_str_t *lol_html_doctype_name_get(const lol_html_doctype_t *doctype); 278 | 279 | // Returns doctype's PUBLIC identifier. 280 | // 281 | // Returns NULL if the doctype doesn't have a PUBLIC identifier. 282 | lol_html_str_t *lol_html_doctype_public_id_get(const lol_html_doctype_t *doctype); 283 | 284 | // Returns doctype's SYSTEM identifier. 285 | // 286 | // Returns NULL if the doctype doesn't have a SYSTEM identifier. 287 | lol_html_str_t *lol_html_doctype_system_id_get(const lol_html_doctype_t *doctype); 288 | 289 | // Attaches custom user data to the doctype. 290 | // 291 | // The same doctype can be passed to multiple handlers if it has been 292 | // captured by multiple selectors. It might be handy to store some processing 293 | // state on the doctype, so it can be shared between handlers. 294 | void lol_html_doctype_user_data_set( 295 | const lol_html_doctype_t *doctype, 296 | void *user_data 297 | ); 298 | 299 | // Returns user data attached to the doctype. 300 | void *lol_html_doctype_user_data_get(const lol_html_doctype_t *doctype); 301 | 302 | // Comment 303 | //--------------------------------------------------------------------- 304 | 305 | // Returns comment text. 306 | lol_html_str_t lol_html_comment_text_get(const lol_html_comment_t *comment); 307 | 308 | // Sets comment text. 
309 | // 310 | // Text should be a valid UTF8-string. 311 | // 312 | // Returns 0 in case of success and -1 otherwise. The actual error message 313 | // can be obtained using `lol_html_take_last_error` function. 314 | int lol_html_comment_text_set( 315 | lol_html_comment_t *comment, 316 | const char *text, 317 | size_t text_len 318 | ); 319 | 320 | // Inserts the content string before the comment either as raw text or as HTML. 321 | // 322 | // Content should be a valid UTF8-string. 323 | // 324 | // Returns 0 in case of success and -1 otherwise. The actual error message 325 | // can be obtained using `lol_html_take_last_error` function. 326 | int lol_html_comment_before( 327 | lol_html_comment_t *comment, 328 | const char *content, 329 | size_t content_len, 330 | bool is_html 331 | ); 332 | 333 | // Inserts the content string after the comment either as raw text or as HTML. 334 | // 335 | // Content should be a valid UTF8-string. 336 | // 337 | // Returns 0 in case of success and -1 otherwise. The actual error message 338 | // can be obtained using `lol_html_take_last_error` function. 339 | int lol_html_comment_after( 340 | lol_html_comment_t *comment, 341 | const char *content, 342 | size_t content_len, 343 | bool is_html 344 | ); 345 | 346 | // Replace the comment with the content of the string which is interpreted 347 | // either as raw text or as HTML. 348 | // 349 | // Content should be a valid UTF8-string. 350 | // 351 | // Returns 0 in case of success and -1 otherwise. The actual error message 352 | // can be obtained using `lol_html_take_last_error` function. 353 | int lol_html_comment_replace( 354 | lol_html_comment_t *comment, 355 | const char *content, 356 | size_t content_len, 357 | bool is_html 358 | ); 359 | 360 | // Removes the comment. 361 | void lol_html_comment_remove(lol_html_comment_t *comment); 362 | 363 | // Returns `true` if the comment has been removed. 
364 | bool lol_html_comment_is_removed(const lol_html_comment_t *comment); 365 | 366 | // Attaches custom user data to the comment. 367 | // 368 | // The same comment can be passed to multiple handlers if it has been 369 | // captured by multiple selectors. It might be handy to store some 370 | // processing state on the comment, so it can be shared between handlers. 371 | void lol_html_comment_user_data_set( 372 | const lol_html_comment_t *comment, 373 | void *user_data 374 | ); 375 | 376 | // Returns user data attached to the comment. 377 | void *lol_html_comment_user_data_get(const lol_html_comment_t *comment); 378 | 379 | 380 | // Text chunk 381 | //--------------------------------------------------------------------- 382 | 383 | // Returns a fat pointer to the UTF8 representation of content of the chunk. 384 | // 385 | // If the chunk is last in the current text node then content can be an empty string. 386 | // 387 | // WARNING: The pointer is valid only during the handler execution and 388 | // should never be leaked outside of handlers. 389 | lol_html_text_chunk_content_t lol_html_text_chunk_content_get( 390 | const lol_html_text_chunk_t *chunk 391 | ); 392 | 393 | // Returns `true` if the chunk is last in the current text node. 394 | bool lol_html_text_chunk_is_last_in_text_node(const lol_html_text_chunk_t *chunk); 395 | 396 | // Inserts the content string before the text chunk either as raw text or as HTML. 397 | // 398 | // Content should be a valid UTF8-string. 399 | // 400 | // Returns 0 in case of success and -1 otherwise. The actual error message 401 | // can be obtained using `lol_html_take_last_error` function. 402 | int lol_html_text_chunk_before( 403 | lol_html_text_chunk_t *chunk, 404 | const char *content, 405 | size_t content_len, 406 | bool is_html 407 | ); 408 | 409 | // Inserts the content string after the text chunk either as raw text or as HTML. 410 | // 411 | // Content should be a valid UTF8-string. 
412 | // 413 | // Returns 0 in case of success and -1 otherwise. The actual error message 414 | // can be obtained using `lol_html_take_last_error` function. 415 | int lol_html_text_chunk_after( 416 | lol_html_text_chunk_t *chunk, 417 | const char *content, 418 | size_t content_len, 419 | bool is_html 420 | ); 421 | 422 | // Replace the text chunk with the content of the string which is interpreted 423 | // either as raw text or as HTML. 424 | // 425 | // Content should be a valid UTF8-string. 426 | // 427 | // Returns 0 in case of success and -1 otherwise. The actual error message 428 | // can be obtained using `lol_html_take_last_error` function. 429 | int lol_html_text_chunk_replace( 430 | lol_html_text_chunk_t *chunk, 431 | const char *content, 432 | size_t content_len, 433 | bool is_html 434 | ); 435 | 436 | // Removes the text chunk. 437 | void lol_html_text_chunk_remove(lol_html_text_chunk_t *chunk); 438 | 439 | // Returns `true` if the text chunk has been removed. 440 | bool lol_html_text_chunk_is_removed(const lol_html_text_chunk_t *chunk); 441 | 442 | // Attaches custom user data to the text chunk. 443 | // 444 | // The same text chunk can be passed to multiple handlers if it has been 445 | // captured by multiple selectors. It might be handy to store some processing 446 | // state on the chunk, so it can be shared between handlers. 447 | void lol_html_text_chunk_user_data_set( 448 | const lol_html_text_chunk_t *chunk, 449 | void *user_data 450 | ); 451 | 452 | // Returns user data attached to the text chunk. 453 | void *lol_html_text_chunk_user_data_get(const lol_html_text_chunk_t *chunk); 454 | 455 | 456 | // Element 457 | //--------------------------------------------------------------------- 458 | 459 | // Returns the tag name of the element. 460 | lol_html_str_t lol_html_element_tag_name_get(const lol_html_element_t *element); 461 | 462 | // Sets the tag name of the element. 463 | // 464 | // Name should be a valid UTF8-string. 
465 | // 466 | // Returns 0 in case of success and -1 otherwise. The actual error message 467 | // can be obtained using `lol_html_take_last_error` function. 468 | int lol_html_element_tag_name_set( 469 | lol_html_element_t *element, 470 | const char *name, 471 | size_t name_len 472 | ); 473 | 474 | // Returns the namespace URI of the element. 475 | // 476 | // NOTE: This method returns a static zero-terminated C string, so it doesn't 477 | // need to be freed. 478 | const char* lol_html_element_namespace_uri_get(const lol_html_element_t *element); 479 | 480 | // Returns the iterator over the element attributes. 481 | // 482 | // WARNING: The iterator is valid only during the handler execution and 483 | // should never be leaked outside of it. 484 | // 485 | // Use `lol_html_attributes_iterator_free` function to deallocate 486 | // returned iterator. 487 | lol_html_attributes_iterator_t *lol_html_attributes_iterator_get( 488 | const lol_html_element_t *element 489 | ); 490 | 491 | // Frees the memory held by the attribute iterator. 492 | void lol_html_attributes_iterator_free(lol_html_attributes_iterator_t *iterator); 493 | 494 | // Advances the iterator and returns next attribute. 495 | // 496 | // Returns NULL if iterator has been exhausted. 497 | // 498 | // WARNING: Returned attribute is valid only during the handler 499 | // execution and should never be leaked outside of it. 500 | const lol_html_attribute_t *lol_html_attributes_iterator_next( 501 | lol_html_attributes_iterator_t *iterator 502 | ); 503 | 504 | // Returns the attribute name. 505 | lol_html_str_t lol_html_attribute_name_get(const lol_html_attribute_t *attribute); 506 | 507 | // Returns the attribute value. 508 | lol_html_str_t lol_html_attribute_value_get(const lol_html_attribute_t *attribute); 509 | 510 | // Returns the attribute value or NULL if attribute with the given name 511 | // doesn't exist on the element. 512 | // 513 | // Name should be a valid UTF8-string.
514 | // 515 | // If the provided name is invalid UTF8-string the function returns NULL as well. 516 | // Therefore one should always check `lol_html_take_last_error` result after the call. 517 | lol_html_str_t *lol_html_element_get_attribute( 518 | const lol_html_element_t *element, 519 | const char *name, 520 | size_t name_len 521 | ); 522 | 523 | // Returns 1 if element has attribute with the given name, and 0 otherwise. 524 | // Returns -1 in case of an error. 525 | // 526 | // Name should be a valid UTF8-string. 527 | int lol_html_element_has_attribute( 528 | const lol_html_element_t *element, 529 | const char *name, 530 | size_t name_len 531 | ); 532 | 533 | // Updates the attribute value if attribute with the given name already exists on 534 | // the element, or adds a new attribute with the given name and value otherwise. 535 | // 536 | // Name and value should be valid UTF8-strings. 537 | // 538 | // Returns 0 in case of success and -1 otherwise. The actual error message 539 | // can be obtained using `lol_html_take_last_error` function. 540 | int lol_html_element_set_attribute( 541 | lol_html_element_t *element, 542 | const char *name, 543 | size_t name_len, 544 | const char *value, 545 | size_t value_len 546 | ); 547 | 548 | // Removes the attribute with the given name from the element. 549 | // 550 | // Name should be a valid UTF8-string. 551 | // 552 | // Returns 0 in case of success and -1 otherwise. The actual error message 553 | // can be obtained using `lol_html_take_last_error` function. 554 | int lol_html_element_remove_attribute( 555 | lol_html_element_t *element, 556 | const char *name, 557 | size_t name_len 558 | ); 559 | 560 | // Inserts the content string before the element either as raw text or as HTML. 561 | // 562 | // Content should be a valid UTF8-string. 563 | // 564 | // Returns 0 in case of success and -1 otherwise. The actual error message 565 | // can be obtained using `lol_html_take_last_error` function.
566 | int lol_html_element_before( 567 | lol_html_element_t *element, 568 | const char *content, 569 | size_t content_len, 570 | bool is_html 571 | ); 572 | 573 | // Inserts the content string right after the element's start tag 574 | // either as raw text or as HTML. 575 | // 576 | // Content should be a valid UTF8-string. 577 | // 578 | // Returns 0 in case of success and -1 otherwise. The actual error message 579 | // can be obtained using `lol_html_take_last_error` function. 580 | int lol_html_element_prepend( 581 | lol_html_element_t *element, 582 | const char *content, 583 | size_t content_len, 584 | bool is_html 585 | ); 586 | 587 | // Inserts the content string right before the element's end tag 588 | // either as raw text or as HTML. 589 | // 590 | // Content should be a valid UTF8-string. 591 | // 592 | // Returns 0 in case of success and -1 otherwise. The actual error message 593 | // can be obtained using `lol_html_take_last_error` function. 594 | int lol_html_element_append( 595 | lol_html_element_t *element, 596 | const char *content, 597 | size_t content_len, 598 | bool is_html 599 | ); 600 | 601 | // Inserts the content string right after the element's end tag as raw text or as HTML. 602 | // 603 | // Content should be a valid UTF8-string. 604 | // 605 | // Returns 0 in case of success and -1 otherwise. The actual error message 606 | // can be obtained using `lol_html_take_last_error` function. 607 | int lol_html_element_after( 608 | lol_html_element_t *element, 609 | const char *content, 610 | size_t content_len, 611 | bool is_html 612 | ); 613 | 614 | // Sets either text or HTML inner content of the element. 615 | // 616 | // Content should be a valid UTF8-string. 617 | // 618 | // Returns 0 in case of success and -1 otherwise. The actual error message 619 | // can be obtained using `lol_html_take_last_error` function. 
620 | int lol_html_element_set_inner_content( 621 | lol_html_element_t *element, 622 | const char *content, 623 | size_t content_len, 624 | bool is_html 625 | ); 626 | 627 | // Replaces the element with the provided text or HTML content. 628 | // 629 | // Content should be a valid UTF8-string. 630 | // 631 | // Returns 0 in case of success and -1 otherwise. The actual error message 632 | // can be obtained using `lol_html_take_last_error` function. 633 | int lol_html_element_replace( 634 | lol_html_element_t *element, 635 | const char *content, 636 | size_t content_len, 637 | bool is_html 638 | ); 639 | 640 | // Removes the element. 641 | void lol_html_element_remove(const lol_html_element_t *element); 642 | 643 | // Removes the element, but leaves its inner content intact. 644 | void lol_html_element_remove_and_keep_content(const lol_html_element_t *element); 645 | 646 | // Returns `true` if the element has been removed. 647 | bool lol_html_element_is_removed(const lol_html_element_t *element); 648 | 649 | // Attaches custom user data to the element. 650 | // 651 | // The same element can be passed to multiple handlers if it has been 652 | // captured by multiple selectors. It might be handy to store some processing 653 | // state on the element, so it can be shared between handlers. 654 | void lol_html_element_user_data_set( 655 | const lol_html_element_t *element, 656 | void *user_data 657 | ); 658 | 659 | // Returns user data attached to the element. 660 | void *lol_html_element_user_data_get(const lol_html_element_t *element); 661 | 662 | // Inserts the content at the end of the document, either as raw text or as HTML. 663 | // 664 | // The content should be a valid UTF-8 string. 665 | // 666 | // Returns 0 if successful, and -1 otherwise. The actual error message 667 | // can be obtained using the `lol_html_take_last_error` function.
668 | int lol_html_doc_end_append( 669 | lol_html_doc_end_t *doc_end, 670 | const char *content, 671 | size_t content_len, 672 | bool is_html 673 | ); 674 | 675 | #if defined(__cplusplus) 676 | } // extern C 677 | #endif 678 | 679 | #endif // LOL_HTML_H 680 | -------------------------------------------------------------------------------- /build/linux-x86_64/liblolhtml.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoolSpring8/go-lolhtml/2cb4478586ff392fe240b42831045f1ac74232c1/build/linux-x86_64/liblolhtml.a -------------------------------------------------------------------------------- /build/macos-x86_64/liblolhtml.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoolSpring8/go-lolhtml/2cb4478586ff392fe240b42831045f1ac74232c1/build/macos-x86_64/liblolhtml.a -------------------------------------------------------------------------------- /build/windows-x86_64/liblolhtml.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoolSpring8/go-lolhtml/2cb4478586ff392fe240b42831045f1ac74232c1/build/windows-x86_64/liblolhtml.a -------------------------------------------------------------------------------- /builder.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | extern void callback_sink(const char *chunk, size_t chunk_len, void *user_data); 7 | extern lol_html_rewriter_directive_t callback_doctype(lol_html_doctype_t *doctype, void *user_data); 8 | extern lol_html_rewriter_directive_t callback_comment(lol_html_comment_t *comment, void *user_data); 9 | extern lol_html_rewriter_directive_t callback_text_chunk(lol_html_text_chunk_t *text_chunk, void *user_data); 10 | extern lol_html_rewriter_directive_t callback_element(lol_html_element_t *element, void 
*user_data); 11 | extern lol_html_rewriter_directive_t callback_doc_end(lol_html_doc_end_t *doc_end, void *user_data); 12 | */ 13 | import "C" 14 | import ( 15 | "unsafe" 16 | ) 17 | 18 | // rewriterBuilder is used to build a rewriter. 19 | type rewriterBuilder struct { 20 | rb *C.lol_html_rewriter_builder_t 21 | pointers []unsafe.Pointer 22 | built bool // this builder has built at least one writer 23 | } 24 | 25 | func newRewriterBuilder() *rewriterBuilder { 26 | return &rewriterBuilder{rb: C.lol_html_rewriter_builder_new(), pointers: nil, built: false} 27 | } 28 | 29 | func (rb *rewriterBuilder) Free() { 30 | if rb != nil { 31 | C.lol_html_rewriter_builder_free(rb.rb) 32 | if !rb.built { 33 | unrefPointers(rb.pointers) 34 | } 35 | } 36 | } 37 | 38 | func (rb *rewriterBuilder) AddDocumentContentHandlers( 39 | doctypeHandler DoctypeHandlerFunc, 40 | commentHandler CommentHandlerFunc, 41 | textChunkHandler TextChunkHandlerFunc, 42 | documentEndHandler DocumentEndHandlerFunc, 43 | ) { 44 | var cCallbackDoctypePointer, cCallbackCommentPointer, cCallbackTextChunkPointer, cCallbackDocumentEndPointer *[0]byte 45 | if doctypeHandler != nil { 46 | cCallbackDoctypePointer = (*[0]byte)(C.callback_doctype) 47 | } 48 | if commentHandler != nil { 49 | cCallbackCommentPointer = (*[0]byte)(C.callback_comment) 50 | } 51 | if textChunkHandler != nil { 52 | cCallbackTextChunkPointer = (*[0]byte)(C.callback_text_chunk) 53 | } 54 | if documentEndHandler != nil { 55 | cCallbackDocumentEndPointer = (*[0]byte)(C.callback_doc_end) 56 | } 57 | doctypeHandlerPointer := savePointer(doctypeHandler) 58 | commentHandlerPointer := savePointer(commentHandler) 59 | textChunkHandlerPointer := savePointer(textChunkHandler) 60 | documentEndHandlerPointer := savePointer(documentEndHandler) 61 | C.lol_html_rewriter_builder_add_document_content_handlers( 62 | rb.rb, 63 | cCallbackDoctypePointer, 64 | doctypeHandlerPointer, 65 | cCallbackCommentPointer, 66 | commentHandlerPointer, 67 | 
cCallbackTextChunkPointer, 68 | textChunkHandlerPointer, 69 | cCallbackDocumentEndPointer, 70 | documentEndHandlerPointer, 71 | ) 72 | rb.pointers = append( 73 | rb.pointers, 74 | doctypeHandlerPointer, 75 | commentHandlerPointer, 76 | textChunkHandlerPointer, 77 | documentEndHandlerPointer, 78 | ) 79 | } 80 | 81 | func (rb *rewriterBuilder) AddElementContentHandlers( 82 | selector *selector, 83 | elementHandler ElementHandlerFunc, 84 | commentHandler CommentHandlerFunc, 85 | textChunkHandler TextChunkHandlerFunc, 86 | ) { 87 | var cCallbackElementPointer, cCallbackCommentPointer, cCallbackTextChunkPointer *[0]byte 88 | if elementHandler != nil { 89 | cCallbackElementPointer = (*[0]byte)(C.callback_element) 90 | } 91 | if commentHandler != nil { 92 | cCallbackCommentPointer = (*[0]byte)(C.callback_comment) 93 | } 94 | if textChunkHandler != nil { 95 | cCallbackTextChunkPointer = (*[0]byte)(C.callback_text_chunk) 96 | } 97 | elementHandlerPointer := savePointer(elementHandler) 98 | commentHandlerPointer := savePointer(commentHandler) 99 | textChunkHandlerPointer := savePointer(textChunkHandler) 100 | C.lol_html_rewriter_builder_add_element_content_handlers( 101 | rb.rb, 102 | (*C.lol_html_selector_t)(selector), 103 | cCallbackElementPointer, 104 | elementHandlerPointer, 105 | cCallbackCommentPointer, 106 | commentHandlerPointer, 107 | cCallbackTextChunkPointer, 108 | textChunkHandlerPointer, 109 | ) 110 | rb.pointers = append(rb.pointers, elementHandlerPointer, commentHandlerPointer, textChunkHandlerPointer) 111 | } 112 | 113 | func (rb *rewriterBuilder) Build(sink OutputSink, config Config) (*rewriter, error) { 114 | encodingC := C.CString(config.Encoding) 115 | defer C.free(unsafe.Pointer(encodingC)) 116 | encodingLen := len(config.Encoding) 117 | memorySettingsC := C.lol_html_memory_settings_t{ 118 | preallocated_parsing_buffer_size: C.size_t(config.Memory.PreallocatedParsingBufferSize), 119 | max_allowed_memory_usage: 
C.size_t(config.Memory.MaxAllowedMemoryUsage), 120 | } 121 | p := savePointer(sink) 122 | r := C.lol_html_rewriter_build( 123 | rb.rb, 124 | encodingC, 125 | C.size_t(encodingLen), 126 | memorySettingsC, 127 | (*[0]byte)(C.callback_sink), 128 | p, 129 | C.bool(config.Strict), 130 | ) 131 | if r != nil { 132 | rb.built = true 133 | return &rewriter{rewriter: r, pointers: rb.pointers}, nil 134 | } 135 | return nil, getError() 136 | } 137 | -------------------------------------------------------------------------------- /builder_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | -------------------------------------------------------------------------------- /callback.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | 7 | extern void callbackSink(const char *chunk, size_t chunk_len, void *); 8 | 9 | extern lol_html_rewriter_directive_t callbackDoctype(lol_html_doctype_t *doctype, void *user_data); 10 | 11 | extern lol_html_rewriter_directive_t callbackComment(lol_html_comment_t *comment, void *user_data); 12 | 13 | extern lol_html_rewriter_directive_t callbackTextChunk(lol_html_text_chunk_t *text_chunk, void *user_data); 14 | 15 | extern lol_html_rewriter_directive_t callbackElement(lol_html_element_t *element, void *user_data); 16 | 17 | extern lol_html_rewriter_directive_t callbackDocumentEnd(lol_html_doc_end_t *doc_end, void *user_data); 18 | 19 | void callback_sink(const char *chunk, size_t chunk_len, void *user_data) { 20 | return callbackSink(chunk, chunk_len, user_data); 21 | } 22 | 23 | lol_html_rewriter_directive_t callback_doctype(lol_html_doctype_t *doctype, void *user_data) { 24 | return callbackDoctype(doctype, user_data); 25 | } 26 | 27 | lol_html_rewriter_directive_t callback_comment(lol_html_comment_t *comment, void *user_data) { 28 | return callbackComment(comment, user_data); 
29 | } 30 | 31 | lol_html_rewriter_directive_t callback_text_chunk(lol_html_text_chunk_t *text_chunk, void *user_data) { 32 | return callbackTextChunk(text_chunk, user_data); 33 | } 34 | 35 | lol_html_rewriter_directive_t callback_element(lol_html_element_t *element, void *user_data){ 36 | return callbackElement(element, user_data); 37 | } 38 | 39 | lol_html_rewriter_directive_t callback_doc_end(lol_html_doc_end_t *doc_end, void *user_data) { 40 | return callbackDocumentEnd(doc_end, user_data); 41 | } 42 | */ 43 | import "C" 44 | -------------------------------------------------------------------------------- /comment.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import "unsafe" 9 | 10 | // Comment represents an HTML comment. 11 | type Comment C.lol_html_comment_t 12 | 13 | // CommentHandlerFunc is a callback handler function to do something with a Comment. 14 | // Expected to return a RewriterDirective as instruction to continue or stop. 15 | type CommentHandlerFunc func(*Comment) RewriterDirective 16 | 17 | // Text returns the comment's text. 18 | func (c *Comment) Text() string { 19 | textC := (str)(C.lol_html_comment_text_get((*C.lol_html_comment_t)(c))) 20 | defer textC.Free() 21 | return textC.String() 22 | } 23 | 24 | // SetText sets the comment's text and returns an error if there is one. 
25 | func (c *Comment) SetText(text string) error { 26 | textC := C.CString(text) 27 | defer C.free(unsafe.Pointer(textC)) 28 | textLen := len(text) 29 | errCode := C.lol_html_comment_text_set((*C.lol_html_comment_t)(c), textC, C.size_t(textLen)) 30 | if errCode == 0 { 31 | return nil 32 | } 33 | return getError() 34 | } 35 | 36 | type commentAlter int 37 | 38 | const ( 39 | commentInsertBefore commentAlter = iota 40 | commentInsertAfter 41 | commentReplace 42 | ) 43 | 44 | func (c *Comment) alter(content string, alter commentAlter, isHTML bool) error { 45 | contentC := C.CString(content) 46 | defer C.free(unsafe.Pointer(contentC)) 47 | contentLen := len(content) 48 | var errCode C.int 49 | switch alter { 50 | case commentInsertBefore: 51 | errCode = C.lol_html_comment_before((*C.lol_html_comment_t)(c), contentC, C.size_t(contentLen), C.bool(isHTML)) 52 | case commentInsertAfter: 53 | errCode = C.lol_html_comment_after((*C.lol_html_comment_t)(c), contentC, C.size_t(contentLen), C.bool(isHTML)) 54 | case commentReplace: 55 | errCode = C.lol_html_comment_replace((*C.lol_html_comment_t)(c), contentC, C.size_t(contentLen), C.bool(isHTML)) 56 | default: 57 | panic("not implemented") 58 | } 59 | if errCode == 0 { 60 | return nil 61 | } 62 | return getError() 63 | } 64 | 65 | // InsertBeforeAsText inserts the given content before the comment. 66 | // 67 | // The rewriter will HTML-escape the content before insertion: 68 | // 69 | // `<` will be replaced with `&lt;` 70 | // 71 | // `>` will be replaced with `&gt;` 72 | // 73 | // `&` will be replaced with `&amp;` 74 | func (c *Comment) InsertBeforeAsText(content string) error { 75 | return c.alter(content, commentInsertBefore, false) 76 | } 77 | 78 | // InsertBeforeAsHTML inserts the given content before the comment. 79 | // The content is inserted as is.
80 | func (c *Comment) InsertBeforeAsHTML(content string) error { 81 | return c.alter(content, commentInsertBefore, true) 82 | } 83 | 84 | // InsertAfterAsText inserts the given content after the comment. 85 | // 86 | // The rewriter will HTML-escape the content before insertion: 87 | // 88 | // `<` will be replaced with `&lt;` 89 | // 90 | // `>` will be replaced with `&gt;` 91 | // 92 | // `&` will be replaced with `&amp;` 93 | func (c *Comment) InsertAfterAsText(content string) error { 94 | return c.alter(content, commentInsertAfter, false) 95 | } 96 | 97 | // InsertAfterAsHTML inserts the given content after the comment. 98 | // The content is inserted as is. 99 | func (c *Comment) InsertAfterAsHTML(content string) error { 100 | return c.alter(content, commentInsertAfter, true) 101 | } 102 | 103 | // ReplaceAsText replaces the comment with the supplied content. 104 | // 105 | // The rewriter will HTML-escape the content: 106 | // 107 | // `<` will be replaced with `&lt;` 108 | // 109 | // `>` will be replaced with `&gt;` 110 | // 111 | // `&` will be replaced with `&amp;` 112 | func (c *Comment) ReplaceAsText(content string) error { 113 | return c.alter(content, commentReplace, false) 114 | } 115 | 116 | // ReplaceAsHTML replaces the comment with the supplied content. 117 | // The content is kept as is. 118 | func (c *Comment) ReplaceAsHTML(content string) error { 119 | return c.alter(content, commentReplace, true) 120 | } 121 | 122 | // Remove removes the comment. 123 | func (c *Comment) Remove() { 124 | C.lol_html_comment_remove((*C.lol_html_comment_t)(c)) 125 | } 126 | 127 | // IsRemoved returns whether the comment is removed or not.
128 | func (c *Comment) IsRemoved() bool { 129 | return (bool)(C.lol_html_comment_is_removed((*C.lol_html_comment_t)(c))) 130 | } 131 | -------------------------------------------------------------------------------- /comment_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | func TestComment_GetSetText(t *testing.T) { 11 | var buf bytes.Buffer 12 | w, err := lolhtml.NewWriter( 13 | &buf, 14 | &lolhtml.Handlers{ 15 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 16 | { 17 | CommentHandler: func(comment *lolhtml.Comment) lolhtml.RewriterDirective { 18 | if text := comment.Text(); text != "Hey 42" { 19 | t.Errorf("wrong text %s\n", text) 20 | } 21 | if err := comment.SetText("Yo"); err != nil { 22 | t.Errorf("set text error %s\n", err) 23 | } 24 | return lolhtml.Continue 25 | }, 26 | }, 27 | }, 28 | }, 29 | ) 30 | if err != nil { 31 | t.Error(err) 32 | } 33 | 34 | if _, err = w.Write([]byte("")); err != nil { 35 | t.Error(err) 36 | } 37 | if err = w.Close(); err != nil { 38 | t.Error(err) 39 | } 40 | wantedText := "" 41 | if finalText := buf.String(); finalText != wantedText { 42 | t.Errorf("want %s got %s \n", wantedText, finalText) 43 | } 44 | } 45 | 46 | func TestComment_Replace(t *testing.T) { 47 | var buf bytes.Buffer 48 | w, err := lolhtml.NewWriter( 49 | &buf, 50 | &lolhtml.Handlers{ 51 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 52 | { 53 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 54 | if err := c.ReplaceAsHTML(""); err != nil { 55 | t.Error(err) 56 | } 57 | if !c.IsRemoved() { 58 | t.FailNow() 59 | } 60 | return lolhtml.Continue 61 | }, 62 | }, 63 | }, 64 | }, 65 | ) 66 | if err != nil { 67 | t.Error(err) 68 | } 69 | 70 | if _, err := w.Write([]byte("
")); err != nil { 71 | t.Error(err) 72 | } 73 | if err := w.Close(); err != nil { 74 | t.Error(err) 75 | } 76 | wantedText := "
" 77 | if finalText := buf.String(); finalText != wantedText { 78 | t.Errorf("want %s got %s \n", wantedText, finalText) 79 | } 80 | } 81 | 82 | func TestComment_InsertAfter(t *testing.T) { 83 | var buf bytes.Buffer 84 | w, err := lolhtml.NewWriter( 85 | &buf, 86 | &lolhtml.Handlers{ 87 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 88 | { 89 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 90 | if err := c.InsertAfterAsHTML(""); err != nil { 91 | t.Error(err) 92 | } 93 | return lolhtml.Continue 94 | }, 95 | }, 96 | }, 97 | }, 98 | ) 99 | if err != nil { 100 | t.Error(err) 101 | } 102 | 103 | if _, err := w.Write([]byte("
")); err != nil { 104 | t.Error(err) 105 | } 106 | if err := w.Close(); err != nil { 107 | t.Error(err) 108 | } 109 | wantedText := "
" 110 | if finalText := buf.String(); finalText != wantedText { 111 | t.Errorf("want %s got %s \n", wantedText, finalText) 112 | } 113 | } 114 | 115 | func TestComment_Remove(t *testing.T) { 116 | var buf bytes.Buffer 117 | w, err := lolhtml.NewWriter( 118 | &buf, 119 | &lolhtml.Handlers{ 120 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 121 | { 122 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 123 | if c.IsRemoved() { 124 | t.FailNow() 125 | } 126 | c.Remove() 127 | if !c.IsRemoved() { 128 | t.FailNow() 129 | } 130 | return lolhtml.Continue 131 | }, 132 | }, 133 | }, 134 | }, 135 | ) 136 | if err != nil { 137 | t.Error(err) 138 | } 139 | 140 | if _, err := w.Write([]byte("<>")); err != nil { 141 | t.Error(err) 142 | } 143 | if err := w.Close(); err != nil { 144 | t.Error(err) 145 | } 146 | wantedText := "<>" 147 | if finalText := buf.String(); finalText != wantedText { 148 | t.Errorf("want %s got %s \n", wantedText, finalText) 149 | } 150 | } 151 | 152 | func TestComment_InsertBeforeAndAfter(t *testing.T) { 153 | var buf bytes.Buffer 154 | w, err := lolhtml.NewWriter( 155 | &buf, 156 | &lolhtml.Handlers{ 157 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 158 | { 159 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 160 | if err := c.InsertBeforeAsHTML("
"); err != nil { 161 | t.Error(err) 162 | } 163 | if err := c.InsertAfterAsText("
"); err != nil { 164 | t.Error(err) 165 | } 166 | return lolhtml.Continue 167 | }, 168 | }, 169 | }, 170 | }, 171 | ) 172 | if err != nil { 173 | t.Error(err) 174 | } 175 | 176 | if _, err := w.Write([]byte("")); err != nil { 177 | t.Error(err) 178 | } 179 | if err := w.Close(); err != nil { 180 | t.Error(err) 181 | } 182 | wantedText := "
</div>" 183 | if finalText := buf.String(); finalText != wantedText { 184 | t.Errorf("want %s got %s \n", wantedText, finalText) 185 | } 186 | } 187 | 188 | func TestComment_StopRewriting(t *testing.T) { 189 | var buf bytes.Buffer 190 | w, err := lolhtml.NewWriter( 191 | &buf, 192 | &lolhtml.Handlers{ 193 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 194 | { 195 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 196 | return lolhtml.Stop 197 | }, 198 | }, 199 | }, 200 | }, 201 | ) 202 | if err != nil { 203 | t.Error(err) 204 | } 205 | 206 | _, err = w.Write([]byte("
")) 207 | if err == nil { 208 | t.FailNow() 209 | } 210 | if err.Error() != "The rewriter has been stopped." { 211 | t.Error(err) 212 | } 213 | } 214 | 215 | func TestComment_StopRewritingWithSelector(t *testing.T) { 216 | var buf bytes.Buffer 217 | w, err := lolhtml.NewWriter( 218 | &buf, 219 | &lolhtml.Handlers{ 220 | ElementContentHandler: []lolhtml.ElementContentHandler{ 221 | { 222 | Selector: "*", 223 | CommentHandler: func(c *lolhtml.Comment) lolhtml.RewriterDirective { 224 | return lolhtml.Stop 225 | }, 226 | }, 227 | }, 228 | }, 229 | ) 230 | if err != nil { 231 | t.Error(err) 232 | } 233 | 234 | _, err = w.Write([]byte("
")) 235 | if err == nil { 236 | t.FailNow() 237 | } 238 | if err.Error() != "The rewriter has been stopped." { 239 | t.Error(err) 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include "lol_html.h" 5 | */ 6 | import "C" 7 | import ( 8 | "unsafe" 9 | ) 10 | 11 | // Config defines settings for the rewriter. 12 | type Config struct { 13 | // defaults to "utf-8". 14 | Encoding string 15 | // defaults to PreallocatedParsingBufferSize: 1024, MaxAllowedMemoryUsage: 1<<63 - 1. 16 | Memory *MemorySettings 17 | // defaults to func([]byte) {}. In other words, totally discard output. 18 | Sink OutputSink 19 | // defaults to true. If true, bail out for security reasons when ambiguous. 20 | Strict bool 21 | } 22 | 23 | func newDefaultConfig() Config { 24 | return Config{ 25 | Encoding: "utf-8", 26 | Memory: &MemorySettings{ 27 | PreallocatedParsingBufferSize: 1024, 28 | MaxAllowedMemoryUsage: 1<<63 - 1, 29 | }, 30 | Sink: func([]byte) {}, 31 | Strict: true, 32 | } 33 | } 34 | 35 | // MemorySettings sets the memory limitations for the rewriter. 36 | type MemorySettings struct { 37 | PreallocatedParsingBufferSize int // defaults to 1024 38 | MaxAllowedMemoryUsage int // defaults to 1<<63 -1 39 | } 40 | 41 | // OutputSink is a callback function where output is written to. A byte slice is passed each time, 42 | // representing a chunk of output. 43 | // 44 | // Exported for special usages which require each output chunk to be identified and processed 45 | // individually. For most common uses, NewWriter would be more convenient. 46 | type OutputSink func([]byte) 47 | 48 | // DocumentContentHandler is a group of handlers that would be applied to the whole HTML document. 
49 | type DocumentContentHandler struct { 50 | DoctypeHandler DoctypeHandlerFunc 51 | CommentHandler CommentHandlerFunc 52 | TextChunkHandler TextChunkHandlerFunc 53 | DocumentEndHandler DocumentEndHandlerFunc 54 | } 55 | 56 | // ElementContentHandler is a group of handlers that would be applied to the content matched by 57 | // the given selector. 58 | type ElementContentHandler struct { 59 | Selector string 60 | ElementHandler ElementHandlerFunc 61 | CommentHandler CommentHandlerFunc 62 | TextChunkHandler TextChunkHandlerFunc 63 | } 64 | 65 | // Handlers contain DocumentContentHandlers and ElementContentHandlers. Can contain arbitrary numbers 66 | // of them, including zero (nil slice). 67 | type Handlers struct { 68 | DocumentContentHandler []DocumentContentHandler 69 | ElementContentHandler []ElementContentHandler 70 | } 71 | 72 | //export callbackSink 73 | func callbackSink(chunk *C.char, chunkLen C.size_t, userData unsafe.Pointer) { 74 | c := C.GoBytes(unsafe.Pointer(chunk), C.int(chunkLen)) 75 | cb := restorePointer(userData).(OutputSink) 76 | cb(c) 77 | } 78 | 79 | //export callbackDoctype 80 | func callbackDoctype(doctype *Doctype, userData unsafe.Pointer) RewriterDirective { 81 | cb := restorePointer(userData).(DoctypeHandlerFunc) 82 | return cb(doctype) 83 | } 84 | 85 | //export callbackComment 86 | func callbackComment(comment *Comment, userData unsafe.Pointer) RewriterDirective { 87 | cb := restorePointer(userData).(CommentHandlerFunc) 88 | return cb(comment) 89 | } 90 | 91 | //export callbackTextChunk 92 | func callbackTextChunk(textChunk *TextChunk, userData unsafe.Pointer) RewriterDirective { 93 | cb := restorePointer(userData).(TextChunkHandlerFunc) 94 | return cb(textChunk) 95 | } 96 | 97 | //export callbackElement 98 | func callbackElement(element *Element, userData unsafe.Pointer) RewriterDirective { 99 | cb := restorePointer(userData).(ElementHandlerFunc) 100 | return cb(element) 101 | } 102 | 103 | //export callbackDocumentEnd 104 | func 
callbackDocumentEnd(documentEnd *DocumentEnd, userData unsafe.Pointer) RewriterDirective { 105 | cb := restorePointer(userData).(DocumentEndHandlerFunc) 106 | return cb(documentEnd) 107 | } 108 | -------------------------------------------------------------------------------- /const.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | import "C" 4 | 5 | // RewriterDirective is a "status code“ that should be returned by callback handlers, to inform the 6 | // rewriter to continue or stop parsing. 7 | type RewriterDirective int 8 | 9 | const ( 10 | // Continue lets the normal parsing process continue. 11 | Continue RewriterDirective = iota 12 | 13 | // Stop stops the rewriter immediately. Content currently buffered is discarded, and an error is returned. 14 | // After stopping, the Writer should not be used anymore except for Close(). 15 | Stop 16 | ) 17 | -------------------------------------------------------------------------------- /doctype.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include "lol_html.h" 5 | */ 6 | import "C" 7 | 8 | // Doctype represents the document's doctype. 9 | type Doctype C.lol_html_doctype_t 10 | 11 | // DoctypeHandlerFunc is a callback handler function to do something with a Comment. 12 | type DoctypeHandlerFunc func(*Doctype) RewriterDirective 13 | 14 | // Name returns doctype name. 15 | func (d *Doctype) Name() string { 16 | nameC := (*str)(C.lol_html_doctype_name_get((*C.lol_html_doctype_t)(d))) 17 | defer nameC.Free() 18 | return nameC.String() 19 | } 20 | 21 | // PublicID returns doctype public ID. 22 | func (d *Doctype) PublicID() string { 23 | nameC := (*str)(C.lol_html_doctype_public_id_get((*C.lol_html_doctype_t)(d))) 24 | defer nameC.Free() 25 | return nameC.String() 26 | } 27 | 28 | // SystemID returns doctype system ID. 
29 | func (d *Doctype) SystemID() string { 30 | nameC := (*str)(C.lol_html_doctype_system_id_get((*C.lol_html_doctype_t)(d))) 31 | defer nameC.Free() 32 | return nameC.String() 33 | } 34 | -------------------------------------------------------------------------------- /doctype_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/coolspring8/go-lolhtml" 7 | ) 8 | 9 | func TestDoctype_GetDoctypeFields(t *testing.T) { 10 | w, err := lolhtml.NewWriter( 11 | nil, 12 | &lolhtml.Handlers{ 13 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 14 | { 15 | DoctypeHandler: func(doctype *lolhtml.Doctype) lolhtml.RewriterDirective { 16 | if name := doctype.Name(); name != "math" { 17 | t.Errorf("wrong doctype name %s\n", name) 18 | } 19 | if publicId := doctype.PublicID(); publicId != "" { 20 | t.Errorf("wrong doctype name %s\n", publicId) 21 | } 22 | if systemId := doctype.SystemID(); systemId != "http://www.w3.org/Math/DTD/mathml1/mathml.dtd" { 23 | t.Errorf("wrong doctype name %s\n", systemId) 24 | } 25 | return lolhtml.Continue 26 | }, 27 | }, 28 | }, 29 | }, 30 | ) 31 | if err != nil { 32 | t.Error(err) 33 | } 34 | 35 | _, err = w.Write([]byte(``)) 36 | if err != nil { 37 | t.Error(err) 38 | } 39 | err = w.Close() 40 | if err != nil { 41 | t.Error(err) 42 | } 43 | } 44 | 45 | func TestDoctype_StopRewriting(t *testing.T) { 46 | w, err := lolhtml.NewWriter( 47 | nil, 48 | &lolhtml.Handlers{ 49 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 50 | { 51 | DoctypeHandler: func(d *lolhtml.Doctype) lolhtml.RewriterDirective { 52 | return lolhtml.Stop 53 | }, 54 | }, 55 | }, 56 | }, 57 | ) 58 | if err != nil { 59 | t.Error(err) 60 | } 61 | 62 | _, err = w.Write([]byte("")) 63 | if err == nil { 64 | t.FailNow() 65 | } 66 | if err.Error() != "The rewriter has been stopped." 
{ 67 | t.Error(err) 68 | } 69 | err = w.Close() 70 | if err == nil { 71 | t.FailNow() 72 | } 73 | if err.Error() != "The rewriter has been stopped." { 74 | t.Error(err) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /documentend.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import "unsafe" 9 | 10 | // DocumentEnd represents the end of the document. 11 | type DocumentEnd C.lol_html_doc_end_t 12 | 13 | // DocumentEndHandlerFunc is a callback handler function to do something with a DocumentEnd. 14 | type DocumentEndHandlerFunc func(*DocumentEnd) RewriterDirective 15 | 16 | // AppendAsText appends the given content at the end of the document. 17 | // 18 | // The rewriter will HTML-escape the content before appending: 19 | // 20 | // `<` will be replaced with `<` 21 | // 22 | // `>` will be replaced with `>` 23 | // 24 | // `&` will be replaced with `&` 25 | func (d *DocumentEnd) AppendAsText(content string) error { 26 | contentC := C.CString(content) 27 | defer C.free(unsafe.Pointer(contentC)) 28 | contentLen := len(content) 29 | errCode := C.lol_html_doc_end_append((*C.lol_html_doc_end_t)(d), contentC, C.size_t(contentLen), false) 30 | if errCode == 0 { 31 | return nil 32 | } 33 | return getError() 34 | } 35 | 36 | // AppendAsHTML appends the given content at the end of the document. 37 | // The content is appended as is. 
38 | func (d *DocumentEnd) AppendAsHTML(content string) error { 39 | contentC := C.CString(content) 40 | defer C.free(unsafe.Pointer(contentC)) 41 | contentLen := len(content) 42 | errCode := C.lol_html_doc_end_append((*C.lol_html_doc_end_t)(d), contentC, C.size_t(contentLen), true) 43 | if errCode == 0 { 44 | return nil 45 | } 46 | return getError() 47 | } 48 | -------------------------------------------------------------------------------- /documentend_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | func TestDocumentEnd_AppendToEmptyDoc(t *testing.T) { 11 | var buf bytes.Buffer 12 | w, err := lolhtml.NewWriter( 13 | &buf, 14 | &lolhtml.Handlers{ 15 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 16 | { 17 | DocumentEndHandler: func(docEnd *lolhtml.DocumentEnd) lolhtml.RewriterDirective { 18 | if err := docEnd.AppendAsHTML(""); err != nil { 19 | t.Error(err) 20 | } 21 | if err := docEnd.AppendAsText("hello & world"); err != nil { 22 | t.Error(err) 23 | } 24 | return lolhtml.Continue 25 | }, 26 | }, 27 | }, 28 | }, 29 | ) 30 | if err != nil { 31 | t.Error(err) 32 | } 33 | 34 | if _, err = w.Write([]byte("")); err != nil { 35 | t.Error(err) 36 | } 37 | if err = w.Close(); err != nil { 38 | t.Error(err) 39 | } 40 | wantedText := "hello & world" 41 | if finalText := buf.String(); finalText != wantedText { 42 | t.Errorf("want %s got %s \n", wantedText, finalText) 43 | } 44 | } 45 | 46 | func TestDocumentEnd_AppendAtEnd(t *testing.T) { 47 | var buf bytes.Buffer 48 | w, err := lolhtml.NewWriter( 49 | &buf, 50 | &lolhtml.Handlers{ 51 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 52 | { 53 | DocumentEndHandler: func(docEnd *lolhtml.DocumentEnd) lolhtml.RewriterDirective { 54 | if err := docEnd.AppendAsHTML(""); err != nil { 55 | t.Error(err) 56 | } 57 | if err := 
docEnd.AppendAsText("hello & world"); err != nil { 58 | t.Error(err) 59 | } 60 | return lolhtml.Continue 61 | }, 62 | }, 63 | }, 64 | }, 65 | ) 66 | if err != nil { 67 | t.Error(err) 68 | } 69 | 70 | if _, err = w.Write([]byte("
Hello
")); err != nil { 71 | t.Error(err) 72 | } 73 | if err = w.Close(); err != nil { 74 | t.Error(err) 75 | } 76 | wantedText := "
Hello
hello & world" 77 | if finalText := buf.String(); finalText != wantedText { 78 | t.Errorf("want %s got %s \n", wantedText, finalText) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /element.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import ( 9 | "errors" 10 | "unsafe" 11 | ) 12 | 13 | // Element represents an HTML element. 14 | type Element C.lol_html_element_t 15 | 16 | // ElementHandlerFunc is a callback handler function to do something with an Element. 17 | type ElementHandlerFunc func(*Element) RewriterDirective 18 | 19 | // TagName gets the element's tag name. 20 | func (e *Element) TagName() string { 21 | tagNameC := (str)(C.lol_html_element_tag_name_get((*C.lol_html_element_t)(e))) 22 | defer tagNameC.Free() 23 | return tagNameC.String() 24 | } 25 | 26 | // SetTagName sets the element's tag name. 27 | func (e *Element) SetTagName(name string) error { 28 | nameC := C.CString(name) 29 | defer C.free(unsafe.Pointer(nameC)) 30 | nameLen := len(name) 31 | errCode := C.lol_html_element_tag_name_set((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen)) 32 | if errCode == 0 { 33 | return nil 34 | } 35 | return getError() 36 | } 37 | 38 | // NamespaceURI gets the element's namespace URI. 39 | func (e *Element) NamespaceURI() string { 40 | // don't need to be freed 41 | namespaceURIC := C.lol_html_element_namespace_uri_get((*C.lol_html_element_t)(e)) 42 | return C.GoString(namespaceURIC) 43 | } 44 | 45 | // AttributeIterator returns a pointer to an AttributeIterator. Can be used to iterate 46 | // over all attributes of the element. 47 | func (e *Element) AttributeIterator() *AttributeIterator { 48 | return (*AttributeIterator)(C.lol_html_attributes_iterator_get((*C.lol_html_element_t)(e))) 49 | } 50 | 51 | // AttributeValue returns the value of the attribute on this element. 
52 | func (e *Element) AttributeValue(name string) (string, error) { 53 | nameC := C.CString(name) 54 | defer C.free(unsafe.Pointer(nameC)) 55 | nameLen := len(name) 56 | valueC := (*str)(C.lol_html_element_get_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen))) 57 | defer valueC.Free() 58 | // always check error, so not using getError() 59 | errC := (*str)(C.lol_html_take_last_error()) 60 | defer errC.Free() 61 | errMsg := errC.String() 62 | if errMsg != "" { 63 | return "", errors.New(errMsg) 64 | } 65 | return valueC.String(), nil 66 | } 67 | 68 | // HasAttribute returns whether the element has the attribute of this name or not. 69 | func (e *Element) HasAttribute(name string) (bool, error) { 70 | nameC := C.CString(name) 71 | defer C.free(unsafe.Pointer(nameC)) 72 | nameLen := len(name) 73 | codeC := C.lol_html_element_has_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen)) 74 | if codeC == 1 { 75 | return true, nil 76 | } else if codeC == 0 { 77 | return false, nil 78 | } 79 | return false, getError() 80 | } 81 | 82 | // SetAttribute updates or creates the attribute with name and value on the element. 83 | func (e *Element) SetAttribute(name string, value string) error { 84 | nameC := C.CString(name) 85 | defer C.free(unsafe.Pointer(nameC)) 86 | nameLen := len(name) 87 | valueC := C.CString(value) 88 | defer C.free(unsafe.Pointer(valueC)) 89 | valueLen := len(value) 90 | errCode := C.lol_html_element_set_attribute( 91 | (*C.lol_html_element_t)(e), 92 | nameC, 93 | C.size_t(nameLen), 94 | valueC, 95 | C.size_t(valueLen), 96 | ) 97 | if errCode == 0 { 98 | return nil 99 | } 100 | return getError() 101 | } 102 | 103 | // RemoveAttribute removes the attribute with the name from the element. 
104 | func (e *Element) RemoveAttribute(name string) error { 105 | nameC := C.CString(name) 106 | defer C.free(unsafe.Pointer(nameC)) 107 | nameLen := len(name) 108 | errCode := C.lol_html_element_remove_attribute((*C.lol_html_element_t)(e), nameC, C.size_t(nameLen)) 109 | if errCode == 0 { 110 | return nil 111 | } 112 | return getError() 113 | } 114 | 115 | type elementAlter int 116 | 117 | const ( 118 | elementInsertBeforeStartTag elementAlter = iota 119 | elementInsertAfterStartTag 120 | elementInsertBeforeEndTag 121 | elementInsertAfterEndTag 122 | elementSetInnerContent 123 | elementReplace 124 | ) 125 | 126 | func (e *Element) alter(content string, alter elementAlter, isHTML bool) error { 127 | contentC := C.CString(content) 128 | defer C.free(unsafe.Pointer(contentC)) 129 | contentLen := len(content) 130 | var errCode C.int 131 | switch alter { 132 | case elementInsertBeforeStartTag: 133 | errCode = C.lol_html_element_before((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 134 | case elementInsertAfterStartTag: 135 | errCode = C.lol_html_element_prepend((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 136 | case elementInsertBeforeEndTag: 137 | errCode = C.lol_html_element_append((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 138 | case elementInsertAfterEndTag: 139 | errCode = C.lol_html_element_after((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 140 | case elementSetInnerContent: 141 | errCode = C.lol_html_element_set_inner_content((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 142 | case elementReplace: 143 | errCode = C.lol_html_element_replace((*C.lol_html_element_t)(e), contentC, C.size_t(contentLen), C.bool(isHTML)) 144 | default: 145 | panic("not implemented") 146 | } 147 | if errCode == 0 { 148 | return nil 149 | } 150 | return getError() 151 | } 152 | 153 | // InsertBeforeStartTagAsText inserts the given 
content before the element's start tag. 154 | // 155 | // The rewriter will HTML-escape the content before insertion: 156 | // 157 | // `<` will be replaced with `<` 158 | // 159 | // `>` will be replaced with `>` 160 | // 161 | // `&` will be replaced with `&` 162 | func (e *Element) InsertBeforeStartTagAsText(content string) error { 163 | return e.alter(content, elementInsertBeforeStartTag, false) 164 | } 165 | 166 | // InsertBeforeStartTagAsHTML inserts the given content before the element's start tag. 167 | // The content is inserted as is. 168 | func (e *Element) InsertBeforeStartTagAsHTML(content string) error { 169 | return e.alter(content, elementInsertBeforeStartTag, true) 170 | } 171 | 172 | // InsertAfterStartTagAsText inserts (prepend) the given content after the element's start tag. 173 | // 174 | // The rewriter will HTML-escape the content before insertion: 175 | // 176 | // `<` will be replaced with `<` 177 | // 178 | // `>` will be replaced with `>` 179 | // 180 | // `&` will be replaced with `&` 181 | func (e *Element) InsertAfterStartTagAsText(content string) error { 182 | return e.alter(content, elementInsertAfterStartTag, false) 183 | } 184 | 185 | // InsertAfterStartTagAsHTML inserts (prepend) the given content after the element's start tag. 186 | // The content is inserted as is. 187 | func (e *Element) InsertAfterStartTagAsHTML(content string) error { 188 | return e.alter(content, elementInsertAfterStartTag, true) 189 | } 190 | 191 | // InsertBeforeEndTagAsText inserts (append) the given content after the element's end tag. 
192 | // 193 | // The rewriter will HTML-escape the content before insertion: 194 | // 195 | // `<` will be replaced with `<` 196 | // 197 | // `>` will be replaced with `>` 198 | // 199 | // `&` will be replaced with `&` 200 | func (e *Element) InsertBeforeEndTagAsText(content string) error { 201 | return e.alter(content, elementInsertBeforeEndTag, false) 202 | } 203 | 204 | // InsertBeforeEndTagAsHTML inserts (append) the given content before the element's end tag. 205 | // The content is inserted as is. 206 | func (e *Element) InsertBeforeEndTagAsHTML(content string) error { 207 | return e.alter(content, elementInsertBeforeEndTag, true) 208 | } 209 | 210 | // InsertAfterEndTagAsText inserts the given content after the element's end tag. 211 | // 212 | // The rewriter will HTML-escape the content before insertion: 213 | // 214 | // `<` will be replaced with `<` 215 | // 216 | // `>` will be replaced with `>` 217 | // 218 | // `&` will be replaced with `&` 219 | func (e *Element) InsertAfterEndTagAsText(content string) error { 220 | return e.alter(content, elementInsertAfterEndTag, false) 221 | } 222 | 223 | // InsertAfterEndTagAsHTML inserts the given content after the element's end tag. 224 | // The content is inserted as is. 225 | func (e *Element) InsertAfterEndTagAsHTML(content string) error { 226 | return e.alter(content, elementInsertAfterEndTag, true) 227 | } 228 | 229 | // SetInnerContentAsText overwrites the element's inner content. 230 | // 231 | // The rewriter will HTML-escape the content: 232 | // 233 | // `<` will be replaced with `<` 234 | // 235 | // `>` will be replaced with `>` 236 | // 237 | // `&` will be replaced with `&` 238 | func (e *Element) SetInnerContentAsText(content string) error { 239 | return e.alter(content, elementSetInnerContent, false) 240 | } 241 | 242 | // SetInnerContentAsHTML overwrites the element's inner content. 243 | // The content is kept as is. 
244 | func (e *Element) SetInnerContentAsHTML(content string) error { 245 | return e.alter(content, elementSetInnerContent, true) 246 | } 247 | 248 | // ReplaceAsText replace the whole element with the supplied content. 249 | // 250 | // The rewriter will HTML-escape the content: 251 | // 252 | // `<` will be replaced with `<` 253 | // 254 | // `>` will be replaced with `>` 255 | // 256 | // `&` will be replaced with `&` 257 | func (e *Element) ReplaceAsText(content string) error { 258 | return e.alter(content, elementReplace, false) 259 | } 260 | 261 | // ReplaceAsHTML replace the whole element with the supplied content. 262 | // The content is kept as is. 263 | func (e *Element) ReplaceAsHTML(content string) error { 264 | return e.alter(content, elementReplace, true) 265 | } 266 | 267 | // Remove completely removes the element. 268 | func (e *Element) Remove() { 269 | C.lol_html_element_remove((*C.lol_html_element_t)(e)) 270 | } 271 | 272 | // RemoveAndKeepContent removes the element but keeps the inner content. 273 | func (e *Element) RemoveAndKeepContent() { 274 | C.lol_html_element_remove_and_keep_content((*C.lol_html_element_t)(e)) 275 | } 276 | 277 | // IsRemoved returns whether the element is removed or not. 
278 | func (e *Element) IsRemoved() bool { 279 | return (bool)(C.lol_html_element_is_removed((*C.lol_html_element_t)(e))) 280 | } 281 | -------------------------------------------------------------------------------- /element_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | func TestElement_ModifyTagName(t *testing.T) { 11 | var buf bytes.Buffer 12 | w, err := lolhtml.NewWriter( 13 | &buf, 14 | &lolhtml.Handlers{ 15 | ElementContentHandler: []lolhtml.ElementContentHandler{ 16 | { 17 | Selector: "*", 18 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 19 | wantName := "div" 20 | if name := e.TagName(); name != wantName { 21 | t.Errorf("got %s want %s\n", name, wantName) 22 | } 23 | err := e.SetTagName("") 24 | if err == nil { 25 | t.FailNow() 26 | } 27 | if err.Error() != "Tag name can't be empty." { 28 | t.Error(err) 29 | } 30 | if err = e.SetTagName("span"); err != nil { 31 | t.Error(err) 32 | } 33 | return lolhtml.Continue 34 | }, 35 | }, 36 | }, 37 | }, 38 | ) 39 | if err != nil { 40 | t.Error(err) 41 | } 42 | 43 | if _, err = w.Write([]byte("Hi
")); err != nil { 44 | t.Error(err) 45 | } 46 | if err = w.Close(); err != nil { 47 | t.Error(err) 48 | } 49 | wantedText := "Hi " 50 | if finalText := buf.String(); finalText != wantedText { 51 | t.Errorf("want %s got %s \n", wantedText, finalText) 52 | } 53 | } 54 | 55 | func TestElement_ModifyAttributes(t *testing.T) { 56 | var buf bytes.Buffer 57 | w, err := lolhtml.NewWriter( 58 | &buf, 59 | &lolhtml.Handlers{ 60 | ElementContentHandler: []lolhtml.ElementContentHandler{ 61 | { 62 | Selector: "*", 63 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 64 | has, err := e.HasAttribute("foo") 65 | if err != nil { 66 | t.Error(err) 67 | } 68 | if !has { 69 | t.FailNow() 70 | } 71 | has, err = e.HasAttribute("Bar") 72 | if err != nil { 73 | t.Error(err) 74 | } 75 | if has { 76 | t.FailNow() 77 | } 78 | 79 | a, err := e.AttributeValue("foo") 80 | if err != nil { 81 | t.Error(err) 82 | } 83 | wantValue := "42" 84 | if a != wantValue { 85 | t.Errorf("got %s; want %s", a, wantValue) 86 | } 87 | a, err = e.AttributeValue("Bar") 88 | if err != nil { 89 | t.Error(err) 90 | } 91 | if a != "" { 92 | t.Errorf("got %s; want empty", a) 93 | } 94 | 95 | if err := e.SetAttribute("Bar", "hey"); err != nil { 96 | t.Error(err) 97 | } 98 | 99 | if err := e.RemoveAttribute("foo"); err != nil { 100 | t.Error(err) 101 | } 102 | 103 | return lolhtml.Continue 104 | }, 105 | }, 106 | }, 107 | }, 108 | ) 109 | if err != nil { 110 | t.Error(err) 111 | } 112 | 113 | if _, err = w.Write([]byte("")); err != nil { 114 | t.Error(err) 115 | } 116 | if err = w.Close(); err != nil { 117 | t.Error(err) 118 | } 119 | wantedText := "" 120 | if finalText := buf.String(); finalText != wantedText { 121 | t.Errorf("want %s got %s \n", wantedText, finalText) 122 | } 123 | } 124 | 125 | func TestElement_InsertContentAroundElement(t *testing.T) { 126 | var buf bytes.Buffer 127 | w, err := lolhtml.NewWriter( 128 | &buf, 129 | &lolhtml.Handlers{ 130 | ElementContentHandler: 
[]lolhtml.ElementContentHandler{ 131 | { 132 | Selector: "*", 133 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 134 | if err := e.InsertBeforeStartTagAsText("&before"); err != nil { 135 | t.Error(err) 136 | } 137 | if err := e.InsertAfterStartTagAsHTML(""); err != nil { 138 | t.Error(err) 139 | } 140 | if err := e.InsertBeforeEndTagAsHTML(""); err != nil { 141 | t.Error(err) 142 | } 143 | if err := e.InsertAfterEndTagAsText("&after"); err != nil { 144 | t.Error(err) 145 | } 146 | return lolhtml.Continue 147 | }, 148 | }, 149 | }, 150 | }, 151 | ) 152 | if err != nil { 153 | t.Error(err) 154 | } 155 | 156 | if _, err = w.Write([]byte("
Hi
")); err != nil { 157 | t.Error(err) 158 | } 159 | if err = w.Close(); err != nil { 160 | t.Error(err) 161 | } 162 | wantedText := "&before
Hi
&after" 163 | if finalText := buf.String(); finalText != wantedText { 164 | t.Errorf("want %s got %s \n", wantedText, finalText) 165 | } 166 | } 167 | 168 | func TestElement_SetInnerContent(t *testing.T) { 169 | var buf bytes.Buffer 170 | w, err := lolhtml.NewWriter( 171 | &buf, 172 | &lolhtml.Handlers{ 173 | ElementContentHandler: []lolhtml.ElementContentHandler{ 174 | { 175 | Selector: "div", 176 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 177 | if err := e.SetInnerContentAsText("hey & ya"); err != nil { 178 | t.Error(err) 179 | } 180 | return lolhtml.Continue 181 | }, 182 | }, 183 | }, 184 | }, 185 | ) 186 | if err != nil { 187 | t.Error(err) 188 | } 189 | 190 | if _, err = w.Write([]byte("
42
")); err != nil { 191 | t.Error(err) 192 | } 193 | if err = w.Close(); err != nil { 194 | t.Error(err) 195 | } 196 | wantedText := "
hey & ya
" 197 | if finalText := buf.String(); finalText != wantedText { 198 | t.Errorf("want %s got %s \n", wantedText, finalText) 199 | } 200 | } 201 | 202 | func TestElement_Replace(t *testing.T) { 203 | var buf bytes.Buffer 204 | w, err := lolhtml.NewWriter( 205 | &buf, 206 | &lolhtml.Handlers{ 207 | ElementContentHandler: []lolhtml.ElementContentHandler{ 208 | { 209 | Selector: "div", 210 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 211 | if err := e.ReplaceAsHTML("hey & ya"); err != nil { 212 | t.Error(err) 213 | } 214 | return lolhtml.Continue 215 | }, 216 | }, 217 | }, 218 | }, 219 | ) 220 | if err != nil { 221 | t.Error(err) 222 | } 223 | 224 | if _, err = w.Write([]byte("
42

Hello
good bye

Hello2

")); err != nil { 225 | t.Error(err) 226 | } 227 | if err = w.Close(); err != nil { 228 | t.Error(err) 229 | } 230 | wantedText := "hey & ya

Hellohey & ya

Hello2

" 231 | if finalText := buf.String(); finalText != wantedText { 232 | t.Errorf("want %s got %s \n", wantedText, finalText) 233 | } 234 | } 235 | 236 | func TestElement_Remove(t *testing.T) { 237 | var buf bytes.Buffer 238 | w, err := lolhtml.NewWriter( 239 | &buf, 240 | &lolhtml.Handlers{ 241 | ElementContentHandler: []lolhtml.ElementContentHandler{ 242 | { 243 | Selector: "h1", 244 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 245 | if e.IsRemoved() { 246 | t.FailNow() 247 | } 248 | e.Remove() 249 | if !e.IsRemoved() { 250 | t.FailNow() 251 | } 252 | return lolhtml.Continue 253 | }, 254 | }, 255 | }, 256 | }, 257 | ) 258 | if err != nil { 259 | t.Error(err) 260 | } 261 | 262 | if _, err = w.Write([]byte("
42

Hello

Hello2

")); err != nil { 263 | t.Error(err) 264 | } 265 | if err = w.Close(); err != nil { 266 | t.Error(err) 267 | } 268 | wantedText := "
42

Hello2

" 269 | if finalText := buf.String(); finalText != wantedText { 270 | t.Errorf("want %s got %s \n", wantedText, finalText) 271 | } 272 | } 273 | 274 | func TestElement_RemoveElementAndKeepContent(t *testing.T) { 275 | var buf bytes.Buffer 276 | w, err := lolhtml.NewWriter( 277 | &buf, 278 | &lolhtml.Handlers{ 279 | ElementContentHandler: []lolhtml.ElementContentHandler{ 280 | { 281 | Selector: "h2", 282 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 283 | if e.IsRemoved() { 284 | t.FailNow() 285 | } 286 | e.RemoveAndKeepContent() 287 | if !e.IsRemoved() { 288 | t.FailNow() 289 | } 290 | return lolhtml.Continue 291 | }, 292 | }, 293 | }, 294 | }, 295 | ) 296 | if err != nil { 297 | t.Error(err) 298 | } 299 | 300 | if _, err = w.Write([]byte("
42

Hello1

Hello

Hello2

")); err != nil { 301 | t.Error(err) 302 | } 303 | if err = w.Close(); err != nil { 304 | t.Error(err) 305 | } 306 | wantedText := "
42Hello1

Hello

Hello2" 307 | if finalText := buf.String(); finalText != wantedText { 308 | t.Errorf("want %s got %s \n", wantedText, finalText) 309 | } 310 | } 311 | 312 | func TestElement_GetEmptyElementAttribute(t *testing.T) { 313 | var buf bytes.Buffer 314 | w, err := lolhtml.NewWriter( 315 | &buf, 316 | &lolhtml.Handlers{ 317 | ElementContentHandler: []lolhtml.ElementContentHandler{ 318 | { 319 | Selector: "span", 320 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 321 | has, err := e.HasAttribute("foo") 322 | if err != nil { 323 | t.Error(err) 324 | } 325 | if !has { 326 | t.FailNow() 327 | } 328 | value, err := e.AttributeValue("foo") 329 | if err != nil { 330 | t.Error(err) 331 | } 332 | if value != "" { 333 | t.Errorf("got %s; want empty", value) 334 | } 335 | return lolhtml.Continue 336 | }, 337 | }, 338 | }, 339 | }, 340 | ) 341 | if err != nil { 342 | t.Error(err) 343 | } 344 | 345 | if _, err = w.Write([]byte("")); err != nil { 346 | t.Error(err) 347 | } 348 | if err = w.Close(); err != nil { 349 | t.Error(err) 350 | } 351 | wantedText := "" 352 | if finalText := buf.String(); finalText != wantedText { 353 | t.Errorf("want %s got %s \n", wantedText, finalText) 354 | } 355 | } 356 | 357 | func TestElement_IterateAttributes(t *testing.T) { 358 | w, err := lolhtml.NewWriter( 359 | nil, 360 | &lolhtml.Handlers{ 361 | ElementContentHandler: []lolhtml.ElementContentHandler{ 362 | { 363 | Selector: "*", 364 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 365 | ai := e.AttributeIterator() 366 | 367 | a := ai.Next() 368 | if name := a.Name(); name != "foo" { 369 | t.Errorf("got %s; want foo", name) 370 | } 371 | if value := a.Value(); value != "42" { 372 | t.Errorf("got %s; want foo", value) 373 | } 374 | 375 | a = ai.Next() 376 | if name := a.Name(); name != "bar" { 377 | t.Errorf("got %s; want bar", name) 378 | } 379 | if value := a.Value(); value != "1337" { 380 | t.Errorf("got %s; want 1337", value) 381 | } 382 | 383 | a = 
ai.Next() 384 | if a != nil { 385 | t.FailNow() 386 | } 387 | 388 | return lolhtml.Continue 389 | }, 390 | }, 391 | }, 392 | }, 393 | ) 394 | if err != nil { 395 | t.Error(err) 396 | } 397 | 398 | if _, err = w.Write([]byte("
")); err != nil { 399 | t.Error(err) 400 | } 401 | if err = w.Close(); err != nil { 402 | t.Error(err) 403 | } 404 | } 405 | 406 | func TestElement_AssertNsIsHtml(t *testing.T) { 407 | w, err := lolhtml.NewWriter( 408 | nil, 409 | &lolhtml.Handlers{ 410 | ElementContentHandler: []lolhtml.ElementContentHandler{ 411 | { 412 | Selector: "script", 413 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 414 | wantedText := "http://www.w3.org/1999/xhtml" 415 | if ns := e.NamespaceURI(); ns != wantedText { 416 | t.Errorf("got %s; want %s", ns, wantedText) 417 | } 418 | return lolhtml.Continue 419 | }, 420 | }, 421 | }, 422 | }, 423 | ) 424 | if err != nil { 425 | t.Error(err) 426 | } 427 | 428 | if _, err = w.Write([]byte("")); err != nil { 429 | t.Error(err) 430 | } 431 | if err = w.Close(); err != nil { 432 | t.Error(err) 433 | } 434 | } 435 | 436 | func TestElement_AssertNsIsSvg(t *testing.T) { 437 | w, err := lolhtml.NewWriter( 438 | nil, 439 | &lolhtml.Handlers{ 440 | ElementContentHandler: []lolhtml.ElementContentHandler{ 441 | { 442 | Selector: "script", 443 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 444 | wantedText := "http://www.w3.org/2000/svg" 445 | if ns := e.NamespaceURI(); ns != wantedText { 446 | t.Errorf("got %s; want %s", ns, wantedText) 447 | } 448 | return lolhtml.Continue 449 | }, 450 | }, 451 | }, 452 | }, 453 | ) 454 | if err != nil { 455 | t.Error(err) 456 | } 457 | 458 | if _, err = w.Write([]byte("")); err != nil { 459 | t.Error(err) 460 | } 461 | if err = w.Close(); err != nil { 462 | t.Error(err) 463 | } 464 | } 465 | 466 | func TestElement_StopRewriting(t *testing.T) { 467 | w, err := lolhtml.NewWriter( 468 | nil, 469 | &lolhtml.Handlers{ 470 | ElementContentHandler: []lolhtml.ElementContentHandler{ 471 | { 472 | Selector: "span", 473 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 474 | return lolhtml.Stop 475 | }, 476 | }, 477 | }, 478 | }, 479 | ) 480 | if err != nil { 
481 | t.Error(err) 482 | } 483 | 484 | _, err = w.Write([]byte("")) 485 | if err == nil { 486 | t.FailNow() 487 | } 488 | if err.Error() != "The rewriter has been stopped." { 489 | t.Error(err) 490 | } 491 | err = w.Close() 492 | if err == nil { 493 | t.FailNow() 494 | } 495 | if err.Error() != "The rewriter has been stopped." { 496 | t.Error(err) 497 | } 498 | } 499 | -------------------------------------------------------------------------------- /error.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include "lol_html.h" 5 | */ 6 | import "C" 7 | import "errors" 8 | 9 | // ErrCannotGetErrorMessage indicates getting error code from lol_html, but unable to acquire the concrete 10 | // error message. 11 | var ErrCannotGetErrorMessage = errors.New("cannot get error message from underlying lol_html lib") 12 | 13 | // getError is a helper function that gets error message for the last function call. 14 | // You should make sure there is an error when calling this, or the function interprets 15 | // the NULL error message obtained as ErrCannotGetErrorMessage. 16 | func getError() error { 17 | errC := (*str)(C.lol_html_take_last_error()) 18 | defer errC.Free() 19 | if errMsg := errC.String(); errMsg != "" { 20 | return errors.New(errMsg) 21 | } 22 | return ErrCannotGetErrorMessage 23 | } 24 | -------------------------------------------------------------------------------- /error_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | // TestNullErrorStr tests internal functions for handling a null lol_html_str_t, by calling lol_html_take_last_error() 11 | // when there is no error. 
12 | func TestNullErrorStr(t *testing.T) { 13 | err := lolhtml.GetError() 14 | if !errors.Is(err, lolhtml.ErrCannotGetErrorMessage) { 15 | t.Error(err) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | // This file is for demonstration in godoc. For more examples, see the /examples directory. 2 | package lolhtml_test 3 | 4 | import ( 5 | "bytes" 6 | "fmt" 7 | "io" 8 | "log" 9 | "os" 10 | "strings" 11 | 12 | "github.com/coolspring8/go-lolhtml" 13 | ) 14 | 15 | func ExampleNewWriter() { 16 | chunk := []byte("Hello, World!") 17 | r := bytes.NewReader(chunk) 18 | w, err := lolhtml.NewWriter( 19 | // output to stdout 20 | os.Stdout, 21 | &lolhtml.Handlers{ 22 | ElementContentHandler: []lolhtml.ElementContentHandler{ 23 | { 24 | Selector: "span", 25 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 26 | err := e.SetInnerContentAsText("LOL-HTML") 27 | if err != nil { 28 | log.Fatal(err) 29 | } 30 | return lolhtml.Continue 31 | }, 32 | }, 33 | }, 34 | }, 35 | ) 36 | if err != nil { 37 | log.Fatal(err) 38 | } 39 | 40 | // copy from the bytes reader to lolhtml writer 41 | _, err = io.Copy(w, r) 42 | if err != nil { 43 | log.Fatal(err) 44 | } 45 | 46 | // explicitly close the writer and flush the remaining content 47 | err = w.Close() 48 | if err != nil { 49 | log.Fatal(err) 50 | } 51 | // Output: Hello, LOL-HTML! 52 | } 53 | 54 | func ExampleRewriteString() { 55 | output, err := lolhtml.RewriteString( 56 | `
`, 57 | &lolhtml.Handlers{ 58 | ElementContentHandler: []lolhtml.ElementContentHandler{ 59 | { 60 | Selector: "a[href]", 61 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 62 | href, err := e.AttributeValue("href") 63 | if err != nil { 64 | log.Fatal(err) 65 | } 66 | href = strings.ReplaceAll(href, "http:", "https:") 67 | 68 | err = e.SetAttribute("href", href) 69 | if err != nil { 70 | log.Fatal(err) 71 | } 72 | 73 | return lolhtml.Continue 74 | }, 75 | }, 76 | }, 77 | }, 78 | ) 79 | if err != nil { 80 | log.Fatal(err) 81 | } 82 | 83 | fmt.Println(output) 84 | // Output:
85 | } 86 | -------------------------------------------------------------------------------- /examples/defer-scripts/main.go: -------------------------------------------------------------------------------- 1 | // Usage: curl -NL https://git.io/JeOSZ | go run main.go 2 | package main 3 | 4 | import ( 5 | "io" 6 | "log" 7 | "os" 8 | 9 | "github.com/coolspring8/go-lolhtml" 10 | ) 11 | 12 | func main() { 13 | w, err := lolhtml.NewWriter( 14 | os.Stdout, 15 | &lolhtml.Handlers{ 16 | ElementContentHandler: []lolhtml.ElementContentHandler{ 17 | { 18 | Selector: "script[src]:not([async]):not([defer])", 19 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 20 | err := e.SetAttribute("defer", "") 21 | if err != nil { 22 | log.Fatal(err) 23 | } 24 | return lolhtml.Continue 25 | }, 26 | }, 27 | }, 28 | }, 29 | ) 30 | if err != nil { 31 | log.Fatal(err) 32 | } 33 | 34 | _, err = io.Copy(w, os.Stdin) 35 | if err != nil { 36 | log.Fatal(err) 37 | } 38 | 39 | err = w.Close() 40 | if err != nil { 41 | log.Fatal(err) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /examples/mixed-content-rewriter/main.go: -------------------------------------------------------------------------------- 1 | // Usage: curl -NL https://git.io/JeOSZ | go run main.go 2 | package main 3 | 4 | import ( 5 | "io" 6 | "log" 7 | "os" 8 | "strings" 9 | 10 | "github.com/coolspring8/go-lolhtml" 11 | ) 12 | 13 | func main() { 14 | w, err := lolhtml.NewWriter( 15 | os.Stdout, 16 | &lolhtml.Handlers{ 17 | ElementContentHandler: []lolhtml.ElementContentHandler{ 18 | { 19 | Selector: "a[href], link[rel=stylesheet][href]", 20 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 21 | rewriteUrlInAttribute(e, "href") 22 | return lolhtml.Continue 23 | }, 24 | }, 25 | { 26 | Selector: "script[src], iframe[src], img[src], audio[src], video[src]", 27 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 28 | 
rewriteUrlInAttribute(e, "src") 29 | return lolhtml.Continue 30 | }, 31 | }, 32 | }, 33 | }, 34 | ) 35 | if err != nil { 36 | log.Fatal(err) 37 | } 38 | 39 | _, err = io.Copy(w, os.Stdin) 40 | if err != nil { 41 | log.Fatal(err) 42 | } 43 | 44 | err = w.Close() 45 | if err != nil { 46 | log.Fatal(err) 47 | } 48 | } 49 | 50 | func rewriteUrlInAttribute(e *lolhtml.Element, attributeName string) { 51 | attr, err := e.AttributeValue(attributeName) 52 | if err != nil { 53 | log.Fatal(err) 54 | } 55 | attr = strings.ReplaceAll(attr, "http://", "https://") 56 | 57 | err = e.SetAttribute(attributeName, attr) 58 | if err != nil { 59 | log.Fatal(err) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /examples/web-scraper/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Web Scraper · By Adam Schwartz · Powered by Cloudflare Workers® 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 238 | 239 | 240 |
241 |
242 |
243 |

244 | 245 |
246 |
247 |
248 |
249 |
250 | 251 |
252 | 253 |
254 |
255 |
256 | 257 |
258 | 259 |
260 |
261 |
262 |
263 | 264 | 265 |
266 |
267 | 268 | 269 |
270 |
271 |
272 |
273 |
274 | 275 | 276 |
277 |
278 |
279 |
280 | 281 |
282 | 283 |
284 |
285 |
286 | 287 | 288 |
289 |
290 |
291 |
292 |
293 |
294 | 295 | Permalink 296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 | 323 |
324 |
325 | 458 | 459 | -------------------------------------------------------------------------------- /examples/web-scraper/main.go: -------------------------------------------------------------------------------- 1 | // This is a ported Go version of https://web.scraper.workers.dev/, whose source code is 2 | // available at https://github.com/adamschwartz/web.scraper.workers.dev licensed under MIT. 3 | // 4 | // This translation is for demonstration purpose only, so many parts of the code are suboptimal. 5 | // 6 | // Sometimes you may get a "different" result, as Go's encoding/json package always sorts the 7 | // keys of a map (when using multiple selectors), and encodes a nil slice as the null JSON value. 8 | package main 9 | 10 | import ( 11 | "encoding/json" 12 | "fmt" 13 | "io" 14 | "log" 15 | "net/http" 16 | "regexp" 17 | "strings" 18 | 19 | "github.com/coolspring8/go-lolhtml" 20 | ) 21 | 22 | var ( 23 | debug = true 24 | listenAddress = ":80" 25 | mainPageFileName = "index.html" 26 | ) 27 | 28 | var ( 29 | urlHasPrefix = regexp.MustCompile(`^[a-zA-Z]+://`) 30 | unifyWhitespace = regexp.MustCompile(`\s{2,}`) 31 | ) 32 | 33 | // used to separate texts in different elements. 
34 | var textSeparator = "TEXT_SEPARATOR_TEXT_SEPARATOR" 35 | 36 | func main() { 37 | log.Printf("Server started at %s", listenAddress) 38 | http.HandleFunc("/", handler) 39 | log.Fatal(http.ListenAndServe(listenAddress, nil)) 40 | } 41 | 42 | func handler(w http.ResponseWriter, req *http.Request) { 43 | log.Println(req.URL) 44 | 45 | // 404 46 | if req.URL.Path != "/" { 47 | w.WriteHeader(http.StatusNotFound) 48 | _, _ = w.Write([]byte("Not found")) 49 | return 50 | } 51 | 52 | q := req.URL.Query() 53 | 54 | url := q.Get("url") 55 | if url != "" && !urlHasPrefix.MatchString(url) { 56 | url = "http://" + url 57 | } 58 | 59 | selector := q.Get("selector") 60 | 61 | attr := q.Get("attr") 62 | 63 | var spaced bool 64 | _spaced := q.Get("spaced") 65 | if _spaced != "" { 66 | spaced = true 67 | } else { 68 | spaced = false 69 | } 70 | 71 | var pretty bool 72 | _pretty := q.Get("pretty") 73 | if _pretty != "" { 74 | pretty = true 75 | } else { 76 | pretty = false 77 | } 78 | 79 | // home page 80 | if url == "" && selector == "" { 81 | http.ServeFile(w, req, mainPageFileName) 82 | return 83 | } 84 | 85 | // text or attr: get text, part 1/2 86 | handlers := lolhtml.Handlers{} 87 | // matches and selectors are used by text scraper 88 | matches := make(map[string][]string) 89 | var selectors []string 90 | _selectors := strings.Split(selector, ",") 91 | for _, s := range _selectors { 92 | selectors = append(selectors, strings.TrimSpace(s)) 93 | } 94 | // attrValue is used by attribute scraper 95 | var attrValue string 96 | if attr == "" { 97 | nextText := make(map[string]string) 98 | 99 | for _, s := range selectors { 100 | s := s 101 | handlers.ElementContentHandler = append( 102 | handlers.ElementContentHandler, 103 | lolhtml.ElementContentHandler{ 104 | Selector: s, 105 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 106 | matches[s] = append(matches[s], textSeparator) 107 | nextText[s] = "" 108 | return lolhtml.Continue 109 | }, 110 | 
TextChunkHandler: func(t *lolhtml.TextChunk) lolhtml.RewriterDirective { 111 | nextText[s] += t.Content() 112 | if t.IsLastInTextNode() { 113 | if spaced { 114 | nextText[s] += " " 115 | } 116 | matches[s] = append(matches[s], nextText[s]) 117 | nextText[s] = "" 118 | } 119 | return lolhtml.Continue 120 | }, 121 | }, 122 | ) 123 | } 124 | } else { 125 | handlers = lolhtml.Handlers{ 126 | ElementContentHandler: []lolhtml.ElementContentHandler{ 127 | { 128 | Selector: selector, 129 | ElementHandler: func(e *lolhtml.Element) lolhtml.RewriterDirective { 130 | attrValue, _ = e.AttributeValue(attr) 131 | return lolhtml.Stop 132 | }, 133 | }, 134 | }, 135 | } 136 | } 137 | 138 | lolWriter, err := lolhtml.NewWriter( 139 | nil, 140 | &handlers, 141 | ) 142 | if err != nil { 143 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 144 | return 145 | } 146 | 147 | // fetch target page content 148 | resp, err := http.Get(url) 149 | if err != nil { 150 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 151 | return 152 | } 153 | if resp.StatusCode != http.StatusOK { 154 | sendError(w, http.StatusBadGateway, fmt.Sprintf("Status %d requesting %s", resp.StatusCode, url), pretty) 155 | return 156 | } 157 | defer resp.Body.Close() 158 | 159 | // might be confusing 160 | _, err = io.Copy(lolWriter, resp.Body) 161 | if err != nil && err.Error() != "The rewriter has been stopped." { 162 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 163 | return 164 | } 165 | if err == nil || err.Error() != "The rewriter has been stopped." 
{ 166 | err = lolWriter.Close() 167 | if err != nil { 168 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 169 | return 170 | } 171 | } 172 | 173 | // text or attr: post-process texts, part 2/2 174 | if attr == "" { 175 | for _, s := range selectors { 176 | var nodeCompleteTexts []string 177 | nextText := "" 178 | 179 | for _, text := range matches[s] { 180 | if text == textSeparator { 181 | if strings.TrimSpace(nextText) != "" { 182 | nodeCompleteTexts = append(nodeCompleteTexts, cleanText(nextText)) 183 | nextText = "" 184 | } 185 | } else { 186 | nextText += text 187 | } 188 | } 189 | 190 | lastText := cleanText(nextText) 191 | if lastText != "" { 192 | nodeCompleteTexts = append(nodeCompleteTexts, lastText) 193 | } 194 | matches[s] = nodeCompleteTexts 195 | } 196 | } 197 | 198 | w.WriteHeader(http.StatusOK) 199 | 200 | enc := json.NewEncoder(w) 201 | enc.SetEscapeHTML(false) 202 | if pretty { 203 | enc.SetIndent("", " ") 204 | } 205 | 206 | if attr == "" { 207 | err = enc.Encode(Response{Result: matches}) 208 | } else { 209 | err = enc.Encode(Response{Result: attrValue}) 210 | } 211 | if err != nil { 212 | sendError(w, http.StatusInternalServerError, err.Error(), pretty) 213 | return 214 | } 215 | } 216 | 217 | type Response struct { 218 | Result interface{} `json:"result,omitempty"` 219 | Error string `json:"error,omitempty"` 220 | } 221 | 222 | func sendError(w http.ResponseWriter, statusCode int, errorText string, pretty bool) { 223 | w.WriteHeader(statusCode) 224 | 225 | enc := json.NewEncoder(w) 226 | enc.SetEscapeHTML(false) 227 | if pretty { 228 | enc.SetIndent("", " ") 229 | } 230 | 231 | // redact concrete error message if debug != true 232 | if !debug && statusCode == http.StatusInternalServerError { 233 | errorText = "Internal server error" 234 | } 235 | 236 | err := enc.Encode(Response{Error: errorText}) 237 | if err != nil { 238 | _, _ = w.Write([]byte(errorText)) 239 | } 240 | } 241 | 242 | func cleanText(s string) string { 243 
| return unifyWhitespace.ReplaceAllString(strings.TrimSpace(s), " ") 244 | } 245 | -------------------------------------------------------------------------------- /export_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | // just export some internal functions for tests 4 | 5 | var GetError = getError 6 | var NewSelector = newSelector 7 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/coolspring8/go-lolhtml 2 | 3 | go 1.15 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoolSpring8/go-lolhtml/2cb4478586ff392fe240b42831045f1ac74232c1/go.sum -------------------------------------------------------------------------------- /lolhtml.go: -------------------------------------------------------------------------------- 1 | // Package lolhtml provides the ability to parse and rewrite HTML on the fly, 2 | // with a CSS-selector based API. 3 | // 4 | // It is a binding for the Rust crate lol_html. 5 | // https://github.com/cloudflare/lol-html 6 | // 7 | // Please see /examples subdirectory for more detailed examples. 
8 | package lolhtml 9 | 10 | /* 11 | #cgo CFLAGS:-I${SRCDIR}/build/include 12 | #cgo LDFLAGS:-llolhtml 13 | #cgo !windows LDFLAGS:-lm 14 | #cgo linux,amd64 LDFLAGS:-L${SRCDIR}/build/linux-x86_64 15 | #cgo darwin,amd64 LDFLAGS:-L${SRCDIR}/build/macos-x86_64 16 | #cgo windows,amd64 LDFLAGS:-L${SRCDIR}/build/windows-x86_64 17 | #include <stdlib.h> 18 | #include "lol_html.h" 19 | */ 20 | import "C" 21 | -------------------------------------------------------------------------------- /pointer.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | // Credit to https://github.com/mattn/go-pointer. 4 | 5 | // #include <stdlib.h> 6 | import "C" 7 | import ( 8 | "sync" 9 | "unsafe" 10 | ) 11 | 12 | // sync.Map documentation states that it is optimized for "when the entry for a given key is only 13 | // ever written once but read many times, as in caches that only grow". My benchmarks show that sync.Map 14 | // version rewriter is slower in single-goroutine calls, but faster when used in multiple goroutines 15 | // (and personally I think the latter is more important).
var store sync.Map

// savePointer registers v in store under a freshly malloc'ed one-byte C
// allocation and returns that allocation's address as the lookup key.
// A nil v is not registered and yields a nil key.
func savePointer(v interface{}) unsafe.Pointer {
	if v == nil {
		return nil
	}

	key := C.malloc(C.size_t(1))
	if key == nil {
		panic(`can't allocate "cgo-pointer hack index pointer": ptr == nil`)
	}
	store.Store(key, v)

	return key
}

// restorePointer returns the value registered under ptr, or nil when ptr is
// nil or nothing is registered under it.
func restorePointer(ptr unsafe.Pointer) (v interface{}) {
	if ptr != nil {
		// Load yields (nil, false) for an unknown key, so v stays nil then.
		v, _ = store.Load(ptr)
	}
	return v
}

// unrefPointer removes ptr from store and frees its C allocation.
// A nil ptr is ignored.
func unrefPointer(ptr unsafe.Pointer) {
	if ptr != nil {
		store.Delete(ptr)
		C.free(ptr)
	}
}

// unrefPointers calls unrefPointer on every element of ptrs.
func unrefPointers(ptrs []unsafe.Pointer) {
	for i := range ptrs {
		unrefPointer(ptrs[i])
	}
}
59 | -------------------------------------------------------------------------------- /rewriter.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // rewriter represents an actual HTML rewriter. 13 | // rewriterBuilder, rewriter and selector are kept private to simplify public API. 14 | // If you find it useful to use them publicly, please inform me.
type rewriter struct {
	rewriter *C.lol_html_rewriter_t // handle of the underlying lol_html rewriter
	pointers []unsafe.Pointer       // released through unrefPointers in Free
	// TODO: unrecoverable bool
}

// Write passes p to lol_html_rewriter_write. It returns len(p) on success and
// 0 together with the error obtained from getError otherwise.
func (r *rewriter) Write(p []byte) (n int, err error) {
	size := len(p)
	if size == 0 {
		// &p[0] needs at least one element; the length handed to C stays 0.
		p = []byte("\x00")
	}
	data := (*C.char)(unsafe.Pointer(&p[0]))
	if C.lol_html_rewriter_write(r.rewriter, data, C.size_t(size)) != 0 {
		return 0, getError()
	}
	return size, nil
}

// WriteString is the string counterpart of Write. The chunk is copied into a
// temporary C string that is freed before returning.
func (r *rewriter) WriteString(chunk string) (n int, err error) {
	cChunk := C.CString(chunk)
	defer C.free(unsafe.Pointer(cChunk))
	size := len(chunk)
	if C.lol_html_rewriter_write(r.rewriter, cChunk, C.size_t(size)) != 0 {
		return 0, getError()
	}
	return size, nil
}

// End calls lol_html_rewriter_end and returns the error obtained from
// getError when the call reports a non-zero code.
func (r *rewriter) End() error {
	if C.lol_html_rewriter_end(r.rewriter) != 0 {
		return getError()
	}
	return nil
}

// Free releases the underlying rewriter and the pointers recorded for it.
// Calling it on a nil receiver does nothing.
func (r *rewriter) Free() {
	if r == nil {
		return
	}
	C.lol_html_rewriter_free(r.rewriter)
	unrefPointers(r.pointers)
}
60 | -------------------------------------------------------------------------------- /rewriter_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/coolspring8/go-lolhtml" 7 | ) 8 | 9 | func TestRewriter_NonAsciiEncoding(t *testing.T) { 10 | w, err := lolhtml.NewWriter( 11 | nil, 12 | nil, 13 | lolhtml.Config{ 14 | Encoding: "UTF-16", 15 | Memory: &lolhtml.MemorySettings{ 16 | PreallocatedParsingBufferSize: 0, 17 | MaxAllowedMemoryUsage: 16, 18 | }, 19 | Strict: true, 20 | }) 21 | if w != nil || err == nil { 22 | t.FailNow() 23 | } 24 | if err.Error() != "Expected ASCII-compatible encoding."
{ 25 | t.Error(err) 26 | } 27 | err = w.Close() 28 | if err != nil { 29 | t.Error(err) 30 | } 31 | } 32 | 33 | func TestRewriter_MemoryLimiting(t *testing.T) { 34 | w, err := lolhtml.NewWriter( 35 | nil, 36 | &lolhtml.Handlers{ 37 | ElementContentHandler: []lolhtml.ElementContentHandler{ 38 | { 39 | "span", 40 | nil, 41 | nil, 42 | nil, 43 | }, 44 | }, 45 | }, 46 | lolhtml.Config{ 47 | Encoding: "utf-8", 48 | Memory: &lolhtml.MemorySettings{ 49 | PreallocatedParsingBufferSize: 0, 50 | MaxAllowedMemoryUsage: 5, 51 | }, 52 | Strict: true, 53 | }, 54 | ) 55 | if err != nil { 56 | t.Error(err) 57 | } 58 | _, err = w.Write([]byte("len from size_t (uint) to int (int32) on 32-bit machines? 25 | func (s *str) String() string { 26 | if s == nil { 27 | return "" 28 | } 29 | return C.GoStringN(s.data, C.int(s.len)) 30 | } 31 | 32 | func (s *textChunkContent) String() string { 33 | //var nullTextChunkContent textChunkContent 34 | //if s == nullTextChunkContent { 35 | // return "" 36 | //} 37 | if s == nil { 38 | return "" 39 | } 40 | return C.GoStringN(s.data, C.int(s.len)) 41 | } 42 | -------------------------------------------------------------------------------- /textchunk.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | /* 4 | #include 5 | #include "lol_html.h" 6 | */ 7 | import "C" 8 | import "unsafe" 9 | 10 | // TextChunk represents a text chunk. 11 | type TextChunk C.lol_html_text_chunk_t 12 | 13 | // TextChunkHandlerFunc is a callback handler function to do something with a TextChunk. 14 | type TextChunkHandlerFunc func(*TextChunk) RewriterDirective 15 | 16 | // Content returns the text chunk's content. 17 | func (t *TextChunk) Content() string { 18 | text := (textChunkContent)(C.lol_html_text_chunk_content_get((*C.lol_html_text_chunk_t)(t))) 19 | return text.String() 20 | } 21 | 22 | // IsLastInTextNode returns whether the text chunk is the last in the text node. 
23 | func (t *TextChunk) IsLastInTextNode() bool { 24 | return (bool)(C.lol_html_text_chunk_is_last_in_text_node((*C.lol_html_text_chunk_t)(t))) 25 | } 26 | 27 | type textChunkAlter int 28 | 29 | const ( 30 | textChunkInsertBefore textChunkAlter = iota 31 | textChunkInsertAfter 32 | textChunkReplace 33 | ) 34 | 35 | func (t *TextChunk) alter(content string, alter textChunkAlter, isHTML bool) error { 36 | contentC := C.CString(content) 37 | defer C.free(unsafe.Pointer(contentC)) 38 | contentLen := len(content) 39 | var errCode C.int 40 | switch alter { 41 | case textChunkInsertBefore: 42 | errCode = C.lol_html_text_chunk_before((*C.lol_html_text_chunk_t)(t), contentC, C.size_t(contentLen), C.bool(isHTML)) 43 | case textChunkInsertAfter: 44 | errCode = C.lol_html_text_chunk_after((*C.lol_html_text_chunk_t)(t), contentC, C.size_t(contentLen), C.bool(isHTML)) 45 | case textChunkReplace: 46 | errCode = C.lol_html_text_chunk_replace((*C.lol_html_text_chunk_t)(t), contentC, C.size_t(contentLen), C.bool(isHTML)) 47 | default: 48 | panic("not implemented") 49 | } 50 | if errCode == 0 { 51 | return nil 52 | } 53 | return getError() 54 | } 55 | 56 | // InsertBeforeAsText inserts the given content before the text chunk. 57 | // 58 | // The rewriter will HTML-escape the content before insertion: 59 | // 60 | // `<` will be replaced with `<` 61 | // 62 | // `>` will be replaced with `>` 63 | // 64 | // `&` will be replaced with `&` 65 | func (t *TextChunk) InsertBeforeAsText(content string) error { 66 | return t.alter(content, textChunkInsertBefore, false) 67 | } 68 | 69 | // InsertBeforeAsHTML inserts the given content before the text chunk. 70 | // The content is inserted as is. 71 | func (t *TextChunk) InsertBeforeAsHTML(content string) error { 72 | return t.alter(content, textChunkInsertBefore, true) 73 | } 74 | 75 | // InsertAfterAsText inserts the given content after the text chunk. 
76 | // 77 | // The rewriter will HTML-escape the content before insertion: 78 | // 79 | // `<` will be replaced with `<` 80 | // 81 | // `>` will be replaced with `>` 82 | // 83 | // `&` will be replaced with `&` 84 | func (t *TextChunk) InsertAfterAsText(content string) error { 85 | return t.alter(content, textChunkInsertAfter, false) 86 | } 87 | 88 | // InsertAfterAsHTML inserts the given content after the text chunk. 89 | // The content is inserted as is. 90 | func (t *TextChunk) InsertAfterAsHTML(content string) error { 91 | return t.alter(content, textChunkInsertAfter, true) 92 | } 93 | 94 | // ReplaceAsText replace the text chunk with the supplied content. 95 | // 96 | // The rewriter will HTML-escape the content: 97 | // 98 | // `<` will be replaced with `<` 99 | // 100 | // `>` will be replaced with `>` 101 | // 102 | // `&` will be replaced with `&` 103 | func (t *TextChunk) ReplaceAsText(content string) error { 104 | return t.alter(content, textChunkReplace, false) 105 | } 106 | 107 | // ReplaceAsHTML replace the text chunk with the supplied content. 108 | // The content is kept as is. 109 | func (t *TextChunk) ReplaceAsHTML(content string) error { 110 | return t.alter(content, textChunkReplace, true) 111 | } 112 | 113 | // Remove removes the text chunk. 114 | func (t *TextChunk) Remove() { 115 | C.lol_html_text_chunk_remove((*C.lol_html_text_chunk_t)(t)) 116 | } 117 | 118 | // IsRemoved returns whether the text chunk is removed or not. 
119 | func (t *TextChunk) IsRemoved() bool { 120 | return (bool)(C.lol_html_text_chunk_is_removed((*C.lol_html_text_chunk_t)(t))) 121 | } 122 | -------------------------------------------------------------------------------- /textchunk_test.go: -------------------------------------------------------------------------------- 1 | package lolhtml_test 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/coolspring8/go-lolhtml" 8 | ) 9 | 10 | func TestTextChunk_InsertBeforeAndAfter(t *testing.T) { 11 | var buf bytes.Buffer 12 | w, err := lolhtml.NewWriter( 13 | &buf, 14 | &lolhtml.Handlers{ 15 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 16 | { 17 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 18 | content := tc.Content() 19 | if len(content) > 0 { 20 | if content != "Hey 42" { 21 | t.Errorf("got %s, want Hey 42", content) 22 | } 23 | if tc.IsLastInTextNode() { 24 | t.Error("text chunk last in text node flag incorrect, expected false, got true") 25 | } 26 | if tc.IsRemoved() { 27 | t.Error("text chunk removed flag incorrect, expected false, got true") 28 | } 29 | if err := tc.InsertBeforeAsHTML("
"); err != nil { 30 | t.Error(err) 31 | } 32 | if err := tc.InsertAfterAsText("
"); err != nil { 33 | t.Error(err) 34 | } 35 | } else { 36 | if !tc.IsLastInTextNode() { 37 | t.Error("text chunk last in text node flag incorrect, expected true, got false") 38 | } 39 | } 40 | return lolhtml.Continue 41 | }, 42 | }, 43 | }, 44 | }, 45 | ) 46 | if err != nil { 47 | t.Error(err) 48 | } 49 | 50 | if _, err := w.Write([]byte("Hey 42")); err != nil { 51 | t.Error(err) 52 | } 53 | if err := w.Close(); err != nil { 54 | t.Error(err) 55 | } 56 | wantedText := "
Hey 42</div>" 57 | if finalText := buf.String(); finalText != wantedText { 58 | t.Errorf("want %s got %s \n", wantedText, finalText) 59 | } 60 | } 61 | 62 | func TestTextChunk_Replace(t *testing.T) { 63 | var buf bytes.Buffer 64 | w, err := lolhtml.NewWriter( 65 | &buf, 66 | &lolhtml.Handlers{ 67 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 68 | { 69 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 70 | if len(tc.Content()) > 0 { 71 | if err := tc.ReplaceAsHTML(""); err != nil { 72 | t.Error(err) 73 | } 74 | if !tc.IsRemoved() { 75 | t.FailNow() 76 | } 77 | } 78 | return lolhtml.Continue 79 | }, 80 | }, 81 | }, 82 | }, 83 | ) 84 | if err != nil { 85 | t.Error(err) 86 | } 87 | 88 | if _, err := w.Write([]byte("
Hello
")); err != nil { 89 | t.Error(err) 90 | } 91 | if err := w.Close(); err != nil { 92 | t.Error(err) 93 | } 94 | wantedText := "
" 95 | if finalText := buf.String(); finalText != wantedText { 96 | t.Errorf("want %s got %s \n", wantedText, finalText) 97 | } 98 | } 99 | 100 | func TestTextChunk_InsertAfter(t *testing.T) { 101 | var buf bytes.Buffer 102 | w, err := lolhtml.NewWriter( 103 | &buf, 104 | &lolhtml.Handlers{ 105 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 106 | { 107 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 108 | if len(tc.Content()) > 0 { 109 | if err := tc.InsertAfterAsHTML(""); err != nil { 110 | t.Error(err) 111 | } 112 | } 113 | return lolhtml.Continue 114 | }, 115 | }, 116 | }, 117 | }, 118 | ) 119 | if err != nil { 120 | t.Error(err) 121 | } 122 | 123 | if _, err := w.Write([]byte("
hello
")); err != nil { 124 | t.Error(err) 125 | } 126 | if err := w.Close(); err != nil { 127 | t.Error(err) 128 | } 129 | wantedText := "
hello
" 130 | if finalText := buf.String(); finalText != wantedText { 131 | t.Errorf("want %s got %s \n", wantedText, finalText) 132 | } 133 | } 134 | 135 | func TestTextChunk_Remove(t *testing.T) { 136 | var buf bytes.Buffer 137 | w, err := lolhtml.NewWriter( 138 | &buf, 139 | &lolhtml.Handlers{ 140 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 141 | { 142 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 143 | if tc.IsRemoved() { 144 | t.FailNow() 145 | } 146 | tc.Remove() 147 | if !tc.IsRemoved() { 148 | t.FailNow() 149 | } 150 | return lolhtml.Continue 151 | }, 152 | }, 153 | }, 154 | }, 155 | ) 156 | if err != nil { 157 | t.Error(err) 158 | } 159 | 160 | if _, err := w.Write([]byte("0_0")); err != nil { 161 | t.Error(err) 162 | } 163 | if err := w.Close(); err != nil { 164 | t.Error(err) 165 | } 166 | wantedText := "" 167 | if finalText := buf.String(); finalText != wantedText { 168 | t.Errorf("want %s got %s \n", wantedText, finalText) 169 | } 170 | } 171 | 172 | func TestTextChunk_StopRewriting(t *testing.T) { 173 | var buf bytes.Buffer 174 | w, err := lolhtml.NewWriter( 175 | &buf, 176 | &lolhtml.Handlers{ 177 | DocumentContentHandler: []lolhtml.DocumentContentHandler{ 178 | { 179 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 180 | return lolhtml.Stop 181 | }, 182 | }, 183 | }, 184 | }, 185 | ) 186 | if err != nil { 187 | t.Error(err) 188 | } 189 | 190 | _, err = w.Write([]byte("42")) 191 | if err == nil { 192 | t.FailNow() 193 | } 194 | if err.Error() != "The rewriter has been stopped." 
{ 195 | t.Error(err) 196 | } 197 | } 198 | 199 | func TestTextChunk_StopRewritingWithSelector(t *testing.T) { 200 | var buf bytes.Buffer 201 | w, err := lolhtml.NewWriter( 202 | &buf, 203 | &lolhtml.Handlers{ 204 | ElementContentHandler: []lolhtml.ElementContentHandler{ 205 | { 206 | Selector: "*", 207 | TextChunkHandler: func(tc *lolhtml.TextChunk) lolhtml.RewriterDirective { 208 | return lolhtml.Stop 209 | }, 210 | }, 211 | }, 212 | }, 213 | ) 214 | if err != nil { 215 | t.Error(err) 216 | } 217 | 218 | _, err = w.Write([]byte("
42
")) 219 | if err == nil { 220 | t.FailNow() 221 | } 222 | if err.Error() != "The rewriter has been stopped." { 223 | t.Error(err) 224 | } 225 | } 226 | -------------------------------------------------------------------------------- /writer.go: -------------------------------------------------------------------------------- 1 | package lolhtml 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | ) 7 | 8 | // Writer takes data written to it and writes the rewritten form of that data to an 9 | // underlying writer (see NewWriter). 10 | type Writer struct { 11 | w io.Writer 12 | rewriter *rewriter 13 | err error 14 | closed bool 15 | } 16 | 17 | // NewWriter returns a new Writer with Handlers and an optional Config configured. 18 | // Writes to the returned Writer are rewritten and written to w. 19 | // 20 | // It is the caller's responsibility to call Close on the Writer when done. 21 | // Writes may be buffered and not flushed until Close. There is no Flush method, 22 | // so before using the content written by w, it is necessary to call Close 23 | // to ensure w has finished writing. 
24 | func NewWriter(w io.Writer, handlers *Handlers, config ...Config) (*Writer, error) { 25 | var c Config 26 | var sink OutputSink 27 | if config != nil { 28 | c = config[0] 29 | if c.Sink != nil { 30 | sink = c.Sink 31 | } else if w == nil { 32 | sink = func([]byte) {} 33 | } else { 34 | sink = func(p []byte) { 35 | _, _ = w.Write(p) 36 | } 37 | } 38 | } else { 39 | c = newDefaultConfig() 40 | if w == nil { 41 | sink = func([]byte) {} 42 | } else { 43 | sink = func(p []byte) { 44 | _, _ = w.Write(p) 45 | } 46 | } 47 | } 48 | 49 | rb := newRewriterBuilder() 50 | var selectors []*selector 51 | if handlers != nil { 52 | for _, dh := range handlers.DocumentContentHandler { 53 | rb.AddDocumentContentHandlers( 54 | dh.DoctypeHandler, 55 | dh.CommentHandler, 56 | dh.TextChunkHandler, 57 | dh.DocumentEndHandler, 58 | ) 59 | } 60 | for _, eh := range handlers.ElementContentHandler { 61 | s, err := newSelector(eh.Selector) 62 | if err != nil { 63 | return nil, err 64 | } 65 | selectors = append(selectors, s) 66 | rb.AddElementContentHandlers( 67 | s, 68 | eh.ElementHandler, 69 | eh.CommentHandler, 70 | eh.TextChunkHandler, 71 | ) 72 | } 73 | } 74 | r, err := rb.Build(sink, c) 75 | if err != nil { 76 | return nil, err 77 | } 78 | rb.Free() 79 | for _, s := range selectors { 80 | s.Free() 81 | } 82 | 83 | return &Writer{w: w, rewriter: r}, nil 84 | } 85 | 86 | func (w *Writer) Write(p []byte) (n int, err error) { 87 | if w.err != nil { 88 | return 0, w.err 89 | } 90 | if len(p) == 0 { 91 | return 0, nil 92 | } 93 | n, err = w.rewriter.Write(p) 94 | if err != nil { 95 | w.err = err 96 | return 97 | } 98 | return 99 | } 100 | 101 | // WriteString writes a string to the Writer. 
102 | func (w *Writer) WriteString(s string) (n int, err error) { 103 | if w.err != nil { 104 | return 0, w.err 105 | } 106 | if len(s) == 0 { 107 | return 0, nil 108 | } 109 | n, err = w.rewriter.WriteString(s) 110 | if err != nil { 111 | w.err = err 112 | return 113 | } 114 | return 115 | } 116 | 117 | // Close closes the Writer, flushing any unwritten data to the underlying io.Writer, 118 | // but does not close the underlying io.Writer. 119 | // Subsequent calls to Close is a no-op. 120 | func (w *Writer) Close() error { 121 | if w == nil || w.closed { 122 | return nil 123 | } 124 | w.closed = true 125 | if w.err == nil { 126 | w.err = w.rewriter.End() 127 | } 128 | w.rewriter.Free() 129 | return w.err 130 | } 131 | 132 | // RewriteString rewrites the given string with the provided Handlers and Config. 133 | func RewriteString(s string, handlers *Handlers, config ...Config) (string, error) { 134 | var buf bytes.Buffer 135 | var w *Writer 136 | var err error 137 | if config != nil { 138 | w, err = NewWriter(&buf, handlers, config[0]) 139 | } else { 140 | w, err = NewWriter(&buf, handlers) 141 | } 142 | if err != nil { 143 | return "", err 144 | } 145 | 146 | _, err = w.WriteString(s) 147 | if err != nil { 148 | return "", err 149 | } 150 | 151 | err = w.Close() 152 | if err != nil { 153 | return "", err 154 | } 155 | 156 | return buf.String(), nil 157 | } 158 | --------------------------------------------------------------------------------