├── .codeclimate.yml ├── .github └── workflows │ ├── ci.yaml │ └── lint.yaml ├── .golangci.yaml ├── LICENSE ├── PATENTS ├── README.md ├── go.mod ├── strip.go └── strip_test.go /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | engines: 2 | gofmt: 3 | enabled: true 4 | golint: 5 | enabled: true 6 | govet: 7 | enabled: true 8 | 9 | ratings: 10 | paths: 11 | - "**.go" 12 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | workflow_dispatch: 10 | jobs: 11 | test: 12 | strategy: 13 | matrix: 14 | go-version: [1.24.x, 1.16.x] 15 | platform: [ubuntu-latest, macos-latest, windows-latest] 16 | runs-on: ${{ matrix.platform }} 17 | steps: 18 | - name: Install Go 19 | if: success() 20 | uses: actions/setup-go@v2 21 | with: 22 | go-version: ${{ matrix.go-version }} 23 | - name: Checkout code 24 | uses: actions/checkout@v3 25 | - name: Run tests 26 | run: go test -v -covermode=count ./... 
27 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | workflow_dispatch: 4 | jobs: 5 | lint: 6 | strategy: 7 | matrix: 8 | go-version: [1.x] 9 | platform: [ubuntu-latest] 10 | runs-on: ${{ matrix.platform }} 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: golangci-lint 14 | uses: golangci/golangci-lint-action@v3 15 | with: 16 | version: latest 17 | args: --timeout 3m --verbose 18 | -------------------------------------------------------------------------------- /.golangci.yaml: -------------------------------------------------------------------------------- 1 | linters: 2 | enable: 3 | - dogsled 4 | - dupl 5 | - gofmt 6 | - goimports 7 | - gosec 8 | - misspell 9 | - nakedret 10 | - stylecheck 11 | - unconvert 12 | - unparam 13 | - whitespace 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 
16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional IP Rights Grant (Patents) 2 | 3 | "This implementation" means the copyrightable works distributed by 4 | Google as part of the Go project. 5 | 6 | Google hereby grants to You a perpetual, worldwide, non-exclusive, 7 | no-charge, royalty-free, irrevocable (except as stated in this section) 8 | patent license to make, have made, use, offer to sell, sell, import, 9 | transfer and otherwise run, modify and propagate the contents of this 10 | implementation of Go, where such license applies only to those patent 11 | claims, both currently owned or controlled by Google and acquired in 12 | the future, licensable by Google that are necessarily infringed by this 13 | implementation of Go. This grant does not include claims that would be 14 | infringed only as a consequence of further modification of this 15 | implementation. 
If you or your agent or exclusive licensee institute or 16 | order or agree to the institution of patent litigation against any 17 | entity (including a cross-claim or counterclaim in a lawsuit) alleging 18 | that this implementation of Go or any code incorporated within this 19 | implementation of Go constitutes direct or contributory patent 20 | infringement, or inducement of patent infringement, then any patent 21 | rights granted to you under this License for this implementation of Go 22 | shall terminate as of the date such litigation is filed. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | HTML StripTags for Go 2 | ===================== 3 | 4 | [![Used By][used-by-svg]][used-by-url] 5 | [![Build Status][build-status-svg]][build-status-url] 6 | [![Go Report Card][goreport-svg]][goreport-url] 7 | [![Docs][docs-godoc-svg]][docs-godoc-url] 8 | [![License][license-svg]][license-url] 9 | 10 | This is a Go package containing an extracted version of the unexported `stripTags` function in `html/template/html.go`. 11 | 12 | :warning: This package does not protect against untrusted input. Please use [bluemonday](https://github.com/microcosm-cc/bluemonday) if you have untrusted data :warning: 13 | 14 | ## Background 15 | 16 | * The `stripTags` function in `html/template/html.go` is very useful, however, it is not exported. 17 | * Requests were made [on GitHub](https://github.com/golang/go/issues/5884) without success. 18 | * This package is a repo for work done by [Christopher Hesse](https://github.com/christopherhesse) provided in this [Gist](https://gist.github.com/christopherhesse/d422447a086d373a967f). 
19 | 20 | ## Installation 21 | 22 | ```bash 23 | $ go get github.com/grokify/html-strip-tags-go 24 | ``` 25 | 26 | ## Usage 27 | 28 | ```go 29 | import ( 30 | "github.com/grokify/html-strip-tags-go" // => strip 31 | ) 32 | 33 | func main() { 34 | original := "<h1>Hello World</h1>" 35 | stripped := strip.StripTags(original) // => "Hello World" 36 | } 37 | ``` 38 | 39 | [used-by-svg]: https://sourcegraph.com/github.com/grokify/html-strip-tags-go/-/badge.svg 40 | [used-by-url]: https://sourcegraph.com/github.com/grokify/html-strip-tags-go?badge 41 | [goreport-svg]: https://goreportcard.com/badge/github.com/grokify/html-strip-tags-go 42 | [goreport-url]: https://goreportcard.com/report/github.com/grokify/html-strip-tags-go 43 | [build-status-svg]: https://github.com/grokify/html-strip-tags-go/actions/workflows/ci.yaml/badge.svg?branch=master 44 | [build-status-url]: https://github.com/grokify/html-strip-tags-go/actions/workflows/ci.yaml 45 | [coverage-status-svg]: https://coveralls.io/repos/grokify/html-strip-tags-go/badge.svg?branch=master 46 | [coverage-status-url]: https://coveralls.io/r/grokify/html-strip-tags-go?branch=master 47 | [codeclimate-status-svg]: https://codeclimate.com/github/grokify/html-strip-tags-go/badges/gpa.svg 48 | [codeclimate-status-url]: https://codeclimate.com/github/grokify/html-strip-tags-go 49 | [docs-godoc-svg]: https://pkg.go.dev/badge/github.com/grokify/html-strip-tags-go 50 | [docs-godoc-url]: https://pkg.go.dev/github.com/grokify/html-strip-tags-go 51 | [license-svg]: https://img.shields.io/badge/license-BSD--style+patent--grant-blue.svg 52 | [license-url]: https://github.com/grokify/html-strip-tags-go/blob/master/LICENSE 53 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/grokify/html-strip-tags-go 2 | 3 | go 1.16 4 | -------------------------------------------------------------------------------- /strip.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
4 | 5 | package strip 6 | 7 | import ( 8 | "bytes" 9 | "encoding/json" 10 | "fmt" 11 | "html" 12 | "io" 13 | "os" 14 | "path/filepath" 15 | "reflect" 16 | "strings" 17 | "sync" 18 | "text/template" 19 | "text/template/parse" 20 | "unicode" 21 | "unicode/utf8" 22 | ) 23 | 24 | // htmlNospaceEscaper escapes for inclusion in unquoted attribute values. 25 | func htmlNospaceEscaper(args ...interface{}) string { 26 | s, t := stringify(args...) 27 | if t == contentTypeHTML { 28 | return htmlReplacer(StripTags(s), htmlNospaceNormReplacementTable, false) 29 | } 30 | return htmlReplacer(s, htmlNospaceReplacementTable, false) 31 | } 32 | 33 | // attrEscaper escapes for inclusion in quoted attribute values. 34 | func attrEscaper(args ...interface{}) string { 35 | s, t := stringify(args...) 36 | if t == contentTypeHTML { 37 | return htmlReplacer(StripTags(s), htmlNormReplacementTable, true) 38 | } 39 | return htmlReplacer(s, htmlReplacementTable, true) 40 | } 41 | 42 | // rcdataEscaper escapes for inclusion in an RCDATA element body. 43 | func rcdataEscaper(args ...interface{}) string { 44 | s, t := stringify(args...) 45 | if t == contentTypeHTML { 46 | return htmlReplacer(s, htmlNormReplacementTable, true) 47 | } 48 | return htmlReplacer(s, htmlReplacementTable, true) 49 | } 50 | 51 | // htmlEscaper escapes for inclusion in HTML text. 52 | func htmlEscaper(args ...interface{}) string { 53 | s, t := stringify(args...) 54 | if t == contentTypeHTML { 55 | return s 56 | } 57 | return htmlReplacer(s, htmlReplacementTable, true) 58 | } 59 | 60 | // htmlReplacementTable contains the runes that need to be escaped 61 | // inside a quoted attribute value or in a text node. 62 | var htmlReplacementTable = []string{ 63 | // http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state 64 | // U+0000 NULL Parse error. Append a U+FFFD REPLACEMENT 65 | // CHARACTER character to the current attribute's value. 
66 | // " 67 | // and similarly 68 | // http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state 69 | 0: "\uFFFD", 70 | '"': """, 71 | '&': "&", 72 | '\'': "'", 73 | '+': "+", 74 | '<': "<", 75 | '>': ">", 76 | } 77 | 78 | // htmlNormReplacementTable is like htmlReplacementTable but without '&' to 79 | // avoid over-encoding existing entities. 80 | var htmlNormReplacementTable = []string{ 81 | 0: "\uFFFD", 82 | '"': """, 83 | '\'': "'", 84 | '+': "+", 85 | '<': "<", 86 | '>': ">", 87 | } 88 | 89 | // htmlNospaceReplacementTable contains the runes that need to be escaped 90 | // inside an unquoted attribute value. 91 | // The set of runes escaped is the union of the HTML specials and 92 | // those determined by running the JS below in browsers: 93 | //
94 | // 106 | var htmlNospaceReplacementTable = []string{ 107 | 0: "�", 108 | '\t': " ", 109 | '\n': " ", 110 | '\v': " ", 111 | '\f': " ", 112 | '\r': " ", 113 | ' ': " ", 114 | '"': """, 115 | '&': "&", 116 | '\'': "'", 117 | '+': "+", 118 | '<': "<", 119 | '=': "=", 120 | '>': ">", 121 | // A parse error in the attribute value (unquoted) and 122 | // before attribute value states. 123 | // Treated as a quoting character by IE. 124 | '`': "`", 125 | } 126 | 127 | // htmlNospaceNormReplacementTable is like htmlNospaceReplacementTable but 128 | // without '&' to avoid over-encoding existing entities. 129 | var htmlNospaceNormReplacementTable = []string{ 130 | 0: "�", 131 | '\t': " ", 132 | '\n': " ", 133 | '\v': " ", 134 | '\f': " ", 135 | '\r': " ", 136 | ' ': " ", 137 | '"': """, 138 | '\'': "'", 139 | '+': "+", 140 | '<': "<", 141 | '=': "=", 142 | '>': ">", 143 | // A parse error in the attribute value (unquoted) and 144 | // before attribute value states. 145 | // Treated as a quoting character by IE. 146 | '`': "`", 147 | } 148 | 149 | // htmlReplacer returns s with runes replaced according to replacementTable 150 | // and when badRunes is true, certain bad runes are allowed through unescaped. 151 | func htmlReplacer(s string, replacementTable []string, badRunes bool) string { 152 | written, b := 0, new(bytes.Buffer) 153 | for i, r := range s { 154 | if int(r) < len(replacementTable) { 155 | if repl := replacementTable[r]; len(repl) != 0 { 156 | b.WriteString(s[written:i]) 157 | b.WriteString(repl) 158 | // Valid as long as replacementTable doesn't 159 | // include anything above 0x7f. 160 | written = i + utf8.RuneLen(r) 161 | } 162 | } else if badRunes { 163 | // No-op. 164 | // IE does not allow these ranges in unquoted attrs. 
165 | } else if 0xfdd0 <= r && r <= 0xfdef || 0xfff0 <= r && r <= 0xffff { 166 | fmt.Fprintf(b, "%s&#x%x;", s[written:i], r) 167 | written = i + utf8.RuneLen(r) 168 | } 169 | } 170 | if written == 0 { 171 | return s 172 | } 173 | b.WriteString(s[written:]) 174 | return b.String() 175 | } 176 | 177 | // stripTags takes a snippet of HTML and returns only the text content. 178 | // For example, `¡Hi! ` -> `¡Hi! `. 179 | func StripTags(html string) string { 180 | var b bytes.Buffer 181 | s, c, i, allText := []byte(html), context{}, 0, true 182 | // Using the transition funcs helps us avoid mangling 183 | // `
` or `I <3 Ponies!`. 184 | for i != len(s) { 185 | if c.delim == delimNone { 186 | st := c.state 187 | // Use RCDATA instead of parsing into JS or CSS styles. 188 | if c.element != elementNone && !isInTag(st) { 189 | st = stateRCDATA 190 | } 191 | d, nread := transitionFunc[st](c, s[i:]) 192 | i1 := i + nread 193 | if c.state == stateText || c.state == stateRCDATA { 194 | // Emit text up to the start of the tag or comment. 195 | j := i1 196 | if d.state != c.state { 197 | for j1 := j - 1; j1 >= i; j1-- { 198 | if s[j1] == '<' { 199 | j = j1 200 | break 201 | } 202 | } 203 | } 204 | b.Write(s[i:j]) 205 | } else { 206 | allText = false 207 | } 208 | c, i = d, i1 209 | continue 210 | } 211 | i1 := i + bytes.IndexAny(s[i:], delimEnds[c.delim]) 212 | if i1 < i { 213 | break 214 | } 215 | if c.delim != delimSpaceOrTagEnd { 216 | // Consume any quote. 217 | i1++ 218 | } 219 | c, i = context{state: stateTag, element: c.element}, i1 220 | } 221 | if allText { 222 | return html 223 | } else if c.state == stateText || c.state == stateRCDATA { 224 | b.Write(s[i:]) 225 | } 226 | return b.String() 227 | } 228 | 229 | // htmlNameFilter accepts valid parts of an HTML attribute or tag name or 230 | // a known-safe HTML attribute. 231 | func htmlNameFilter(args ...interface{}) string { 232 | s, t := stringify(args...) 233 | if t == contentTypeHTMLAttr { 234 | return s 235 | } 236 | if len(s) == 0 { 237 | // Avoid violation of structure preservation. 238 | // . 239 | // Without this, if .K is empty then .V is the value of 240 | // checked, but otherwise .V is the value of the attribute 241 | // named .K. 242 | return filterFailsafe 243 | } 244 | s = strings.ToLower(s) 245 | if t := attrType(s); t != contentTypePlain { 246 | // TODO: Split attr and element name part filters so we can whitelist 247 | // attributes. 
248 | return filterFailsafe 249 | } 250 | for _, r := range s { 251 | switch { 252 | case '0' <= r && r <= '9': 253 | case 'a' <= r && r <= 'z': 254 | default: 255 | return filterFailsafe 256 | } 257 | } 258 | return s 259 | } 260 | 261 | // commentEscaper returns the empty string regardless of input. 262 | // Comment content does not correspond to any parsed structure or 263 | // human-readable content, so the simplest and most secure policy is to drop 264 | // content interpolated into comments. 265 | // This approach is equally valid whether or not static comment content is 266 | // removed from the template. 267 | func commentEscaper(args ...interface{}) string { 268 | return "" 269 | } 270 | 271 | // Copyright 2011 The Go Authors. All rights reserved. 272 | // Use of this source code is governed by a BSD-style 273 | // license that can be found in the LICENSE file. 274 | 275 | // context describes the state an HTML parser must be in when it reaches the 276 | // portion of HTML produced by evaluating a particular template node. 277 | // 278 | // The zero value of type context is the start context for a template that 279 | // produces an HTML fragment as defined at 280 | // http://www.w3.org/TR/html5/syntax.html#the-end 281 | // where the context element is null. 282 | type context struct { 283 | state state 284 | delim delim 285 | urlPart urlPart 286 | jsCtx jsCtx 287 | attr attr 288 | element element 289 | err *Error 290 | } 291 | 292 | func (c context) String() string { 293 | return fmt.Sprintf("{%v %v %v %v %v %v %v}", c.state, c.delim, c.urlPart, c.jsCtx, c.attr, c.element, c.err) 294 | } 295 | 296 | // eq reports whether two contexts are equal. 
297 | func (c context) eq(d context) bool { 298 | return c.state == d.state && 299 | c.delim == d.delim && 300 | c.urlPart == d.urlPart && 301 | c.jsCtx == d.jsCtx && 302 | c.attr == d.attr && 303 | c.element == d.element && 304 | c.err == d.err 305 | } 306 | 307 | // mangle produces an identifier that includes a suffix that distinguishes it 308 | // from template names mangled with different contexts. 309 | func (c context) mangle(templateName string) string { 310 | // The mangled name for the default context is the input templateName. 311 | if c.state == stateText { 312 | return templateName 313 | } 314 | s := templateName + "$htmltemplate_" + c.state.String() 315 | if c.delim != 0 { 316 | s += "_" + c.delim.String() 317 | } 318 | if c.urlPart != 0 { 319 | s += "_" + c.urlPart.String() 320 | } 321 | if c.jsCtx != 0 { 322 | s += "_" + c.jsCtx.String() 323 | } 324 | if c.attr != 0 { 325 | s += "_" + c.attr.String() 326 | } 327 | if c.element != 0 { 328 | s += "_" + c.element.String() 329 | } 330 | return s 331 | } 332 | 333 | // state describes a high-level HTML parser state. 334 | // 335 | // It bounds the top of the element stack, and by extension the HTML insertion 336 | // mode, but also contains state that does not correspond to anything in the 337 | // HTML5 parsing algorithm because a single token production in the HTML 338 | // grammar may contain embedded actions in a template. For instance, the quoted 339 | // HTML attribute produced by 340 | // 341 | //
342 | // 343 | // is a single token in HTML's grammar but in a template spans several nodes. 344 | type state uint8 345 | 346 | const ( 347 | // stateText is parsed character data. An HTML parser is in 348 | // this state when its parse position is outside an HTML tag, 349 | // directive, comment, and special element body. 350 | stateText state = iota 351 | // stateTag occurs before an HTML attribute or the end of a tag. 352 | stateTag 353 | // stateAttrName occurs inside an attribute name. 354 | // It occurs between the ^'s in ` ^name^ = value`. 355 | stateAttrName 356 | // stateAfterName occurs after an attr name has ended but before any 357 | // equals sign. It occurs between the ^'s in ` name^ ^= value`. 358 | stateAfterName 359 | // stateBeforeValue occurs after the equals sign but before the value. 360 | // It occurs between the ^'s in ` name =^ ^value`. 361 | stateBeforeValue 362 | // stateHTMLCmt occurs inside an . 363 | stateHTMLCmt 364 | // stateRCDATA occurs inside an RCDATA element ( Baz", "Foo Bar Baz"}, 16 | {"Foo Baz", "Foo Baz"}, 17 | {"<", "<"}, 18 | {"foo < bar", "foo < bar"}, 19 | {`FooBar`, "FooBar"}, 20 | {`Foo
Bar`, "FooBar"}, 21 | {`I <3 Ponies!`, `I <3 Ponies!`}, 22 | {``, ``}, 23 | } 24 | 25 | for _, test := range tests { 26 | if got := StripTags(test.input); got != test.want { 27 | t.Errorf("%q: want %q, got %q", test.input, test.want, got) 28 | } 29 | } 30 | } 31 | --------------------------------------------------------------------------------