├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── sanitize.go
└── sanitize_test.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
2 | *.o
3 | *.a
4 | *.so
5 |
6 | # Folders
7 | _obj
8 | _test
9 |
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 |
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 |
20 | _testmain.go
21 |
22 | *.exe
23 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Mechanism Design. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | * Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 | * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 | * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | sanitize [GoDoc](https://godoc.org/github.com/kennygrant/sanitize) [Go Report Card](https://goreportcard.com/report/github.com/kennygrant/sanitize) [CircleCI](https://circleci.com/gh/kennygrant/sanitize)
2 | ========
3 |
4 | Package sanitize provides functions to sanitize html and paths with go (golang).
5 |
6 | FUNCTIONS
7 |
8 |
9 | ```go
10 | sanitize.Accents(s string) string
11 | ```
12 |
13 | Accents replaces a set of accented characters with ascii equivalents.
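
For example (a minimal sketch; the output follows the transliteration table in sanitize.go):

```go
sanitize.Accents("crème brûlée") // "creme brulee"
```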
14 |
15 | ```go
16 | sanitize.BaseName(s string) string
17 | ```
18 |
19 | BaseName makes a string safe to use in a file name, producing a sanitized basename that replaces . or / with -. Unlike Name, no attempt is made to normalise the text as a path.
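
For example (taken from the package tests; note that case is preserved):

```go
sanitize.BaseName("Sonic.EXE") // "Sonic-EXE"
```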
20 |
21 | ```go
22 | sanitize.HTML(s string) string
23 | ```
24 |
25 | HTML strips html tags with a very simple parser, replaces common entities, and escapes < and > in the result. The result is intended to be used as plain text.
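
For example (illustrative snippet):

```go
sanitize.HTML("<h1>Hello</h1> <b>World</b>") // "Hello World"
```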
26 |
27 | ```go
28 | sanitize.HTMLAllowing(s string, args...[]string) (string, error)
29 | ```
30 |
31 | HTMLAllowing parses html and allows certain tags and attributes from the lists optionally specified by args - args[0] is a list of allowed tags, args[1] is a list of allowed attributes. If either is missing, default sets are used.
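
A sketch of passing custom allow-lists (the tag and attribute names here are illustrative):

```go
out, err := sanitize.HTMLAllowing(`<p class="x" onclick="evil()">Hi <script>alert(1)</script></p>`,
	[]string{"p"}, []string{"class"})
// out is `<p class="x">Hi </p>` and err is nil - onclick is dropped, the script element and its contents are ignored
```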
32 |
33 | ```go
34 | sanitize.Name(s string) string
35 | ```
36 |
37 | Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
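
For example (taken from the package tests):

```go
sanitize.Name("Überfluß an Döner macht schöner.JPEG") // "ueberfluss-an-doener-macht-schoener.jpeg"
```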
38 |
39 | ```go
40 | sanitize.Path(s string) string
41 | ```
42 |
43 | Path makes a string safe to use as a URL path.
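
For example (taken from the package tests):

```go
sanitize.Path("Spac ey/Nôm/test før url") // "spac-ey/nom/test-foer-url"
```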
44 |
45 |
46 | Changes
47 | -------
48 |
49 | Version 1.2
50 |
51 | Adjusted HTML function to avoid linter warning
52 | Added more tests from https://githubengineering.com/githubs-post-csp-journey/
53 | Changed name of license file
54 | Added badges and change log to readme
55 |
56 | Version 1.1
57 | Fixed typo in comments.
58 | Merge pull request from Povilas Balzaravicius Pawka
59 | - replace br tags with newline even when they contain a space
60 |
61 | Version 1.0
62 | First release
--------------------------------------------------------------------------------
/sanitize.go:
--------------------------------------------------------------------------------
1 | // Package sanitize provides functions for sanitizing text.
2 | package sanitize
3 |
4 | import (
5 | "bytes"
6 | "html"
7 | "html/template"
8 | "io"
9 | "path"
10 | "regexp"
11 | "strings"
12 |
13 | parser "golang.org/x/net/html"
14 | )
15 |
16 | var (
17 | ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}
18 |
19 | defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}
20 |
21 | defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
22 | )
23 |
24 | // HTMLAllowing sanitizes html, allowing some tags.
25 | // Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
26 | func HTMLAllowing(s string, args ...[]string) (string, error) {
27 |
28 | allowedTags := defaultTags
29 | if len(args) > 0 {
30 | allowedTags = args[0]
31 | }
32 | allowedAttributes := defaultAttributes
33 | if len(args) > 1 {
34 | allowedAttributes = args[1]
35 | }
36 |
37 | // Parse the html
38 | tokenizer := parser.NewTokenizer(strings.NewReader(s))
39 |
40 | buffer := bytes.NewBufferString("")
41 | ignore := ""
42 |
43 | for {
44 | tokenType := tokenizer.Next()
45 | token := tokenizer.Token()
46 |
47 | switch tokenType {
48 |
49 | case parser.ErrorToken:
50 | err := tokenizer.Err()
51 | if err == io.EOF {
52 | return buffer.String(), nil
53 | }
54 | return "", err
55 |
56 | case parser.StartTagToken:
57 |
58 | if len(ignore) == 0 && includes(allowedTags, token.Data) {
59 | token.Attr = cleanAttributes(token.Attr, allowedAttributes)
60 | buffer.WriteString(token.String())
61 | } else if includes(ignoreTags, token.Data) {
62 | ignore = token.Data
63 | }
64 |
65 | case parser.SelfClosingTagToken:
66 |
67 | if len(ignore) == 0 && includes(allowedTags, token.Data) {
68 | token.Attr = cleanAttributes(token.Attr, allowedAttributes)
69 | buffer.WriteString(token.String())
70 | } else if token.Data == ignore {
71 | ignore = ""
72 | }
73 |
74 | case parser.EndTagToken:
75 | if len(ignore) == 0 && includes(allowedTags, token.Data) {
76 | token.Attr = []parser.Attribute{}
77 | buffer.WriteString(token.String())
78 | } else if token.Data == ignore {
79 | ignore = ""
80 | }
81 |
82 | case parser.TextToken:
83 | // We allow text content through, unless ignoring this entire tag and its contents (including other tags)
84 | if ignore == "" {
85 | buffer.WriteString(token.String())
86 | }
87 | case parser.CommentToken:
88 | // We ignore comments by default
89 | case parser.DoctypeToken:
90 | // We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text
91 | default:
92 | // We ignore unknown token types by default
93 |
94 | }
95 |
96 | }
97 |
98 | }
99 |
100 | // HTML strips html tags, replaces common entities, and escapes <, >, &, ' and " in the result.
101 | // Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
102 | func HTML(s string) (output string) {
103 |
104 | // Shortcut strings with no tags in them
105 | if !strings.ContainsAny(s, "<>") {
106 | output = s
107 | } else {
108 |
109 | // First remove line breaks etc as these have no meaning outside html tags (except pre)
110 | // this means pre sections will lose formatting... but will result in less unintentional paras.
111 | s = strings.Replace(s, "\n", "", -1)
112 |
113 | // Then replace line breaks with newlines, to preserve that formatting
114 | s = strings.Replace(s, "</p>", "\n", -1)
115 | s = strings.Replace(s, "<br>", "\n", -1)
116 | s = strings.Replace(s, "</br>", "\n", -1)
117 | s = strings.Replace(s, "<br/>", "\n", -1)
118 | s = strings.Replace(s, "<br />", "\n", -1)
119 |
120 | // Walk through the string removing all tags
121 | b := bytes.NewBufferString("")
122 | inTag := false
123 | for _, r := range s {
124 | switch r {
125 | case '<':
126 | inTag = true
127 | case '>':
128 | inTag = false
129 | default:
130 | if !inTag {
131 | b.WriteRune(r)
132 | }
133 | }
134 | }
135 | output = b.String()
136 | }
137 |
138 | // Remove a few common harmless entities, to arrive at something more like plain text
139 | output = strings.Replace(output, "&#8216;", "'", -1)
140 | output = strings.Replace(output, "&#8217;", "'", -1)
141 | output = strings.Replace(output, "&#8220;", "\"", -1)
142 | output = strings.Replace(output, "&#8221;", "\"", -1)
143 | output = strings.Replace(output, "&nbsp;", " ", -1)
144 | output = strings.Replace(output, "&quot;", "\"", -1)
145 | output = strings.Replace(output, "&apos;", "'", -1)
146 |
147 | // Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
148 | output = html.UnescapeString(output)
149 |
150 | // In case we have missed any tags above, escape the text - this encodes <, >, &, ' and " as entities.
151 | output = template.HTMLEscapeString(output)
152 |
153 | // After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
154 | output = strings.Replace(output, "&#34;", "\"", -1)
155 | output = strings.Replace(output, "&#39;", "'", -1)
156 | output = strings.Replace(output, "&amp; ", "& ", -1) // NB space after
157 | output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after
158 |
159 | return output
160 | }
161 |
162 | // We are very restrictive as this is intended for ascii url slugs
163 | var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
164 |
165 | // Path makes a string safe to use as a URL path,
166 | // removing accents and replacing separators with -.
167 | // The path may still start at / and is not intended
168 | // for use as a file system path without prefix.
169 | func Path(s string) string {
170 | // Start with lowercase string
171 | filePath := strings.ToLower(s)
172 | filePath = strings.Replace(filePath, "..", "", -1)
173 | filePath = path.Clean(filePath)
174 |
175 | // Remove illegal characters for paths, flattening accents
176 | // and replacing some common separators with -
177 | filePath = cleanString(filePath, illegalPath)
178 |
179 | // NB this may be of length 0, caller must check
180 | return filePath
181 | }
182 |
183 | // Remove all other unrecognised characters apart from alphanumerics, - and .
184 | var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)
185 |
186 | // Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
187 | func Name(s string) string {
188 | // Start with lowercase string
189 | fileName := strings.ToLower(s)
190 | fileName = path.Clean(path.Base(fileName))
191 |
192 | // Remove illegal characters for names, replacing some common separators with -
193 | fileName = cleanString(fileName, illegalName)
194 |
195 | // NB this may be of length 0, caller must check
196 | return fileName
197 | }
198 |
199 | // Replace these separators with -
200 | var baseNameSeparators = regexp.MustCompile(`[./]`)
201 |
202 | // BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
203 | // No attempt is made to normalise a path or normalise case.
204 | func BaseName(s string) string {
205 |
206 | // Replace certain joining characters with a dash
207 | baseName := baseNameSeparators.ReplaceAllString(s, "-")
208 |
209 | // Remove illegal characters for names, replacing some common separators with -
210 | baseName = cleanString(baseName, illegalName)
211 |
212 | // NB this may be of length 0, caller must check
213 | return baseName
214 | }
215 |
216 | // A very limited list of transliterations to catch common european names translated to urls.
217 | // This set could be expanded with at least caps and many more characters.
218 | var transliterations = map[rune]string{
219 | 'À': "A",
220 | 'Á': "A",
221 | 'Â': "A",
222 | 'Ã': "A",
223 | 'Ä': "A",
224 | 'Å': "AA",
225 | 'Æ': "AE",
226 | 'Ç': "C",
227 | 'È': "E",
228 | 'É': "E",
229 | 'Ê': "E",
230 | 'Ë': "E",
231 | 'Ì': "I",
232 | 'Í': "I",
233 | 'Î': "I",
234 | 'Ï': "I",
235 | 'Ð': "D",
236 | 'Ł': "L",
237 | 'Ñ': "N",
238 | 'Ò': "O",
239 | 'Ó': "O",
240 | 'Ô': "O",
241 | 'Õ': "O",
242 | 'Ö': "OE",
243 | 'Ø': "OE",
244 | 'Œ': "OE",
245 | 'Ù': "U",
246 | 'Ú': "U",
247 | 'Ü': "UE",
248 | 'Û': "U",
249 | 'Ý': "Y",
250 | 'Þ': "TH",
251 | 'ẞ': "SS",
252 | 'à': "a",
253 | 'á': "a",
254 | 'â': "a",
255 | 'ã': "a",
256 | 'ä': "ae",
257 | 'å': "aa",
258 | 'æ': "ae",
259 | 'ç': "c",
260 | 'è': "e",
261 | 'é': "e",
262 | 'ê': "e",
263 | 'ë': "e",
264 | 'ì': "i",
265 | 'í': "i",
266 | 'î': "i",
267 | 'ï': "i",
268 | 'ð': "d",
269 | 'ł': "l",
270 | 'ñ': "n",
271 | 'ń': "n",
272 | 'ò': "o",
273 | 'ó': "o",
274 | 'ô': "o",
275 | 'õ': "o",
276 | 'ō': "o",
277 | 'ö': "oe",
278 | 'ø': "oe",
279 | 'œ': "oe",
280 | 'ś': "s",
281 | 'ù': "u",
282 | 'ú': "u",
283 | 'û': "u",
284 | 'ū': "u",
285 | 'ü': "ue",
286 | 'ý': "y",
287 | 'ÿ': "y",
288 | 'ż': "z",
289 | 'þ': "th",
290 | 'ß': "ss",
291 | }
292 |
293 | // Accents replaces a set of accented characters with ascii equivalents.
294 | func Accents(s string) string {
295 | // Replace some common accent characters
296 | b := bytes.NewBufferString("")
297 | for _, c := range s {
298 | // Check transliterations first
299 | if val, ok := transliterations[c]; ok {
300 | b.WriteString(val)
301 | } else {
302 | b.WriteRune(c)
303 | }
304 | }
305 | return b.String()
306 | }
307 |
308 | var (
309 | // If the attribute contains data: or javascript: anywhere, ignore it
310 | // we don't allow this in attributes as it is so frequently used for xss
311 | // NB we allow spaces in the value, and lowercase.
312 | illegalAttr = regexp.MustCompile(`(d\s*a\s*t\s*a|j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*)\s*:`)
313 |
314 | // We are far more restrictive with href attributes.
315 | legalHrefAttr = regexp.MustCompile(`\A[/#][^/\\]?|mailto:|http://|https://`)
316 | )
317 |
318 | // cleanAttributes returns an array of attributes after removing malicious ones.
319 | func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {
320 | if len(a) == 0 {
321 | return a
322 | }
323 |
324 | var cleaned []parser.Attribute
325 | for _, attr := range a {
326 | if includes(allowed, attr.Key) {
327 |
328 | val := strings.ToLower(attr.Val)
329 |
330 | // Check for illegal attribute values
331 | if illegalAttr.FindString(val) != "" {
332 | attr.Val = ""
333 | }
334 |
335 | // Check for legal href values - /, #, mailto:, http:// or https://
336 | if attr.Key == "href" {
337 | if legalHrefAttr.FindString(val) == "" {
338 | attr.Val = ""
339 | }
340 | }
341 |
342 | // If we still have an attribute, append it to the array
343 | if attr.Val != "" {
344 | cleaned = append(cleaned, attr)
345 | }
346 | }
347 | }
348 | return cleaned
349 | }
350 |
351 | // A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.
352 | var (
353 | separators = regexp.MustCompile(`[ &_=+:]`)
354 |
355 | dashes = regexp.MustCompile(`[\-]+`)
356 | )
357 |
358 | // cleanString replaces separators with - and removes from the string any characters matched by the regexp provided.
359 | // Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
360 | func cleanString(s string, r *regexp.Regexp) string {
361 |
362 | // Remove any trailing space to avoid ending on -
363 | s = strings.Trim(s, " ")
364 |
365 | // Flatten accents first so that if we remove non-ascii we still get a legible name
366 | s = Accents(s)
367 |
368 | // Replace certain joining characters with a dash
369 | s = separators.ReplaceAllString(s, "-")
370 |
371 | // Remove all other unrecognised characters - NB we do allow any printable characters
372 | s = r.ReplaceAllString(s, "")
373 |
374 | // Remove any multiple dashes caused by replacements above
375 | s = dashes.ReplaceAllString(s, "-")
376 |
377 | return s
378 | }
379 |
380 | // includes checks for inclusion of a string in a []string.
381 | func includes(a []string, s string) bool {
382 | for _, as := range a {
383 | if as == s {
384 | return true
385 | }
386 | }
387 | return false
388 | }
389 |
--------------------------------------------------------------------------------
/sanitize_test.go:
--------------------------------------------------------------------------------
1 | // Utility functions for working with text
2 | package sanitize
3 |
4 | import (
5 | "testing"
6 | )
7 |
8 | var Format = "\ninput: %q\nexpected: %q\noutput: %q"
9 |
10 | type Test struct {
11 | input string
12 | expected string
13 | }
14 |
15 | // NB the treatment of accents - they are removed and replaced with ascii transliterations
16 | var urls = []Test{
17 | {"ReAd ME.md", `read-me.md`},
18 | {"E88E08A7-279C-4CC1-8B90-86DE0D7044_3C.html", `e88e08a7-279c-4cc1-8b90-86de0d7044-3c.html`},
19 | {"/user/test/I am a long url's_-?ASDF@£$%£%^testé.html", `/user/test/i-am-a-long-urls-asdfteste.html`},
20 | {"/../../4-icon.jpg", `/4-icon.jpg`},
21 | {"/Images_dir/../4-icon.jpg", `/images-dir/4-icon.jpg`},
22 | {"../4 icon.*", `/4-icon.`},
23 | {"Spac ey/Nôm/test før url", `spac-ey/nom/test-foer-url`},
24 | {"../*", `/`},
25 | }
26 |
27 | func TestPath(t *testing.T) {
28 | for _, test := range urls {
29 | output := Path(test.input)
30 | if output != test.expected {
31 | t.Fatalf(Format, test.input, test.expected, output)
32 | }
33 | }
34 | }
35 |
36 | func BenchmarkPath(b *testing.B) {
37 | for i := 0; i < b.N; i++ {
38 | for _, test := range urls {
39 | output := Path(test.input)
40 | if output != test.expected {
41 | b.Fatalf(Format, test.input, test.expected, output)
42 | }
43 | }
44 | }
45 | }
46 |
47 | var fileNames = []Test{
48 | {"ReAd ME.md", `read-me.md`},
49 | {"/var/etc/jobs/go/go/src/pkg/foo/bar.go", `bar.go`},
50 | {"I am a long url's_-?ASDF@£$%£%^é.html", `i-am-a-long-urls-asdfe.html`},
51 | {"/../../4-icon.jpg", `4-icon.jpg`},
52 | {"/Images/../4-icon.jpg", `4-icon.jpg`},
53 | {"../4 icon.jpg", `4-icon.jpg`},
54 | {"../4 icon-testé *8%^\"'\".jpg ", `4-icon-teste-8.jpg`},
55 | {"Überfluß an Döner macht schöner.JPEG", `ueberfluss-an-doener-macht-schoener.jpeg`},
56 | {"Ä-_-Ü_:()_Ö-_-ä-_-ü-_-ö-_ß.webm", `ae-ue-oe-ae-ue-oe-ss.webm`},
57 | }
58 |
59 | func TestName(t *testing.T) {
60 | for _, test := range fileNames {
61 | output := Name(test.input)
62 | if output != test.expected {
63 | t.Fatalf(Format, test.input, test.expected, output)
64 | }
65 | }
66 | }
67 |
68 | func BenchmarkName(b *testing.B) {
69 | for i := 0; i < b.N; i++ {
70 | for _, test := range fileNames {
71 | output := Name(test.input)
72 | if output != test.expected {
73 | b.Fatalf(Format, test.input, test.expected, output)
74 | }
75 | }
76 | }
77 | }
78 |
79 | var baseFileNames = []Test{
80 | {"The power & the Glory jpg file. The end", `The-power-the-Glory-jpg-file-The-end`},
81 | {"/../../4-iCoN.jpg", `-4-iCoN-jpg`},
82 | {"And/Or", `And-Or`},
83 | {"Sonic.EXE", `Sonic-EXE`},
84 | {"012: #Fetch for Defaults", `012-Fetch-for-Defaults`},
85 | }
86 |
87 | func TestBaseName(t *testing.T) {
88 | for _, test := range baseFileNames {
89 | output := BaseName(test.input)
90 | if output != test.expected {
91 | t.Fatalf(Format, test.input, test.expected, output)
92 | }
93 | }
94 | }
95 |
96 | // Test with some malformed or malicious html
97 | // NB because we remove all tokens after a < until the next >
98 | // and do not attempt to parse, we should be safe from invalid html,
99 | // but will sometimes completely empty the string if we have invalid input
100 | // Note we sometimes use " in order to keep things on one line and use the ` character
101 | var htmlTests = []Test{
102 | {` `, " "},
103 | {`
`, `
`},
104 | {`