├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── sanitize.go
└── sanitize_test.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
2 | *.o
3 | *.a
4 | *.so
5 |
6 | # Folders
7 | _obj
8 | _test
9 |
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 |
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 |
20 | _testmain.go
21 |
22 | *.exe
23 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Mechanism Design. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | * Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 | * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 | * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | sanitize [GoDoc](https://godoc.org/github.com/kennygrant/sanitize) [Go Report Card](https://goreportcard.com/report/github.com/kennygrant/sanitize) [CircleCI](https://circleci.com/gh/kennygrant/sanitize)
2 | ========
3 |
4 | Package sanitize provides functions to sanitize html and paths with go (golang).
5 |
6 | FUNCTIONS
7 |
8 |
9 | ```go
10 | sanitize.Accents(s string) string
11 | ```
12 |
13 | Accents replaces a set of accented characters with ascii equivalents.
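
For example (a minimal sketch; the output follows the transliteration table in sanitize.go):

```go
sanitize.Accents("crème brûlée") // "creme brulee"
```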
14 |
15 | ```go
16 | sanitize.BaseName(s string) string
17 | ```
18 |
19 | BaseName makes a string safe to use in a file name, producing a sanitized basename that replaces . or / with -. Unlike Name, no attempt is made to normalise the text as a path.
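
For example (taken from the package tests; note that case is preserved):

```go
sanitize.BaseName("Sonic.EXE") // "Sonic-EXE"
```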
20 |
21 | ```go
22 | sanitize.HTML(s string) string
23 | ```
24 |
25 | HTML strips html tags with a very simple parser, replaces common entities, and escapes < and > in the result. The result is intended to be used as plain text.
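
For example (illustrative snippet):

```go
sanitize.HTML("<h1>Hello</h1> <b>World</b>") // "Hello World"
```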
26 |
27 | ```go
28 | sanitize.HTMLAllowing(s string, args...[]string) (string, error)
29 | ```
30 |
31 | HTMLAllowing parses html and allows certain tags and attributes from the lists optionally specified by args - args[0] is a list of allowed tags, args[1] is a list of allowed attributes. If either is missing, default sets are used.
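
A sketch of passing custom allow-lists (the tag and attribute names here are illustrative):

```go
out, err := sanitize.HTMLAllowing(`<p class="x" onclick="evil()">Hi <script>alert(1)</script></p>`,
	[]string{"p"}, []string{"class"})
// out is `<p class="x">Hi </p>` and err is nil - onclick is dropped, the script element and its contents are ignored
```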
32 |
33 | ```go
34 | sanitize.Name(s string) string
35 | ```
36 |
37 | Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
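
For example (taken from the package tests):

```go
sanitize.Name("Überfluß an Döner macht schöner.JPEG") // "ueberfluss-an-doener-macht-schoener.jpeg"
```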
38 |
39 | ```go
40 | sanitize.Path(s string) string
41 | ```
42 |
43 | Path makes a string safe to use as a URL path.
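
For example (taken from the package tests):

```go
sanitize.Path("Spac ey/Nôm/test før url") // "spac-ey/nom/test-foer-url"
```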
44 |
45 |
46 | Changes
47 | -------
48 |
49 | Version 1.2
50 |
51 | Adjusted HTML function to avoid linter warning
52 | Added more tests from https://githubengineering.com/githubs-post-csp-journey/
53 | Changed name of license file
54 | Added badges and change log to readme
55 |
56 | Version 1.1
57 | Fixed typo in comments.
58 | Merge pull request from Povilas Balzaravicius Pawka
59 | - replace br tags with newline even when they contain a space
60 |
61 | Version 1.0
62 | First release
--------------------------------------------------------------------------------
/sanitize.go:
--------------------------------------------------------------------------------
1 | // Package sanitize provides functions for sanitizing text.
2 | package sanitize
3 |
4 | import (
5 | "bytes"
6 | "html"
7 | "html/template"
8 | "io"
9 | "path"
10 | "regexp"
11 | "strings"
12 |
13 | parser "golang.org/x/net/html"
14 | )
15 |
16 | var (
17 | ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}
18 |
19 | defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}
20 |
21 | defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
22 | )
23 |
24 | // HTMLAllowing sanitizes html, allowing some tags.
25 | // Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
26 | func HTMLAllowing(s string, args ...[]string) (string, error) {
27 |
28 | allowedTags := defaultTags
29 | if len(args) > 0 {
30 | allowedTags = args[0]
31 | }
32 | allowedAttributes := defaultAttributes
33 | if len(args) > 1 {
34 | allowedAttributes = args[1]
35 | }
36 |
37 | // Parse the html
38 | tokenizer := parser.NewTokenizer(strings.NewReader(s))
39 |
40 | buffer := bytes.NewBufferString("")
41 | ignore := ""
42 |
43 | for {
44 | tokenType := tokenizer.Next()
45 | token := tokenizer.Token()
46 |
47 | switch tokenType {
48 |
49 | case parser.ErrorToken:
50 | err := tokenizer.Err()
51 | if err == io.EOF {
52 | return buffer.String(), nil
53 | }
54 | return "", err
55 |
56 | case parser.StartTagToken:
57 |
58 | if len(ignore) == 0 && includes(allowedTags, token.Data) {
59 | token.Attr = cleanAttributes(token.Attr, allowedAttributes)
60 | buffer.WriteString(token.String())
61 | } else if includes(ignoreTags, token.Data) {
62 | ignore = token.Data
63 | }
64 |
65 | case parser.SelfClosingTagToken:
66 |
67 | if len(ignore) == 0 && includes(allowedTags, token.Data) {
68 | token.Attr = cleanAttributes(token.Attr, allowedAttributes)
69 | buffer.WriteString(token.String())
70 | } else if token.Data == ignore {
71 | ignore = ""
72 | }
73 |
74 | case parser.EndTagToken:
75 | if len(ignore) == 0 && includes(allowedTags, token.Data) {
76 | token.Attr = []parser.Attribute{}
77 | buffer.WriteString(token.String())
78 | } else if token.Data == ignore {
79 | ignore = ""
80 | }
81 |
82 | case parser.TextToken:
83 | // We allow text content through, unless ignoring this entire tag and its contents (including other tags)
84 | if ignore == "" {
85 | buffer.WriteString(token.String())
86 | }
87 | case parser.CommentToken:
88 | // We ignore comments by default
89 | case parser.DoctypeToken:
90 | // We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text
91 | default:
92 | // We ignore unknown token types by default
93 |
94 | }
95 |
96 | }
97 |
98 | }
99 |
100 | // HTML strips html tags, replaces common entities, and escapes <, >, &, ' and " in the result.
101 | // Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
102 | func HTML(s string) (output string) {
103 |
104 | // Shortcut strings with no tags in them
105 | if !strings.ContainsAny(s, "<>") {
106 | output = s
107 | } else {
108 |
109 | // First remove line breaks etc as these have no meaning outside html tags (except pre)
110 | // this means pre sections will lose formatting... but will result in less unintentional paras.
111 | s = strings.Replace(s, "\n", "", -1)
112 |
113 | // Then replace line breaks with newlines, to preserve that formatting
114 | s = strings.Replace(s, "</p>", "\n", -1)
115 | s = strings.Replace(s, "<br>", "\n", -1)
116 | s = strings.Replace(s, "</br>", "\n", -1)
117 | s = strings.Replace(s, "<br/>", "\n", -1)
118 | s = strings.Replace(s, "<br />", "\n", -1)
119 |
120 | // Walk through the string removing all tags
121 | b := bytes.NewBufferString("")
122 | inTag := false
123 | for _, r := range s {
124 | switch r {
125 | case '<':
126 | inTag = true
127 | case '>':
128 | inTag = false
129 | default:
130 | if !inTag {
131 | b.WriteRune(r)
132 | }
133 | }
134 | }
135 | output = b.String()
136 | }
137 |
138 | // Remove a few common harmless entities, to arrive at something more like plain text
139 | output = strings.Replace(output, "&#8216;", "'", -1)
140 | output = strings.Replace(output, "&#8217;", "'", -1)
141 | output = strings.Replace(output, "&#8220;", "\"", -1)
142 | output = strings.Replace(output, "&#8221;", "\"", -1)
143 | output = strings.Replace(output, "&nbsp;", " ", -1)
144 | output = strings.Replace(output, "&quot;", "\"", -1)
145 | output = strings.Replace(output, "&apos;", "'", -1)
146 |
147 | // Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
148 | output = html.UnescapeString(output)
149 |
150 | // In case we have missed any tags above, escape the text - this encodes <, >, &, ' and " as entities.
151 | output = template.HTMLEscapeString(output)
152 |
153 | // After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
154 | output = strings.Replace(output, "&#34;", "\"", -1)
155 | output = strings.Replace(output, "&#39;", "'", -1)
156 | output = strings.Replace(output, "&amp; ", "& ", -1) // NB space after
157 | output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after
158 |
159 | return output
160 | }
161 |
162 | // We are very restrictive as this is intended for ascii url slugs
163 | var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
164 |
165 | // Path makes a string safe to use as a URL path,
166 | // removing accents and replacing separators with -.
167 | // The path may still start at / and is not intended
168 | // for use as a file system path without prefix.
169 | func Path(s string) string {
170 | // Start with lowercase string
171 | filePath := strings.ToLower(s)
172 | filePath = strings.Replace(filePath, "..", "", -1)
173 | filePath = path.Clean(filePath)
174 |
175 | // Remove illegal characters for paths, flattening accents
176 | // and replacing some common separators with -
177 | filePath = cleanString(filePath, illegalPath)
178 |
179 | // NB this may be of length 0, caller must check
180 | return filePath
181 | }
182 |
183 | // Remove all other unrecognised characters apart from alphanumerics, - and .
184 | var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)
185 |
186 | // Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
187 | func Name(s string) string {
188 | // Start with lowercase string
189 | fileName := strings.ToLower(s)
190 | fileName = path.Clean(path.Base(fileName))
191 |
192 | // Remove illegal characters for names, replacing some common separators with -
193 | fileName = cleanString(fileName, illegalName)
194 |
195 | // NB this may be of length 0, caller must check
196 | return fileName
197 | }
198 |
199 | // Replace these separators with -
200 | var baseNameSeparators = regexp.MustCompile(`[./]`)
201 |
202 | // BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
203 | // No attempt is made to normalise a path or normalise case.
204 | func BaseName(s string) string {
205 |
206 | // Replace certain joining characters with a dash
207 | baseName := baseNameSeparators.ReplaceAllString(s, "-")
208 |
209 | // Remove illegal characters for names, replacing some common separators with -
210 | baseName = cleanString(baseName, illegalName)
211 |
212 | // NB this may be of length 0, caller must check
213 | return baseName
214 | }
215 |
216 | // A very limited list of transliterations to catch common european names translated to urls.
217 | // This set could be expanded with at least caps and many more characters.
218 | var transliterations = map[rune]string{
219 | 'À': "A",
220 | 'Á': "A",
221 | 'Â': "A",
222 | 'Ã': "A",
223 | 'Ä': "A",
224 | 'Å': "AA",
225 | 'Æ': "AE",
226 | 'Ç': "C",
227 | 'È': "E",
228 | 'É': "E",
229 | 'Ê': "E",
230 | 'Ë': "E",
231 | 'Ì': "I",
232 | 'Í': "I",
233 | 'Î': "I",
234 | 'Ï': "I",
235 | 'Ð': "D",
236 | 'Ł': "L",
237 | 'Ñ': "N",
238 | 'Ò': "O",
239 | 'Ó': "O",
240 | 'Ô': "O",
241 | 'Õ': "O",
242 | 'Ö': "OE",
243 | 'Ø': "OE",
244 | 'Œ': "OE",
245 | 'Ù': "U",
246 | 'Ú': "U",
247 | 'Ü': "UE",
248 | 'Û': "U",
249 | 'Ý': "Y",
250 | 'Þ': "TH",
251 | 'ẞ': "SS",
252 | 'à': "a",
253 | 'á': "a",
254 | 'â': "a",
255 | 'ã': "a",
256 | 'ä': "ae",
257 | 'å': "aa",
258 | 'æ': "ae",
259 | 'ç': "c",
260 | 'è': "e",
261 | 'é': "e",
262 | 'ê': "e",
263 | 'ë': "e",
264 | 'ì': "i",
265 | 'í': "i",
266 | 'î': "i",
267 | 'ï': "i",
268 | 'ð': "d",
269 | 'ł': "l",
270 | 'ñ': "n",
271 | 'ń': "n",
272 | 'ò': "o",
273 | 'ó': "o",
274 | 'ô': "o",
275 | 'õ': "o",
276 | 'ō': "o",
277 | 'ö': "oe",
278 | 'ø': "oe",
279 | 'œ': "oe",
280 | 'ś': "s",
281 | 'ù': "u",
282 | 'ú': "u",
283 | 'û': "u",
284 | 'ū': "u",
285 | 'ü': "ue",
286 | 'ý': "y",
287 | 'ÿ': "y",
288 | 'ż': "z",
289 | 'þ': "th",
290 | 'ß': "ss",
291 | }
292 |
293 | // Accents replaces a set of accented characters with ascii equivalents.
294 | func Accents(s string) string {
295 | // Replace some common accent characters
296 | b := bytes.NewBufferString("")
297 | for _, c := range s {
298 | // Check transliterations first
299 | if val, ok := transliterations[c]; ok {
300 | b.WriteString(val)
301 | } else {
302 | b.WriteRune(c)
303 | }
304 | }
305 | return b.String()
306 | }
307 |
308 | var (
309 | // If the attribute contains data: or javascript: anywhere, ignore it
310 | // we don't allow this in attributes as it is so frequently used for xss
311 | // NB we allow spaces in the value, and lowercase.
312 | illegalAttr = regexp.MustCompile(`(d\s*a\s*t\s*a|j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*)\s*:`)
313 |
314 | // We are far more restrictive with href attributes.
315 | legalHrefAttr = regexp.MustCompile(`\A[/#][^/\\]?|mailto:|http://|https://`)
316 | )
317 |
318 | // cleanAttributes returns an array of attributes after removing malicious ones.
319 | func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {
320 | if len(a) == 0 {
321 | return a
322 | }
323 |
324 | var cleaned []parser.Attribute
325 | for _, attr := range a {
326 | if includes(allowed, attr.Key) {
327 |
328 | val := strings.ToLower(attr.Val)
329 |
330 | // Check for illegal attribute values
331 | if illegalAttr.FindString(val) != "" {
332 | attr.Val = ""
333 | }
334 |
335 | // Check for legal href values - /, #, mailto:, http:// or https://
336 | if attr.Key == "href" {
337 | if legalHrefAttr.FindString(val) == "" {
338 | attr.Val = ""
339 | }
340 | }
341 |
342 | // If we still have an attribute, append it to the array
343 | if attr.Val != "" {
344 | cleaned = append(cleaned, attr)
345 | }
346 | }
347 | }
348 | return cleaned
349 | }
350 |
351 | // A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.
352 | var (
353 | separators = regexp.MustCompile(`[ &_=+:]`)
354 |
355 | dashes = regexp.MustCompile(`[\-]+`)
356 | )
357 |
358 | // cleanString replaces separators with - and removes from the string any characters matched by the regexp provided.
359 | // Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
360 | func cleanString(s string, r *regexp.Regexp) string {
361 |
362 | // Remove any trailing space to avoid ending on -
363 | s = strings.Trim(s, " ")
364 |
365 | // Flatten accents first so that if we remove non-ascii we still get a legible name
366 | s = Accents(s)
367 |
368 | // Replace certain joining characters with a dash
369 | s = separators.ReplaceAllString(s, "-")
370 |
371 | // Remove all other unrecognised characters - NB we do allow any printable characters
372 | s = r.ReplaceAllString(s, "")
373 |
374 | // Remove any multiple dashes caused by replacements above
375 | s = dashes.ReplaceAllString(s, "-")
376 |
377 | return s
378 | }
379 |
380 | // includes checks for inclusion of a string in a []string.
381 | func includes(a []string, s string) bool {
382 | for _, as := range a {
383 | if as == s {
384 | return true
385 | }
386 | }
387 | return false
388 | }
389 |
--------------------------------------------------------------------------------
/sanitize_test.go:
--------------------------------------------------------------------------------
1 | // Utility functions for working with text
2 | package sanitize
3 |
4 | import (
5 | "testing"
6 | )
7 |
8 | var Format = "\ninput: %q\nexpected: %q\noutput: %q"
9 |
10 | type Test struct {
11 | input string
12 | expected string
13 | }
14 |
15 | // NB the treatment of accents - they are removed and replaced with ascii transliterations
16 | var urls = []Test{
17 | {"ReAd ME.md", `read-me.md`},
18 | {"E88E08A7-279C-4CC1-8B90-86DE0D7044_3C.html", `e88e08a7-279c-4cc1-8b90-86de0d7044-3c.html`},
19 | {"/user/test/I am a long url's_-?ASDF@£$%£%^testé.html", `/user/test/i-am-a-long-urls-asdfteste.html`},
20 | {"/../../4-icon.jpg", `/4-icon.jpg`},
21 | {"/Images_dir/../4-icon.jpg", `/images-dir/4-icon.jpg`},
22 | {"../4 icon.*", `/4-icon.`},
23 | {"Spac ey/Nôm/test før url", `spac-ey/nom/test-foer-url`},
24 | {"../*", `/`},
25 | }
26 |
27 | func TestPath(t *testing.T) {
28 | for _, test := range urls {
29 | output := Path(test.input)
30 | if output != test.expected {
31 | t.Fatalf(Format, test.input, test.expected, output)
32 | }
33 | }
34 | }
35 |
36 | func BenchmarkPath(b *testing.B) {
37 | for i := 0; i < b.N; i++ {
38 | for _, test := range urls {
39 | output := Path(test.input)
40 | if output != test.expected {
41 | b.Fatalf(Format, test.input, test.expected, output)
42 | }
43 | }
44 | }
45 | }
46 |
47 | var fileNames = []Test{
48 | {"ReAd ME.md", `read-me.md`},
49 | {"/var/etc/jobs/go/go/src/pkg/foo/bar.go", `bar.go`},
50 | {"I am a long url's_-?ASDF@£$%£%^é.html", `i-am-a-long-urls-asdfe.html`},
51 | {"/../../4-icon.jpg", `4-icon.jpg`},
52 | {"/Images/../4-icon.jpg", `4-icon.jpg`},
53 | {"../4 icon.jpg", `4-icon.jpg`},
54 | {"../4 icon-testé *8%^\"'\".jpg ", `4-icon-teste-8.jpg`},
55 | {"Überfluß an Döner macht schöner.JPEG", `ueberfluss-an-doener-macht-schoener.jpeg`},
56 | {"Ä-_-Ü_:()_Ö-_-ä-_-ü-_-ö-_ß.webm", `ae-ue-oe-ae-ue-oe-ss.webm`},
57 | }
58 |
59 | func TestName(t *testing.T) {
60 | for _, test := range fileNames {
61 | output := Name(test.input)
62 | if output != test.expected {
63 | t.Fatalf(Format, test.input, test.expected, output)
64 | }
65 | }
66 | }
67 |
68 | func BenchmarkName(b *testing.B) {
69 | for i := 0; i < b.N; i++ {
70 | for _, test := range fileNames {
71 | output := Name(test.input)
72 | if output != test.expected {
73 | b.Fatalf(Format, test.input, test.expected, output)
74 | }
75 | }
76 | }
77 | }
78 |
79 | var baseFileNames = []Test{
80 | {"The power & the Glory jpg file. The end", `The-power-the-Glory-jpg-file-The-end`},
81 | {"/../../4-iCoN.jpg", `-4-iCoN-jpg`},
82 | {"And/Or", `And-Or`},
83 | {"Sonic.EXE", `Sonic-EXE`},
84 | {"012: #Fetch for Defaults", `012-Fetch-for-Defaults`},
85 | }
86 |
87 | func TestBaseName(t *testing.T) {
88 | for _, test := range baseFileNames {
89 | output := BaseName(test.input)
90 | if output != test.expected {
91 | t.Fatalf(Format, test.input, test.expected, output)
92 | }
93 | }
94 | }
95 |
96 | // Test with some malformed or malicious html
97 | // NB because we remove all tokens after a < until the next >
98 | // and do not attempt to parse, we should be safe from invalid html,
99 | // but will sometimes completely empty the string if we have invalid input
100 | // Note we sometimes use " in order to keep things on one line and use the ` character
101 | var htmlTests = []Test{
102 | {` `, " "},
103 | {`
`, `
`},
104 | {`