├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── bench_test.go ├── benchmarks └── v0.1.0 ├── example_test.go ├── go.mod ├── go.sum ├── purell.go ├── purell_test.go ├── urlesc.go ├── urlesc_test.go └── urlnorm_test.go /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | 8 | jobs: 9 | ci: 10 | runs-on: ubuntu-22.04 11 | strategy: 12 | matrix: 13 | go: ['1.19.13', '1.20.10', '1.21.3'] 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Setup Go 19 | uses: actions/setup-go@v3 20 | with: 21 | go-version: ${{ matrix.go }} 22 | cache: true 23 | 24 | - name: Test 25 | run: go test -v ./... 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sublime-* 2 | .DS_Store 3 | *.swp 4 | *.swo 5 | tags 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2022, The Go Authors 2 | Copyright (c) 2012-2022, Martin Angers, Yuki Okushi & Contributors 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
10 | 11 | * Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Purell 2 | 3 | Purell is a tiny Go library to normalize URLs. It returns a pure URL. Pure-ell. Sanitizer and all. Yeah, I know... 4 | 5 | Based on the [wikipedia paper][wiki] and the [RFC 3986 document][rfc]. 6 | 7 | [![CI](https://github.com/PuerkitoBio/purell/actions/workflows/ci.yml/badge.svg)](https://github.com/PuerkitoBio/purell/actions/workflows/ci.yml) 8 | 9 | ## Install 10 | 11 | `go get github.com/PuerkitoBio/purell` 12 | 13 | ## Changelog 14 | 15 | * **v1.1.1** : Fix failing test due to Go1.12 changes (thanks to @ianlancetaylor). 16 | * **2016-11-14 (v1.1.0)** : IDN: Conform to RFC 5895: Fold character width (thanks to @beeker1121). 17 | * **2016-07-27 (v1.0.0)** : Normalize IDN to ASCII (thanks to @zenovich). 
18 | * **2015-02-08** : Add fix for relative paths issue ([PR #5][pr5]) and add fix for unnecessary encoding of reserved characters ([see issue #7][iss7]). 19 | * **v0.2.0** : Add benchmarks, Attempt IDN support. 20 | * **v0.1.0** : Initial release. 21 | 22 | ## Examples 23 | 24 | From `example_test.go` (note that in your code, you would import "github.com/PuerkitoBio/purell", and would prefix references to its methods and constants with "purell."): 25 | 26 | ```go 27 | package purell 28 | 29 | import ( 30 | "fmt" 31 | "net/url" 32 | ) 33 | 34 | func ExampleNormalizeURLString() { 35 | if normalized, err := NormalizeURLString("hTTp://someWEBsite.com:80/Amazing%3f/url/", 36 | FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes); err != nil { 37 | panic(err) 38 | } else { 39 | fmt.Print(normalized) 40 | } 41 | // Output: http://somewebsite.com:80/Amazing%3F/url/ 42 | } 43 | 44 | func ExampleMustNormalizeURLString() { 45 | normalized := MustNormalizeURLString("hTTpS://someWEBsite.com:443/Amazing%fa/url/", 46 | FlagsUnsafeGreedy) 47 | fmt.Print(normalized) 48 | 49 | // Output: http://somewebsite.com/Amazing%FA/url 50 | } 51 | 52 | func ExampleNormalizeURL() { 53 | if u, err := url.Parse("Http://SomeUrl.com:8080/a/b/.././c///g?c=3&a=1&b=9&c=0#target"); err != nil { 54 | panic(err) 55 | } else { 56 | normalized := NormalizeURL(u, FlagsUsuallySafeGreedy|FlagRemoveDuplicateSlashes|FlagRemoveFragment) 57 | fmt.Print(normalized) 58 | } 59 | 60 | // Output: http://someurl.com:8080/a/c/g?c=3&a=1&b=9&c=0 61 | } 62 | ``` 63 | 64 | ## API 65 | 66 | As seen in the examples above, purell offers three methods, `NormalizeURLString(string, NormalizationFlags) (string, error)`, `MustNormalizeURLString(string, NormalizationFlags) (string)` and `NormalizeURL(*url.URL, NormalizationFlags) (string)`. They all normalize the provided URL based on the specified flags. 
Here are the available flags: 67 | 68 | ```go 69 | const ( 70 | // Safe normalizations 71 | FlagLowercaseScheme NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1 72 | FlagLowercaseHost // http://HOST -> http://host 73 | FlagUppercaseEscapes // http://host/t%ef -> http://host/t%EF 74 | FlagDecodeUnnecessaryEscapes // http://host/t%41 -> http://host/tA 75 | FlagEncodeNecessaryEscapes // http://host/!"#$ -> http://host/%21%22#$ 76 | FlagRemoveDefaultPort // http://host:80 -> http://host 77 | FlagRemoveEmptyQuerySeparator // http://host/path? -> http://host/path 78 | 79 | // Usually safe normalizations 80 | FlagRemoveTrailingSlash // http://host/path/ -> http://host/path 81 | FlagAddTrailingSlash // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags) 82 | FlagRemoveDotSegments // http://host/path/./a/b/../c -> http://host/path/a/c 83 | 84 | // Unsafe normalizations 85 | FlagRemoveDirectoryIndex // http://host/path/index.html -> http://host/path/ 86 | FlagRemoveFragment // http://host/path#fragment -> http://host/path 87 | FlagForceHTTP // https://host -> http://host 88 | FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b 89 | FlagRemoveWWW // http://www.host/ -> http://host/ 90 | FlagAddWWW // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags) 91 | FlagSortQuery // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3 92 | 93 | // Normalizations not in the wikipedia article, required to cover tests cases 94 | // submitted by jehiah 95 | FlagDecodeDWORDHost // http://1113982867 -> http://66.102.7.147 96 | FlagDecodeOctalHost // http://0102.0146.07.0223 -> http://66.102.7.147 97 | FlagDecodeHexHost // http://0x42660793 -> http://66.102.7.147 98 | FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path 99 | FlagRemoveEmptyPortSeparator // http://host:/path -> http://host/path 100 | 101 | 
// Convenience set of safe normalizations 102 | FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator 103 | 104 | // For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags, 105 | // while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix". 106 | 107 | // Convenience set of usually safe normalizations (includes FlagsSafe) 108 | FlagsUsuallySafeGreedy NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments 109 | FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments 110 | 111 | // Convenience set of unsafe normalizations (includes FlagsUsuallySafe) 112 | FlagsUnsafeGreedy NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery 113 | FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery 114 | 115 | // Convenience set of all available flags 116 | FlagsAllGreedy = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator 117 | FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator 118 | ) 119 | ``` 120 | 121 | For convenience, the set of flags `FlagsSafe`, `FlagsUsuallySafe[Greedy|NonGreedy]`, `FlagsUnsafe[Greedy|NonGreedy]` and `FlagsAll[Greedy|NonGreedy]` are provided for the similarly grouped normalizations on [wikipedia's URL normalization page][wiki]. 
You can add (using the bitwise OR `|` operator) or remove (using the bitwise AND NOT `&^` operator) individual flags from the sets if required, to build your own custom set. 122 | 123 | The [full godoc reference is available on gopkgdoc][godoc]. 124 | 125 | Some things to note: 126 | 127 | * `FlagDecodeUnnecessaryEscapes`, `FlagEncodeNecessaryEscapes`, `FlagUppercaseEscapes` and `FlagRemoveEmptyQuerySeparator` are always implicitly set, because internally, the URL string is parsed as a URL object, which automatically decodes unnecessary escapes, uppercases and encodes necessary ones, and removes empty query separators (an unnecessary `?` at the end of the URL). So this operation cannot **not** be done. For this reason, `FlagRemoveEmptyQuerySeparator` (as well as the other three) has been included in the `FlagsSafe` convenience set, instead of `FlagsUnsafe`, where Wikipedia puts it. 128 | 129 | * The `FlagDecodeUnnecessaryEscapes` decodes the following escapes (*from -> to*): 130 | - %24 -> $ 131 | - %26 -> & 132 | - %2B-%3B -> +,-./0123456789:; 133 | - %3D -> = 134 | - %40-%5A -> @ABCDEFGHIJKLMNOPQRSTUVWXYZ 135 | - %5F -> _ 136 | - %61-%7A -> abcdefghijklmnopqrstuvwxyz 137 | - %7E -> ~ 138 | 139 | 140 | * When the `NormalizeURL` function is used (passing a URL object), this source URL object is modified (that is, after the call, the URL object will be modified to reflect the normalization). 141 | 142 | * The *replace IP with domain name* normalization (`http://208.77.188.166/ → http://www.example.com/`) is obviously not possible for a library without making some network requests. This is not implemented in purell. 143 | 144 | * The *remove unused query string parameters* and *remove default query parameters* normalizations are also not implemented, since these are very case-specific normalizations, and they are quite trivial to do with a URL object.
145 | 146 | ### Safe vs Usually Safe vs Unsafe 147 | 148 | Purell allows you to control the level of risk you take while normalizing an URL. You can aggressively normalize, play it totally safe, or anything in between. 149 | 150 | Consider the following URL: 151 | 152 | `HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid` 153 | 154 | Normalizing with the `FlagsSafe` gives: 155 | 156 | `https://www.root.com/toto/tE%1F///a/./b/../c/?z=3&w=2&a=4&w=1#invalid` 157 | 158 | With the `FlagsUsuallySafeGreedy`: 159 | 160 | `https://www.root.com/toto/tE%1F///a/c?z=3&w=2&a=4&w=1#invalid` 161 | 162 | And with `FlagsUnsafeGreedy`: 163 | 164 | `http://root.com/toto/tE%1F/a/c?a=4&w=1&w=2&z=3` 165 | 166 | ## TODOs 167 | 168 | * Add a class/default instance to allow specifying custom directory index names? At the moment, removing directory index removes `(^|/)((?:default|index)\.\w{1,4})$`. 169 | 170 | ## Thanks / Contributions 171 | 172 | @rogpeppe 173 | @jehiah 174 | @opennota 175 | @pchristopher1275 176 | @zenovich 177 | @beeker1121 178 | 179 | ## License 180 | 181 | The [BSD 3-Clause license][bsd]. 182 | 183 | [bsd]: http://opensource.org/licenses/BSD-3-Clause 184 | [wiki]: http://en.wikipedia.org/wiki/URL_normalization 185 | [rfc]: http://tools.ietf.org/html/rfc3986#section-6 186 | [godoc]: http://go.pkgdoc.org/github.com/PuerkitoBio/purell 187 | [pr5]: https://github.com/PuerkitoBio/purell/pull/5 188 | [iss7]: https://github.com/PuerkitoBio/purell/issues/7 189 | -------------------------------------------------------------------------------- /bench_test.go: -------------------------------------------------------------------------------- 1 | package purell 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var ( 8 | safeUrl = "HttPS://..iaMHost..Test:443/paTh^A%ef//./%41PaTH/..//?" 
9 | usuallySafeUrl = "HttPS://..iaMHost..Test:443/paTh^A%ef//./%41PaTH/../final/" 10 | unsafeUrl = "HttPS://..www.iaMHost..Test:443/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment" 11 | allDWORDUrl = "HttPS://1113982867:/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment" 12 | allOctalUrl = "HttPS://0102.0146.07.0223:/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment" 13 | allHexUrl = "HttPS://0x42660793:/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment" 14 | allCombinedUrl = "HttPS://..0x42660793.:/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment" 15 | ) 16 | 17 | func BenchmarkSafe(b *testing.B) { 18 | for i := 0; i < b.N; i++ { 19 | NormalizeURLString(safeUrl, FlagsSafe) 20 | } 21 | } 22 | 23 | func BenchmarkUsuallySafe(b *testing.B) { 24 | for i := 0; i < b.N; i++ { 25 | NormalizeURLString(usuallySafeUrl, FlagsUsuallySafeGreedy) 26 | } 27 | } 28 | 29 | func BenchmarkUnsafe(b *testing.B) { 30 | for i := 0; i < b.N; i++ { 31 | NormalizeURLString(unsafeUrl, FlagsUnsafeGreedy) 32 | } 33 | } 34 | 35 | func BenchmarkAllDWORD(b *testing.B) { 36 | for i := 0; i < b.N; i++ { 37 | NormalizeURLString(allDWORDUrl, FlagsAllGreedy) 38 | } 39 | } 40 | 41 | func BenchmarkAllOctal(b *testing.B) { 42 | for i := 0; i < b.N; i++ { 43 | NormalizeURLString(allOctalUrl, FlagsAllGreedy) 44 | } 45 | } 46 | 47 | func BenchmarkAllHex(b *testing.B) { 48 | for i := 0; i < b.N; i++ { 49 | NormalizeURLString(allHexUrl, FlagsAllGreedy) 50 | } 51 | } 52 | 53 | func BenchmarkAllCombined(b *testing.B) { 54 | for i := 0; i < b.N; i++ { 55 | NormalizeURLString(allCombinedUrl, FlagsAllGreedy) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /benchmarks/v0.1.0: -------------------------------------------------------------------------------- 1 | PASS 2 | BenchmarkSafe 500000 6131 ns/op 3 | BenchmarkUsuallySafe 
200000 7864 ns/op 4 | BenchmarkUnsafe 100000 28560 ns/op 5 | BenchmarkAllDWORD 50000 38722 ns/op 6 | BenchmarkAllOctal 50000 40941 ns/op 7 | BenchmarkAllHex 50000 44063 ns/op 8 | BenchmarkAllCombined 50000 33613 ns/op 9 | ok github.com/PuerkitoBio/purell 17.404s 10 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package purell 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | ) 7 | 8 | func ExampleNormalizeURLString() { 9 | if normalized, err := NormalizeURLString("hTTp://someWEBsite.com:80/Amazing%3f/url/", 10 | FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes); err != nil { 11 | panic(err) 12 | } else { 13 | fmt.Print(normalized) 14 | } 15 | // Output: http://somewebsite.com:80/Amazing%3F/url/ 16 | } 17 | 18 | func ExampleMustNormalizeURLString() { 19 | normalized := MustNormalizeURLString("hTTpS://someWEBsite.com:443/Amazing%fa/url/", 20 | FlagsUnsafeGreedy) 21 | fmt.Print(normalized) 22 | 23 | // Output: http://somewebsite.com/Amazing%FA/url 24 | } 25 | 26 | func ExampleNormalizeURL() { 27 | if u, err := url.Parse("Http://SomeUrl.com:8080/a/b/.././c///g?c=3&a=1&b=9&c=0#target"); err != nil { 28 | panic(err) 29 | } else { 30 | normalized := NormalizeURL(u, FlagsUsuallySafeGreedy|FlagRemoveDuplicateSlashes|FlagRemoveFragment) 31 | fmt.Print(normalized) 32 | } 33 | 34 | // Output: http://someurl.com:8080/a/c/g?c=3&a=1&b=9&c=0 35 | } 36 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/PuerkitoBio/purell 2 | 3 | go 1.21 4 | 5 | require ( 6 | golang.org/x/net v0.17.0 7 | golang.org/x/text v0.13.0 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | 
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= 2 | golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= 3 | golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= 4 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 5 | -------------------------------------------------------------------------------- /purell.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package purell offers URL normalization as described on the wikipedia page: 3 | http://en.wikipedia.org/wiki/URL_normalization 4 | */ 5 | package purell 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "net/url" 11 | "regexp" 12 | "sort" 13 | "strconv" 14 | "strings" 15 | 16 | "golang.org/x/net/idna" 17 | "golang.org/x/text/unicode/norm" 18 | "golang.org/x/text/width" 19 | ) 20 | 21 | // A set of normalization flags determines how a URL will 22 | // be normalized. 23 | type NormalizationFlags uint 24 | 25 | const ( 26 | // Safe normalizations 27 | FlagLowercaseScheme NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1 28 | FlagLowercaseHost // http://HOST -> http://host 29 | FlagUppercaseEscapes // http://host/t%ef -> http://host/t%EF 30 | FlagDecodeUnnecessaryEscapes // http://host/t%41 -> http://host/tA 31 | FlagEncodeNecessaryEscapes // http://host/!"#$ -> http://host/%21%22#$ 32 | FlagRemoveDefaultPort // http://host:80 -> http://host 33 | FlagRemoveEmptyQuerySeparator // http://host/path? 
-> http://host/path 34 | 35 | // Usually safe normalizations 36 | FlagRemoveTrailingSlash // http://host/path/ -> http://host/path 37 | FlagAddTrailingSlash // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags) 38 | FlagRemoveDotSegments // http://host/path/./a/b/../c -> http://host/path/a/c 39 | 40 | // Unsafe normalizations 41 | FlagRemoveDirectoryIndex // http://host/path/index.html -> http://host/path/ 42 | FlagRemoveFragment // http://host/path#fragment -> http://host/path 43 | FlagForceHTTP // https://host -> http://host 44 | FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b 45 | FlagRemoveWWW // http://www.host/ -> http://host/ 46 | FlagAddWWW // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags) 47 | FlagSortQuery // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3 48 | 49 | // Normalizations not in the wikipedia article, required to cover tests cases 50 | // submitted by jehiah 51 | FlagDecodeDWORDHost // http://1113982867 -> http://66.102.7.147 52 | FlagDecodeOctalHost // http://0102.0146.07.0223 -> http://66.102.7.147 53 | FlagDecodeHexHost // http://0x42660793 -> http://66.102.7.147 54 | FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path 55 | FlagRemoveEmptyPortSeparator // http://host:/path -> http://host/path 56 | 57 | // Convenience set of safe normalizations 58 | FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator 59 | 60 | // For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags, 61 | // while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix". 
62 | 63 | // Convenience set of usually safe normalizations (includes FlagsSafe) 64 | FlagsUsuallySafeGreedy NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments 65 | FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments 66 | 67 | // Convenience set of unsafe normalizations (includes FlagsUsuallySafe) 68 | FlagsUnsafeGreedy NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery 69 | FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery 70 | 71 | // Convenience set of all available flags 72 | FlagsAllGreedy = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator 73 | FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator 74 | ) 75 | 76 | const ( 77 | defaultHttpPort = ":80" 78 | defaultHttpsPort = ":443" 79 | ) 80 | 81 | // Regular expressions used by the normalizations 82 | var rxPort = regexp.MustCompile(`(:\d+)/?$`) 83 | var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`) 84 | var rxDupSlashes = regexp.MustCompile(`/{2,}`) 85 | var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`) 86 | var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`) 87 | var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`) 88 | var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`) 89 | var rxHostInteriorDots = regexp.MustCompile(`\.+`) 90 | var rxEmptyPort = regexp.MustCompile(`:+$`) 91 | 92 | // Map of flags to implementation function. 
// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
// by parsing the string as a URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.
// FlagEncodeNecessaryEscapes likewise has no entry in the tables below.

// flagsOrder lists every flag that has an implementation function, in the
// order NormalizeURL applies them.
// Since maps have undefined traversing order, make a slice of ordered keys
var flagsOrder = []NormalizationFlags{
	FlagLowercaseScheme,
	FlagLowercaseHost,
	FlagRemoveDefaultPort,
	FlagRemoveDirectoryIndex,
	FlagRemoveDotSegments,
	FlagRemoveFragment,
	FlagForceHTTP, // Must be after remove default port (because https=443/http=80)
	FlagRemoveDuplicateSlashes,
	FlagRemoveWWW,
	FlagAddWWW,
	FlagSortQuery,
	FlagDecodeDWORDHost,
	FlagDecodeOctalHost,
	FlagDecodeHexHost,
	FlagRemoveUnnecessaryHostDots,
	FlagRemoveEmptyPortSeparator,
	FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
	FlagAddTrailingSlash,
}

// flags maps each flag to the function that performs its normalization on a
// parsed URL, mutating it in place.
// ... and then the map, where order is unimportant
var flags = map[NormalizationFlags]func(*url.URL){
	FlagLowercaseScheme:           lowercaseScheme,
	FlagLowercaseHost:             lowercaseHost,
	FlagRemoveDefaultPort:         removeDefaultPort,
	FlagRemoveDirectoryIndex:      removeDirectoryIndex,
	FlagRemoveDotSegments:         removeDotSegments,
	FlagRemoveFragment:            removeFragment,
	FlagForceHTTP:                 forceHTTP,
	FlagRemoveDuplicateSlashes:    removeDuplicateSlashes,
	FlagRemoveWWW:                 removeWWW,
	FlagAddWWW:                    addWWW,
	FlagSortQuery:                 sortQuery,
	FlagDecodeDWORDHost:           decodeDWORDHost,
	FlagDecodeOctalHost:           decodeOctalHost,
	FlagDecodeHexHost:             decodeHexHost,
	FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
	FlagRemoveEmptyPortSeparator:  removeEmptyPortSeparator,
	FlagRemoveTrailingSlash:       removeTrailingSlash,
	FlagAddTrailingSlash:          addTrailingSlash,
}

// MustNormalizeURLString returns the normalized string, and panics if an error occurs.
141 | // It takes an URL string as input, as well as the normalization flags. 142 | func MustNormalizeURLString(u string, f NormalizationFlags) string { 143 | result, e := NormalizeURLString(u, f) 144 | if e != nil { 145 | panic(e) 146 | } 147 | return result 148 | } 149 | 150 | // NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object. 151 | // It takes an URL string as input, as well as the normalization flags. 152 | func NormalizeURLString(u string, f NormalizationFlags) (string, error) { 153 | parsed, err := url.Parse(u) 154 | if err != nil { 155 | return "", err 156 | } 157 | 158 | if f&FlagLowercaseHost == FlagLowercaseHost { 159 | parsed.Host = strings.ToLower(parsed.Host) 160 | } 161 | 162 | // The idna package doesn't fully conform to RFC 5895 163 | // (https://tools.ietf.org/html/rfc5895), so we do it here. 164 | // Taken from Go 1.8 cycle source, courtesy of bradfitz. 165 | // TODO: Remove when (if?) idna package conforms to RFC 5895. 166 | parsed.Host = width.Fold.String(parsed.Host) 167 | parsed.Host = norm.NFC.String(parsed.Host) 168 | if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil { 169 | return "", err 170 | } 171 | 172 | return NormalizeURL(parsed, f), nil 173 | } 174 | 175 | // NormalizeURL returns the normalized string. 176 | // It takes a parsed URL object as input, as well as the normalization flags. 
177 | func NormalizeURL(u *url.URL, f NormalizationFlags) string { 178 | for _, k := range flagsOrder { 179 | if f&k == k { 180 | flags[k](u) 181 | } 182 | } 183 | return escapeURL(u) 184 | } 185 | 186 | func lowercaseScheme(u *url.URL) { 187 | if len(u.Scheme) > 0 { 188 | u.Scheme = strings.ToLower(u.Scheme) 189 | } 190 | } 191 | 192 | func lowercaseHost(u *url.URL) { 193 | if len(u.Host) > 0 { 194 | u.Host = strings.ToLower(u.Host) 195 | } 196 | } 197 | 198 | func removeDefaultPort(u *url.URL) { 199 | if len(u.Host) > 0 { 200 | scheme := strings.ToLower(u.Scheme) 201 | u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string { 202 | if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) { 203 | return "" 204 | } 205 | return val 206 | }) 207 | } 208 | } 209 | 210 | func removeTrailingSlash(u *url.URL) { 211 | if l := len(u.Path); l > 0 { 212 | if strings.HasSuffix(u.Path, "/") { 213 | u.Path = u.Path[:l-1] 214 | } 215 | } else if l = len(u.Host); l > 0 { 216 | if strings.HasSuffix(u.Host, "/") { 217 | u.Host = u.Host[:l-1] 218 | } 219 | } 220 | } 221 | 222 | func addTrailingSlash(u *url.URL) { 223 | if l := len(u.Path); l > 0 { 224 | if !strings.HasSuffix(u.Path, "/") { 225 | u.Path += "/" 226 | } 227 | } else if l = len(u.Host); l > 0 { 228 | if !strings.HasSuffix(u.Host, "/") { 229 | u.Host += "/" 230 | } 231 | } 232 | } 233 | 234 | func removeDotSegments(u *url.URL) { 235 | if len(u.Path) > 0 { 236 | var dotFree []string 237 | var lastIsDot bool 238 | 239 | sections := strings.Split(u.Path, "/") 240 | for _, s := range sections { 241 | if s == ".." { 242 | if len(dotFree) > 0 { 243 | dotFree = dotFree[:len(dotFree)-1] 244 | } 245 | } else if s != "." { 246 | dotFree = append(dotFree, s) 247 | } 248 | lastIsDot = (s == "." 
|| s == "..") 249 | } 250 | // Special case if host does not end with / and new path does not begin with / 251 | u.Path = strings.Join(dotFree, "/") 252 | if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") { 253 | u.Path = "/" + u.Path 254 | } 255 | // Special case if the last segment was a dot, make sure the path ends with a slash 256 | if lastIsDot && !strings.HasSuffix(u.Path, "/") { 257 | u.Path += "/" 258 | } 259 | } 260 | } 261 | 262 | func removeDirectoryIndex(u *url.URL) { 263 | if len(u.Path) > 0 { 264 | u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1") 265 | } 266 | } 267 | 268 | func removeFragment(u *url.URL) { 269 | u.Fragment = "" 270 | } 271 | 272 | func forceHTTP(u *url.URL) { 273 | if strings.ToLower(u.Scheme) == "https" { 274 | u.Scheme = "http" 275 | } 276 | } 277 | 278 | func removeDuplicateSlashes(u *url.URL) { 279 | if len(u.Path) > 0 { 280 | u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/") 281 | } 282 | } 283 | 284 | func removeWWW(u *url.URL) { 285 | if len(u.Host) > 0 && strings.HasPrefix(strings.ToLower(u.Host), "www.") { 286 | u.Host = u.Host[4:] 287 | } 288 | } 289 | 290 | func addWWW(u *url.URL) { 291 | if len(u.Host) > 0 && !strings.HasPrefix(strings.ToLower(u.Host), "www.") { 292 | u.Host = "www." 
+ u.Host 293 | } 294 | } 295 | 296 | func sortQuery(u *url.URL) { 297 | q := u.Query() 298 | 299 | if len(q) > 0 { 300 | arKeys := make([]string, len(q)) 301 | i := 0 302 | for k := range q { 303 | arKeys[i] = k 304 | i++ 305 | } 306 | sort.Strings(arKeys) 307 | buf := new(bytes.Buffer) 308 | for _, k := range arKeys { 309 | sort.Strings(q[k]) 310 | for _, v := range q[k] { 311 | if buf.Len() > 0 { 312 | buf.WriteRune('&') 313 | } 314 | buf.WriteString(fmt.Sprintf("%s=%s", k, url.QueryEscape(v))) 315 | } 316 | } 317 | 318 | // Rebuild the raw query string 319 | u.RawQuery = buf.String() 320 | } 321 | } 322 | 323 | func decodeDWORDHost(u *url.URL) { 324 | if len(u.Host) > 0 { 325 | if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 { 326 | var parts [4]int64 327 | 328 | dword, _ := strconv.ParseInt(matches[1], 10, 0) 329 | for i, shift := range []uint{24, 16, 8, 0} { 330 | parts[i] = dword >> shift & 0xFF 331 | } 332 | u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2]) 333 | } 334 | } 335 | } 336 | 337 | func decodeOctalHost(u *url.URL) { 338 | if len(u.Host) > 0 { 339 | if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 { 340 | var parts [4]int64 341 | 342 | for i := 1; i <= 4; i++ { 343 | parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0) 344 | } 345 | u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5]) 346 | } 347 | } 348 | } 349 | 350 | func decodeHexHost(u *url.URL) { 351 | if len(u.Host) > 0 { 352 | if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 { 353 | // Conversion is safe because of regex validation 354 | parsed, _ := strconv.ParseInt(matches[1], 16, 0) 355 | // Set host as DWORD (base 10) encoded host 356 | u.Host = fmt.Sprintf("%d%s", parsed, matches[2]) 357 | // The rest is the same as decoding a DWORD host 358 | decodeDWORDHost(u) 359 | } 360 | } 361 | } 362 | 363 | func removeUnncessaryHostDots(u *url.URL) { 364 | 
if len(u.Host) > 0 { 365 | if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 { 366 | // Trim the leading and trailing dots 367 | u.Host = strings.Trim(matches[1], ".") 368 | if len(matches) > 2 { 369 | u.Host += matches[2] 370 | } 371 | } 372 | u.Host = rxHostInteriorDots.ReplaceAllString(u.Host, ".") 373 | } 374 | } 375 | 376 | func removeEmptyPortSeparator(u *url.URL) { 377 | if len(u.Host) > 0 { 378 | u.Host = rxEmptyPort.ReplaceAllString(u.Host, "") 379 | } 380 | } 381 | -------------------------------------------------------------------------------- /purell_test.go: -------------------------------------------------------------------------------- 1 | package purell 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "testing" 7 | "unicode" 8 | ) 9 | 10 | type testCase struct { 11 | nm string 12 | src string 13 | flgs NormalizationFlags 14 | res string 15 | parsed bool 16 | } 17 | 18 | var ( 19 | cases = [...]*testCase{ 20 | { 21 | "LowerScheme", 22 | "HTTP://www.SRC.ca", 23 | FlagLowercaseScheme, 24 | "http://www.SRC.ca", 25 | false, 26 | }, 27 | { 28 | "LowerScheme2", 29 | "http://www.SRC.ca", 30 | FlagLowercaseScheme, 31 | "http://www.SRC.ca", 32 | false, 33 | }, 34 | { 35 | "LowerHost", 36 | "HTTP://www.SRC.ca/", 37 | FlagLowercaseHost, 38 | "http://www.src.ca/", // Since Go1.1, scheme is automatically lowercased 39 | false, 40 | }, 41 | { 42 | "UpperEscapes", 43 | `http://www.whatever.com/Some%aa%20Special%8Ecases/`, 44 | FlagUppercaseEscapes, 45 | "http://www.whatever.com/Some%AA%20Special%8Ecases/", 46 | false, 47 | }, 48 | { 49 | "UnnecessaryEscapes", 50 | `http://www.toto.com/%41%42%2E%44/%32%33%52%2D/%5f%7E`, 51 | FlagDecodeUnnecessaryEscapes, 52 | "http://www.toto.com/AB.D/23R-/_~", 53 | false, 54 | }, 55 | { 56 | "RemoveDefaultPort", 57 | "HTTP://www.SRC.ca:80/", 58 | FlagRemoveDefaultPort, 59 | "http://www.SRC.ca/", // Since Go1.1, scheme is automatically lowercased 60 | false, 61 | }, 62 | { 63 | "RemoveDefaultPort2", 64 | 
"HTTP://www.SRC.ca:80", 65 | FlagRemoveDefaultPort, 66 | "http://www.SRC.ca", // Since Go1.1, scheme is automatically lowercased 67 | false, 68 | }, 69 | { 70 | "RemoveDefaultPort3", 71 | "HTTP://www.SRC.ca:8080", 72 | FlagRemoveDefaultPort, 73 | "http://www.SRC.ca:8080", // Since Go1.1, scheme is automatically lowercased 74 | false, 75 | }, 76 | { 77 | "Safe", 78 | "HTTP://www.SRC.ca:80/to%1ato%8b%ee/OKnow%41%42%43%7e", 79 | FlagsSafe, 80 | "http://www.src.ca/to%1Ato%8B%EE/OKnowABC~", 81 | false, 82 | }, 83 | { 84 | "BothLower", 85 | "HTTP://www.SRC.ca:80/to%1ato%8b%ee/OKnow%41%42%43%7e", 86 | FlagLowercaseHost | FlagLowercaseScheme, 87 | "http://www.src.ca:80/to%1Ato%8B%EE/OKnowABC~", 88 | false, 89 | }, 90 | { 91 | "RemoveTrailingSlash", 92 | "HTTP://www.SRC.ca:80/", 93 | FlagRemoveTrailingSlash, 94 | "http://www.SRC.ca:80", // Since Go1.1, scheme is automatically lowercased 95 | false, 96 | }, 97 | { 98 | "RemoveTrailingSlash2", 99 | "HTTP://www.SRC.ca:80/toto/titi/", 100 | FlagRemoveTrailingSlash, 101 | "http://www.SRC.ca:80/toto/titi", // Since Go1.1, scheme is automatically lowercased 102 | false, 103 | }, 104 | { 105 | "RemoveTrailingSlash3", 106 | "HTTP://www.SRC.ca:80/toto/titi/fin/?a=1", 107 | FlagRemoveTrailingSlash, 108 | "http://www.SRC.ca:80/toto/titi/fin?a=1", // Since Go1.1, scheme is automatically lowercased 109 | false, 110 | }, 111 | { 112 | "AddTrailingSlash", 113 | "HTTP://www.SRC.ca:80", 114 | FlagAddTrailingSlash, 115 | "http://www.SRC.ca:80/", // Since Go1.1, scheme is automatically lowercased 116 | false, 117 | }, 118 | { 119 | "AddTrailingSlash2", 120 | "HTTP://www.SRC.ca:80/toto/titi.html", 121 | FlagAddTrailingSlash, 122 | "http://www.SRC.ca:80/toto/titi.html/", // Since Go1.1, scheme is automatically lowercased 123 | false, 124 | }, 125 | { 126 | "AddTrailingSlash3", 127 | "HTTP://www.SRC.ca:80/toto/titi/fin?a=1", 128 | FlagAddTrailingSlash, 129 | "http://www.SRC.ca:80/toto/titi/fin/?a=1", // Since Go1.1, scheme is automatically 
lowercased 130 | false, 131 | }, 132 | { 133 | "RemoveDotSegments", 134 | "HTTP://root/a/b/./../../c/", 135 | FlagRemoveDotSegments, 136 | "http://root/c/", // Since Go1.1, scheme is automatically lowercased 137 | false, 138 | }, 139 | { 140 | "RemoveDotSegments2", 141 | "HTTP://root/../a/b/./../c/../d", 142 | FlagRemoveDotSegments, 143 | "http://root/a/d", // Since Go1.1, scheme is automatically lowercased 144 | false, 145 | }, 146 | { 147 | "UsuallySafe", 148 | "HTTP://www.SRC.ca:80/to%1ato%8b%ee/./c/d/../OKnow%41%42%43%7e/?a=b#test", 149 | FlagsUsuallySafeGreedy, 150 | "http://www.src.ca/to%1Ato%8B%EE/c/OKnowABC~?a=b#test", 151 | false, 152 | }, 153 | { 154 | "RemoveDirectoryIndex", 155 | "HTTP://root/a/b/c/default.aspx", 156 | FlagRemoveDirectoryIndex, 157 | "http://root/a/b/c/", // Since Go1.1, scheme is automatically lowercased 158 | false, 159 | }, 160 | { 161 | "RemoveDirectoryIndex2", 162 | "HTTP://root/a/b/c/default#a=b", 163 | FlagRemoveDirectoryIndex, 164 | "http://root/a/b/c/default#a=b", // Since Go1.1, scheme is automatically lowercased 165 | false, 166 | }, 167 | { 168 | "RemoveFragment", 169 | "HTTP://root/a/b/c/default#toto=tata", 170 | FlagRemoveFragment, 171 | "http://root/a/b/c/default", // Since Go1.1, scheme is automatically lowercased 172 | false, 173 | }, 174 | { 175 | "ForceHTTP", 176 | "https://root/a/b/c/default#toto=tata", 177 | FlagForceHTTP, 178 | "http://root/a/b/c/default#toto=tata", 179 | false, 180 | }, 181 | { 182 | "RemoveDuplicateSlashes", 183 | "https://root/a//b///c////default#toto=tata", 184 | FlagRemoveDuplicateSlashes, 185 | "https://root/a/b/c/default#toto=tata", 186 | false, 187 | }, 188 | { 189 | "RemoveDuplicateSlashes2", 190 | "https://root//a//b///c////default#toto=tata", 191 | FlagRemoveDuplicateSlashes, 192 | "https://root/a/b/c/default#toto=tata", 193 | false, 194 | }, 195 | { 196 | "RemoveWWW", 197 | "https://www.root/a/b/c/", 198 | FlagRemoveWWW, 199 | "https://root/a/b/c/", 200 | false, 201 | }, 202 | { 203 | 
"RemoveWWW2", 204 | "https://WwW.Root/a/b/c/", 205 | FlagRemoveWWW, 206 | "https://Root/a/b/c/", 207 | false, 208 | }, 209 | { 210 | "AddWWW", 211 | "https://Root/a/b/c/", 212 | FlagAddWWW, 213 | "https://www.Root/a/b/c/", 214 | false, 215 | }, 216 | { 217 | "SortQuery", 218 | "http://root/toto/?b=4&a=1&c=3&b=2&a=5", 219 | FlagSortQuery, 220 | "http://root/toto/?a=1&a=5&b=2&b=4&c=3", 221 | false, 222 | }, 223 | { 224 | "RemoveEmptyQuerySeparator", 225 | "http://root/toto/?", 226 | FlagRemoveEmptyQuerySeparator, 227 | "http://root/toto/", 228 | false, 229 | }, 230 | { 231 | "Unsafe", 232 | "HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid", 233 | FlagsUnsafeGreedy, 234 | "http://root.com/toto/tE%1F/a/c?a=4&w=1&w=2&z=3", 235 | false, 236 | }, 237 | { 238 | "Safe2", 239 | "HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid", 240 | FlagsSafe, 241 | "https://www.root.com/toto/tE%1F///a/./b/../c/?z=3&w=2&a=4&w=1#invalid", 242 | false, 243 | }, 244 | { 245 | "UsuallySafe2", 246 | "HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid", 247 | FlagsUsuallySafeGreedy, 248 | "https://www.root.com/toto/tE%1F///a/c?z=3&w=2&a=4&w=1#invalid", 249 | false, 250 | }, 251 | { 252 | "AddTrailingSlashBug", 253 | "http://src.ca/", 254 | FlagsAllNonGreedy, 255 | "http://www.src.ca/", 256 | false, 257 | }, 258 | { 259 | "SourceModified", 260 | "HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid", 261 | FlagsUnsafeGreedy, 262 | "http://root.com/toto/tE%1F/a/c?a=4&w=1&w=2&z=3", 263 | true, 264 | }, 265 | { 266 | "IPv6-1", 267 | "http://[2001:db8:1f70::999:de8:7648:6e8]/test", 268 | FlagsSafe | FlagRemoveDotSegments, 269 | "http://[2001:db8:1f70::999:de8:7648:6e8]/test", 270 | false, 271 | }, 272 | { 273 | "IPv6-2", 274 | "http://[::ffff:192.168.1.1]/test", 275 | FlagsSafe | FlagRemoveDotSegments, 276 | "http://[::ffff:192.168.1.1]/test", 277 | false, 278 | }, 279 | { 280 | "IPv6-3", 281 | 
"http://[::ffff:192.168.1.1]:80/test", 282 | FlagsSafe | FlagRemoveDotSegments, 283 | "http://[::ffff:192.168.1.1]/test", 284 | false, 285 | }, 286 | { 287 | "IPv6-4", 288 | "htTps://[::fFff:192.168.1.1]:443/test", 289 | FlagsSafe | FlagRemoveDotSegments, 290 | "https://[::ffff:192.168.1.1]/test", 291 | false, 292 | }, 293 | { 294 | "FTP", 295 | "ftp://user:pass@ftp.foo.net/foo/bar", 296 | FlagsSafe | FlagRemoveDotSegments, 297 | "ftp://user:pass@ftp.foo.net/foo/bar", 298 | false, 299 | }, 300 | { 301 | "Standard-1", 302 | "http://www.foo.com:80/foo", 303 | FlagsSafe | FlagRemoveDotSegments, 304 | "http://www.foo.com/foo", 305 | false, 306 | }, 307 | { 308 | "Standard-2", 309 | "http://www.foo.com:8000/foo", 310 | FlagsSafe | FlagRemoveDotSegments, 311 | "http://www.foo.com:8000/foo", 312 | false, 313 | }, 314 | { 315 | "Standard-3", 316 | "http://www.foo.com/%7ebar", 317 | FlagsSafe | FlagRemoveDotSegments, 318 | "http://www.foo.com/~bar", 319 | false, 320 | }, 321 | { 322 | "Standard-4", 323 | "http://www.foo.com/%7Ebar", 324 | FlagsSafe | FlagRemoveDotSegments, 325 | "http://www.foo.com/~bar", 326 | false, 327 | }, 328 | { 329 | "Standard-5", 330 | "http://USER:pass@www.Example.COM/foo/bar", 331 | FlagsSafe | FlagRemoveDotSegments, 332 | "http://USER:pass@www.example.com/foo/bar", 333 | false, 334 | }, 335 | { 336 | "Standard-6", 337 | "http://test.example/?a=%26&b=1", 338 | FlagsSafe | FlagRemoveDotSegments, 339 | "http://test.example/?a=%26&b=1", 340 | false, 341 | }, 342 | { 343 | "Standard-7", 344 | "http://test.example/%25/?p=%20val%20%25", 345 | FlagsSafe | FlagRemoveDotSegments, 346 | "http://test.example/%25/?p=%20val%20%25", 347 | false, 348 | }, 349 | { 350 | "Standard-8", 351 | "http://test.example/path/with a%20space+/", 352 | FlagsSafe | FlagRemoveDotSegments, 353 | "http://test.example/path/with%20a%20space+/", 354 | false, 355 | }, 356 | { 357 | "Standard-9", 358 | "http://test.example/?", 359 | FlagsSafe | FlagRemoveDotSegments, 360 | 
"http://test.example/", 361 | false, 362 | }, 363 | { 364 | "Standard-10", 365 | "http://a.COM/path/?b&a", 366 | FlagsSafe | FlagRemoveDotSegments, 367 | "http://a.com/path/?b&a", 368 | false, 369 | }, 370 | { 371 | "StandardCasesAddTrailingSlash", 372 | "http://test.example?", 373 | FlagsSafe | FlagAddTrailingSlash, 374 | "http://test.example/", 375 | false, 376 | }, 377 | { 378 | "OctalIP-1", 379 | "http://0123.011.0.4/", 380 | FlagsSafe | FlagDecodeOctalHost, 381 | "http://0123.011.0.4/", 382 | false, 383 | }, 384 | { 385 | "OctalIP-2", 386 | "http://0102.0146.07.0223/", 387 | FlagsSafe | FlagDecodeOctalHost, 388 | "http://66.102.7.147/", 389 | false, 390 | }, 391 | { 392 | "OctalIP-3", 393 | "http://0102.0146.07.0223.:23/", 394 | FlagsSafe | FlagDecodeOctalHost, 395 | "http://66.102.7.147.:23/", 396 | false, 397 | }, 398 | { 399 | "OctalIP-4", 400 | "http://USER:pass@0102.0146.07.0223../", 401 | FlagsSafe | FlagDecodeOctalHost, 402 | "http://USER:pass@66.102.7.147../", 403 | false, 404 | }, 405 | { 406 | "DWORDIP-1", 407 | "http://123.1113982867/", 408 | FlagsSafe | FlagDecodeDWORDHost, 409 | "http://123.1113982867/", 410 | false, 411 | }, 412 | { 413 | "DWORDIP-2", 414 | "http://1113982867/", 415 | FlagsSafe | FlagDecodeDWORDHost, 416 | "http://66.102.7.147/", 417 | false, 418 | }, 419 | { 420 | "DWORDIP-3", 421 | "http://1113982867.:23/", 422 | FlagsSafe | FlagDecodeDWORDHost, 423 | "http://66.102.7.147.:23/", 424 | false, 425 | }, 426 | { 427 | "DWORDIP-4", 428 | "http://USER:pass@1113982867../", 429 | FlagsSafe | FlagDecodeDWORDHost, 430 | "http://USER:pass@66.102.7.147../", 431 | false, 432 | }, 433 | { 434 | "HexIP-1", 435 | "http://0x123.1113982867/", 436 | FlagsSafe | FlagDecodeHexHost, 437 | "http://0x123.1113982867/", 438 | false, 439 | }, 440 | { 441 | "HexIP-2", 442 | "http://0x42660793/", 443 | FlagsSafe | FlagDecodeHexHost, 444 | "http://66.102.7.147/", 445 | false, 446 | }, 447 | { 448 | "HexIP-3", 449 | "http://0x42660793.:23/", 450 | FlagsSafe 
| FlagDecodeHexHost, 451 | "http://66.102.7.147.:23/", 452 | false, 453 | }, 454 | { 455 | "HexIP-4", 456 | "http://USER:pass@0x42660793../", 457 | FlagsSafe | FlagDecodeHexHost, 458 | "http://USER:pass@66.102.7.147../", 459 | false, 460 | }, 461 | { 462 | "UnnecessaryHostDots-1", 463 | "http://.www.foo.com../foo/bar.html", 464 | FlagsSafe | FlagRemoveUnnecessaryHostDots, 465 | "http://www.foo.com/foo/bar.html", 466 | false, 467 | }, 468 | { 469 | "UnnecessaryHostDots-2", 470 | "http://www.foo.com./foo/bar.html", 471 | FlagsSafe | FlagRemoveUnnecessaryHostDots, 472 | "http://www.foo.com/foo/bar.html", 473 | false, 474 | }, 475 | { 476 | "UnnecessaryHostDots-3", 477 | "http://www.foo.com.:81/foo", 478 | FlagsSafe | FlagRemoveUnnecessaryHostDots, 479 | "http://www.foo.com:81/foo", 480 | false, 481 | }, 482 | { 483 | "UnnecessaryHostDots-4", 484 | "http://www.example.com./", 485 | FlagsSafe | FlagRemoveUnnecessaryHostDots, 486 | "http://www.example.com/", 487 | false, 488 | }, 489 | { 490 | "UnnecessaryHostDots-5", 491 | "http://www..example...com/", 492 | FlagsSafe | FlagRemoveUnnecessaryHostDots, 493 | "http://www.example.com/", 494 | false, 495 | }, 496 | { 497 | "EmptyPort-1", 498 | "http://www.thedraymin.co.uk:/main/?p=308", 499 | FlagsSafe | FlagRemoveEmptyPortSeparator, 500 | "http://www.thedraymin.co.uk/main/?p=308", 501 | false, 502 | }, 503 | { 504 | "EmptyPort-2", 505 | "http://www.src.ca:", 506 | FlagsSafe | FlagRemoveEmptyPortSeparator, 507 | "http://www.src.ca", 508 | false, 509 | }, 510 | { 511 | "Slashes-1", 512 | "http://test.example/foo/bar/.", 513 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 514 | "http://test.example/foo/bar/", 515 | false, 516 | }, 517 | { 518 | "Slashes-2", 519 | "http://test.example/foo/bar/./", 520 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 521 | "http://test.example/foo/bar/", 522 | false, 523 | }, 524 | { 525 | "Slashes-3", 526 | "http://test.example/foo/bar/..", 527 | FlagsSafe | 
FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 528 | "http://test.example/foo/", 529 | false, 530 | }, 531 | { 532 | "Slashes-4", 533 | "http://test.example/foo/bar/../", 534 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 535 | "http://test.example/foo/", 536 | false, 537 | }, 538 | { 539 | "Slashes-5", 540 | "http://test.example/foo/bar/../baz", 541 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 542 | "http://test.example/foo/baz", 543 | false, 544 | }, 545 | { 546 | "Slashes-6", 547 | "http://test.example/foo/bar/../..", 548 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 549 | "http://test.example/", 550 | false, 551 | }, 552 | { 553 | "Slashes-7", 554 | "http://test.example/foo/bar/../../", 555 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 556 | "http://test.example/", 557 | false, 558 | }, 559 | { 560 | "Slashes-8", 561 | "http://test.example/foo/bar/../../baz", 562 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 563 | "http://test.example/baz", 564 | false, 565 | }, 566 | { 567 | "Slashes-9", 568 | "http://test.example/foo/bar/../../../baz", 569 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 570 | "http://test.example/baz", 571 | false, 572 | }, 573 | { 574 | "Slashes-10", 575 | "http://test.example/foo/bar/../../../../baz", 576 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 577 | "http://test.example/baz", 578 | false, 579 | }, 580 | { 581 | "Slashes-11", 582 | "http://test.example/./foo", 583 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 584 | "http://test.example/foo", 585 | false, 586 | }, 587 | { 588 | "Slashes-12", 589 | "http://test.example/../foo", 590 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 591 | "http://test.example/foo", 592 | false, 593 | }, 594 | { 595 | "Slashes-13", 596 | "http://test.example/foo.", 597 | FlagsSafe | FlagRemoveDotSegments | 
FlagRemoveDuplicateSlashes, 598 | "http://test.example/foo.", 599 | false, 600 | }, 601 | { 602 | "Slashes-14", 603 | "http://test.example/.foo", 604 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 605 | "http://test.example/.foo", 606 | false, 607 | }, 608 | { 609 | "Slashes-15", 610 | "http://test.example/foo..", 611 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 612 | "http://test.example/foo..", 613 | false, 614 | }, 615 | { 616 | "Slashes-16", 617 | "http://test.example/..foo", 618 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 619 | "http://test.example/..foo", 620 | false, 621 | }, 622 | { 623 | "Slashes-17", 624 | "http://test.example/./../foo", 625 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 626 | "http://test.example/foo", 627 | false, 628 | }, 629 | { 630 | "Slashes-18", 631 | "http://test.example/./foo/.", 632 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 633 | "http://test.example/foo/", 634 | false, 635 | }, 636 | { 637 | "Slashes-19", 638 | "http://test.example/foo/./bar", 639 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 640 | "http://test.example/foo/bar", 641 | false, 642 | }, 643 | { 644 | "Slashes-20", 645 | "http://test.example/foo/../bar", 646 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 647 | "http://test.example/bar", 648 | false, 649 | }, 650 | { 651 | "Slashes-21", 652 | "http://test.example/foo//", 653 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 654 | "http://test.example/foo/", 655 | false, 656 | }, 657 | { 658 | "Slashes-22", 659 | "http://test.example/foo///bar//", 660 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 661 | "http://test.example/foo/bar/", 662 | false, 663 | }, 664 | { 665 | "Relative", 666 | "foo/bar", 667 | FlagsAllGreedy, 668 | "foo/bar", 669 | false, 670 | }, 671 | { 672 | "Relative-1", 673 | "./../foo", 674 | FlagsSafe | 
FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 675 | "foo", 676 | false, 677 | }, 678 | { 679 | "Relative-2", 680 | "./foo/bar/../baz/../bang/..", 681 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 682 | "foo/", 683 | false, 684 | }, 685 | { 686 | "Relative-3", 687 | "foo///bar//", 688 | FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes, 689 | "foo/bar/", 690 | false, 691 | }, 692 | { 693 | "Relative-4", 694 | "www.youtube.com", 695 | FlagsUsuallySafeGreedy, 696 | "www.youtube.com", 697 | false, 698 | }, 699 | { 700 | "Issue-#24", 701 | "///foo///bar///", 702 | FlagRemoveDuplicateSlashes | FlagRemoveTrailingSlash, 703 | "/foo/bar", 704 | false, 705 | }, 706 | /*&testCase{ 707 | "UrlNorm-5", 708 | "http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3", 709 | FlagsSafe | FlagRemoveDotSegments, 710 | "http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3", 711 | false, 712 | }, 713 | &testCase{ 714 | "UrlNorm-1", 715 | "http://test.example/?a=%e3%82%82%26", 716 | FlagsAllGreedy, 717 | "http://test.example/?a=\xe3\x82\x82%26", 718 | false, 719 | },*/ 720 | } 721 | ) 722 | 723 | func TestRunner(t *testing.T) { 724 | for _, tc := range cases { 725 | runCase(tc, t) 726 | } 727 | } 728 | 729 | func runCase(tc *testCase, t *testing.T) { 730 | t.Logf("running %s...", tc.nm) 731 | if tc.parsed { 732 | u, e := url.Parse(tc.src) 733 | if e != nil { 734 | t.Errorf("%s - FAIL : %s", tc.nm, e) 735 | return 736 | } else { 737 | NormalizeURL(u, tc.flgs) 738 | if s := u.String(); s != tc.res { 739 | t.Errorf("%s - FAIL expected '%s', got '%s'", tc.nm, tc.res, s) 740 | } 741 | } 742 | } else { 743 | if s, e := NormalizeURLString(tc.src, tc.flgs); e != nil { 744 | t.Errorf("%s - FAIL : %s", tc.nm, e) 745 | } else if s != tc.res { 746 | t.Errorf("%s - FAIL expected '%s', got '%s'", tc.nm, 
tc.res, s) 747 | } 748 | } 749 | } 750 | 751 | func TestDecodeUnnecessaryEscapesAll(t *testing.T) { 752 | var url = "http://host/" 753 | 754 | for i := 0; i < 256; i++ { 755 | url += fmt.Sprintf("%%%02x", i) 756 | } 757 | s, err := NormalizeURLString(url, FlagDecodeUnnecessaryEscapes) 758 | if err != nil { 759 | t.Fatalf("parse error: %s", err) 760 | } 761 | 762 | const want = "http://host/%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22%23$%25&'()*+,-./0123456789:;%3C=%3E%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF" 763 | if s != want { 764 | t.Errorf("DecodeUnnecessaryEscapesAll:\nwant\n%s\ngot\n%s", want, s) 765 | } 766 | } 767 | 768 | func TestEncodeNecessaryEscapesAll(t *testing.T) { 769 | const base = "http://host/" 770 | var path []byte 771 | 772 | for i := 0; i < 256; i++ { 773 | // Since go1.12, url.Parse fails if the raw URL contains ASCII control characters, 774 | // meaning anything < 0x20 and 0x7f (DEL), so do not add those bytes to the constructed url. 
// encoding identifies the URL component being escaped; which reserved
// characters may stay literal differs from one component to another.
type encoding int

const (
	encodePath encoding = iota + 1
	encodeUserPassword
	encodeQueryComponent
	encodeFragment
)

// shouldEscape reports whether byte c has to be percent-encoded when it
// appears in the URL component selected by mode, following RFC 3986.
func shouldEscape(c byte, mode encoding) bool {
	const (
		unreservedMarks = "-._~"               // §2.3
		reserved        = ":/?#[]@!$&'()*+,;=" // §2.2 gen-delims + sub-delims
		genDelims       = ":/?#[]@"
	)

	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		// §2.3 Unreserved characters (alphanum).
		return false
	case strings.IndexByte(unreservedMarks, c) >= 0:
		return false
	case strings.IndexByte(reserved, c) < 0:
		// Neither unreserved nor reserved: always escaped.
		return true
	}

	// c is a reserved character; each component lets a different subset
	// of them through unescaped.
	switch mode {
	case encodePath: // §3.3
		// The RFC allows sub-delims, ':' and '@'. '/', '[' and ']' may
		// carry meaning for individual segments, but the path is only
		// handled as a whole here, so they pass too. Only '?' and '#'
		// remain.
		return c == '?' || c == '#'
	case encodeUserPassword: // §3.2.1
		// Sub-delims are allowed. ':' is special when userinfo is
		// parsed, so every gen-delim is escaped.
		return strings.IndexByte(genDelims, c) >= 0
	case encodeQueryComponent: // §3.4
		// Only '/' and '?' are let through.
		return c != '/' && c != '?'
	case encodeFragment: // §4.1
		// The grammar allows everything, so only '#' is escaped.
		return c == '#'
	}

	// Unknown mode: be conservative.
	return true
}

// escape returns s with every byte for which shouldEscape reports true
// replaced by its uppercase %XX form. In encodeQueryComponent mode a space
// becomes '+' instead. When nothing needs escaping, s is returned as is.
func escape(s string, mode encoding) string {
	const upperHex = "0123456789ABCDEF"

	// First pass: find out whether any work is needed and how much the
	// result grows (two extra bytes per %XX sequence, none for '+').
	dirty, grow := false, 0
	for i := 0; i < len(s); i++ {
		c := s[i]
		if !shouldEscape(c, mode) {
			continue
		}
		dirty = true
		if !(c == ' ' && mode == encodeQueryComponent) {
			grow += 2
		}
	}
	if !dirty {
		return s
	}

	// Second pass: build the escaped string.
	out := make([]byte, 0, len(s)+grow)
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c == ' ' && mode == encodeQueryComponent {
			out = append(out, '+')
		} else if shouldEscape(c, mode) {
			out = append(out, '%', upperHex[c>>4], upperHex[c&15])
		} else {
			out = append(out, c)
		}
	}
	return string(out)
}

// uiReplacer turns the escape sequences of ! ' ( ) * back into the literal
// characters.
var uiReplacer = strings.NewReplacer(
	"%21", "!",
	"%27", "'",
	"%28", "(",
	"%29", ")",
	"%2A", "*",
)

// unescapeUserinfo undoes the escaping of the characters ! ' ( ) * in s,
// which RFC 3986 does not require to be escaped in userinfo.
func unescapeUserinfo(s string) string {
	return uiReplacer.Replace(s)
}

// escapeURL serializes u into a URL string. The result has one of two
// shapes:
//
//	scheme:opaque
//	scheme://userinfo@host/path?query#fragment
//
// The first is used when u.Opaque is non-empty, the second otherwise.
// For the second shape:
//   - "scheme:" is left out when u.Scheme is empty.
//   - "userinfo@" is left out when u.User is nil.
//   - the host is left out when u.Host is empty.
//   - the whole "//userinfo@host" part is left out when u.Scheme and
//     u.Host are empty and u.User is nil.
//   - a '/' is inserted between host and path only when u.Host is
//     non-empty and u.Path does not already start with one.
//   - "?query" is left out when u.RawQuery is empty.
//   - "#fragment" is left out when u.Fragment is empty.
//
// The path and fragment are escaped with escape; u.RawQuery is written
// verbatim.
func escapeURL(u *url.URL) string {
	var out bytes.Buffer

	if u.Scheme != "" {
		out.WriteString(u.Scheme + ":")
	}

	switch {
	case u.Opaque != "":
		out.WriteString(u.Opaque)
	default:
		hasAuthority := u.Scheme != "" || u.Host != "" || u.User != nil
		if hasAuthority {
			out.WriteString("//")
			if u.User != nil {
				out.WriteString(unescapeUserinfo(u.User.String()) + "@")
			}
			out.WriteString(u.Host)
		}
		// Separate host and a relative-looking path with a slash.
		if u.Host != "" && u.Path != "" && u.Path[0] != '/' {
			out.WriteByte('/')
		}
		out.WriteString(escape(u.Path, encodePath))
	}

	if u.RawQuery != "" {
		out.WriteString("?" + u.RawQuery)
	}
	if u.Fragment != "" {
		out.WriteString("#" + escape(u.Fragment, encodeFragment))
	}
	return out.String()
}
16 | } 17 | 18 | var urltests = []URLTest{ 19 | // no path 20 | { 21 | "http://www.google.com", 22 | &url.URL{ 23 | Scheme: "http", 24 | Host: "www.google.com", 25 | }, 26 | "", 27 | }, 28 | // path 29 | { 30 | "http://www.google.com/", 31 | &url.URL{ 32 | Scheme: "http", 33 | Host: "www.google.com", 34 | Path: "/", 35 | }, 36 | "", 37 | }, 38 | // path with hex escaping 39 | { 40 | "http://www.google.com/file%20one%26two", 41 | &url.URL{ 42 | Scheme: "http", 43 | Host: "www.google.com", 44 | Path: "/file one&two", 45 | }, 46 | "http://www.google.com/file%20one&two", 47 | }, 48 | // user 49 | { 50 | "ftp://webmaster@www.google.com/", 51 | &url.URL{ 52 | Scheme: "ftp", 53 | User: url.User("webmaster"), 54 | Host: "www.google.com", 55 | Path: "/", 56 | }, 57 | "", 58 | }, 59 | // escape sequence in username 60 | { 61 | "ftp://john%20doe@www.google.com/", 62 | &url.URL{ 63 | Scheme: "ftp", 64 | User: url.User("john doe"), 65 | Host: "www.google.com", 66 | Path: "/", 67 | }, 68 | "ftp://john%20doe@www.google.com/", 69 | }, 70 | // query 71 | { 72 | "http://www.google.com/?q=go+language", 73 | &url.URL{ 74 | Scheme: "http", 75 | Host: "www.google.com", 76 | Path: "/", 77 | RawQuery: "q=go+language", 78 | }, 79 | "", 80 | }, 81 | // query with hex escaping: NOT parsed 82 | { 83 | "http://www.google.com/?q=go%20language", 84 | &url.URL{ 85 | Scheme: "http", 86 | Host: "www.google.com", 87 | Path: "/", 88 | RawQuery: "q=go%20language", 89 | }, 90 | "", 91 | }, 92 | // %20 outside query 93 | { 94 | "http://www.google.com/a%20b?q=c+d", 95 | &url.URL{ 96 | Scheme: "http", 97 | Host: "www.google.com", 98 | Path: "/a b", 99 | RawQuery: "q=c+d", 100 | }, 101 | "", 102 | }, 103 | // path without leading /, so no parsing 104 | { 105 | "http:www.google.com/?q=go+language", 106 | &url.URL{ 107 | Scheme: "http", 108 | Opaque: "www.google.com/", 109 | RawQuery: "q=go+language", 110 | }, 111 | "http:www.google.com/?q=go+language", 112 | }, 113 | // path without leading /, so no parsing 
114 | { 115 | "http:%2f%2fwww.google.com/?q=go+language", 116 | &url.URL{ 117 | Scheme: "http", 118 | Opaque: "%2f%2fwww.google.com/", 119 | RawQuery: "q=go+language", 120 | }, 121 | "http:%2f%2fwww.google.com/?q=go+language", 122 | }, 123 | // non-authority with path 124 | { 125 | "mailto:/webmaster@golang.org", 126 | &url.URL{ 127 | Scheme: "mailto", 128 | Path: "/webmaster@golang.org", 129 | }, 130 | "mailto:///webmaster@golang.org", // unfortunate compromise 131 | }, 132 | // non-authority 133 | { 134 | "mailto:webmaster@golang.org", 135 | &url.URL{ 136 | Scheme: "mailto", 137 | Opaque: "webmaster@golang.org", 138 | }, 139 | "", 140 | }, 141 | // unescaped :// in query should not create a scheme 142 | { 143 | "/foo?query=http://bad", 144 | &url.URL{ 145 | Path: "/foo", 146 | RawQuery: "query=http://bad", 147 | }, 148 | "", 149 | }, 150 | // leading // without scheme should create an authority 151 | { 152 | "//foo", 153 | &url.URL{ 154 | Host: "foo", 155 | }, 156 | "", 157 | }, 158 | // leading // without scheme, with userinfo, path, and query 159 | { 160 | "//user@foo/path?a=b", 161 | &url.URL{ 162 | User: url.User("user"), 163 | Host: "foo", 164 | Path: "/path", 165 | RawQuery: "a=b", 166 | }, 167 | "", 168 | }, 169 | // Three leading slashes isn't an authority, but doesn't return an error. 
170 | // (We can't return an error, as this code is also used via 171 | // ServeHTTP -> ReadRequest -> Parse, which is arguably a 172 | // different URL parsing context, but currently shares the 173 | // same codepath) 174 | { 175 | "///threeslashes", 176 | &url.URL{ 177 | Path: "///threeslashes", 178 | }, 179 | "", 180 | }, 181 | { 182 | "http://user:password@google.com", 183 | &url.URL{ 184 | Scheme: "http", 185 | User: url.UserPassword("user", "password"), 186 | Host: "google.com", 187 | }, 188 | "http://user:password@google.com", 189 | }, 190 | // unescaped @ in username should not confuse host 191 | { 192 | "http://j@ne:password@google.com", 193 | &url.URL{ 194 | Scheme: "http", 195 | User: url.UserPassword("j@ne", "password"), 196 | Host: "google.com", 197 | }, 198 | "http://j%40ne:password@google.com", 199 | }, 200 | // unescaped @ in password should not confuse host 201 | { 202 | "http://jane:p@ssword@google.com", 203 | &url.URL{ 204 | Scheme: "http", 205 | User: url.UserPassword("jane", "p@ssword"), 206 | Host: "google.com", 207 | }, 208 | "http://jane:p%40ssword@google.com", 209 | }, 210 | { 211 | "http://j@ne:password@google.com/p@th?q=@go", 212 | &url.URL{ 213 | Scheme: "http", 214 | User: url.UserPassword("j@ne", "password"), 215 | Host: "google.com", 216 | Path: "/p@th", 217 | RawQuery: "q=@go", 218 | }, 219 | "http://j%40ne:password@google.com/p@th?q=@go", 220 | }, 221 | { 222 | "http://www.google.com/?q=go+language#foo", 223 | &url.URL{ 224 | Scheme: "http", 225 | Host: "www.google.com", 226 | Path: "/", 227 | RawQuery: "q=go+language", 228 | Fragment: "foo", 229 | }, 230 | "", 231 | }, 232 | { 233 | "http://www.google.com/?q=go+language#foo%26bar", 234 | &url.URL{ 235 | Scheme: "http", 236 | Host: "www.google.com", 237 | Path: "/", 238 | RawQuery: "q=go+language", 239 | Fragment: "foo&bar", 240 | }, 241 | "http://www.google.com/?q=go+language#foo&bar", 242 | }, 243 | { 244 | "file:///home/adg/rabbits", 245 | &url.URL{ 246 | Scheme: "file", 247 | 
Host: "", 248 | Path: "/home/adg/rabbits", 249 | }, 250 | "file:///home/adg/rabbits", 251 | }, 252 | // "Windows" paths are no exception to the rule. 253 | // See golang.org/issue/6027, especially comment #9. 254 | { 255 | "file:///C:/FooBar/Baz.txt", 256 | &url.URL{ 257 | Scheme: "file", 258 | Host: "", 259 | Path: "/C:/FooBar/Baz.txt", 260 | }, 261 | "file:///C:/FooBar/Baz.txt", 262 | }, 263 | // case-insensitive scheme 264 | { 265 | "MaIlTo:webmaster@golang.org", 266 | &url.URL{ 267 | Scheme: "mailto", 268 | Opaque: "webmaster@golang.org", 269 | }, 270 | "mailto:webmaster@golang.org", 271 | }, 272 | // Relative path 273 | { 274 | "a/b/c", 275 | &url.URL{ 276 | Path: "a/b/c", 277 | }, 278 | "a/b/c", 279 | }, 280 | // escaped '?' in username and password 281 | { 282 | "http://%3Fam:pa%3Fsword@google.com", 283 | &url.URL{ 284 | Scheme: "http", 285 | User: url.UserPassword("?am", "pa?sword"), 286 | Host: "google.com", 287 | }, 288 | "", 289 | }, 290 | // escaped '?' and '#' in path 291 | { 292 | "http://example.com/%3F%23", 293 | &url.URL{ 294 | Scheme: "http", 295 | Host: "example.com", 296 | Path: "?#", 297 | }, 298 | "", 299 | }, 300 | // unescaped [ ] ! ' ( ) * in path 301 | { 302 | "http://example.com/[]!'()*", 303 | &url.URL{ 304 | Scheme: "http", 305 | Host: "example.com", 306 | Path: "[]!'()*", 307 | }, 308 | "http://example.com/[]!'()*", 309 | }, 310 | // escaped : / ? # [ ] @ in username and password 311 | { 312 | "http://%3A%2F%3F:%23%5B%5D%40@example.com", 313 | &url.URL{ 314 | Scheme: "http", 315 | User: url.UserPassword(":/?", "#[]@"), 316 | Host: "example.com", 317 | }, 318 | "", 319 | }, 320 | // unescaped ! $ & ' ( ) * + , ; = in username and password 321 | { 322 | "http://!$&'():*+,;=@example.com", 323 | &url.URL{ 324 | Scheme: "http", 325 | User: url.UserPassword("!$&'()", "*+,;="), 326 | Host: "example.com", 327 | }, 328 | "", 329 | }, 330 | // unescaped = : / . ? 
= in query component 331 | { 332 | "http://example.com/?q=http://google.com/?q=", 333 | &url.URL{ 334 | Scheme: "http", 335 | Host: "example.com", 336 | Path: "/", 337 | RawQuery: "q=http://google.com/?q=", 338 | }, 339 | "", 340 | }, 341 | // unescaped : / ? [ ] @ ! $ & ' ( ) * + , ; = in fragment 342 | { 343 | "http://example.com/#:/?%23[]@!$&'()*+,;=", 344 | &url.URL{ 345 | Scheme: "http", 346 | Host: "example.com", 347 | Path: "/", 348 | Fragment: ":/?#[]@!$&'()*+,;=", 349 | }, 350 | "", 351 | }, 352 | } 353 | 354 | func DoTestString(t *testing.T, parse func(string) (*url.URL, error), name string, tests []URLTest) { 355 | for _, tt := range tests { 356 | u, err := parse(tt.in) 357 | if err != nil { 358 | t.Errorf("%s(%q) returned error %s", name, tt.in, err) 359 | continue 360 | } 361 | expected := tt.in 362 | if len(tt.roundtrip) > 0 { 363 | expected = tt.roundtrip 364 | } 365 | s := escapeURL(u) 366 | if s != expected { 367 | t.Errorf("Escape(%s(%q)) == %q (expected %q)", name, tt.in, s, expected) 368 | } 369 | } 370 | } 371 | 372 | func TestURLString(t *testing.T) { 373 | DoTestString(t, url.Parse, "Parse", urltests) 374 | 375 | // no leading slash on path should prepend 376 | // slash on String() call 377 | noslash := URLTest{ 378 | "http://www.google.com/search", 379 | &url.URL{ 380 | Scheme: "http", 381 | Host: "www.google.com", 382 | Path: "search", 383 | }, 384 | "", 385 | } 386 | s := escapeURL(noslash.out) 387 | if s != noslash.in { 388 | t.Errorf("Expected %s; go %s", noslash.in, s) 389 | } 390 | } 391 | 392 | var resolveReferenceTests = []struct { 393 | base, rel, expected string 394 | }{ 395 | // Absolute URL references 396 | {"http://foo.com?a=b", "https://bar.com/", "https://bar.com/"}, 397 | {"http://foo.com/", "https://bar.com/?a=b", "https://bar.com/?a=b"}, 398 | {"http://foo.com/bar", "mailto:foo@example.com", "mailto:foo@example.com"}, 399 | 400 | // Path-absolute references 401 | {"http://foo.com/bar", "/baz", "http://foo.com/baz"}, 402 | 
{"http://foo.com/bar?a=b#f", "/baz", "http://foo.com/baz"}, 403 | {"http://foo.com/bar?a=b", "/baz?c=d", "http://foo.com/baz?c=d"}, 404 | 405 | // Scheme-relative 406 | {"https://foo.com/bar?a=b", "//bar.com/quux", "https://bar.com/quux"}, 407 | 408 | // Path-relative references: 409 | 410 | // ... current directory 411 | {"http://foo.com", ".", "http://foo.com/"}, 412 | {"http://foo.com/bar", ".", "http://foo.com/"}, 413 | {"http://foo.com/bar/", ".", "http://foo.com/bar/"}, 414 | 415 | // ... going down 416 | {"http://foo.com", "bar", "http://foo.com/bar"}, 417 | {"http://foo.com/", "bar", "http://foo.com/bar"}, 418 | {"http://foo.com/bar/baz", "quux", "http://foo.com/bar/quux"}, 419 | 420 | // ... going up 421 | {"http://foo.com/bar/baz", "../quux", "http://foo.com/quux"}, 422 | {"http://foo.com/bar/baz", "../../../../../quux", "http://foo.com/quux"}, 423 | {"http://foo.com/bar", "..", "http://foo.com/"}, 424 | {"http://foo.com/bar/baz", "./..", "http://foo.com/"}, 425 | // ".." in the middle (issue 3560) 426 | {"http://foo.com/bar/baz", "quux/dotdot/../tail", "http://foo.com/bar/quux/tail"}, 427 | {"http://foo.com/bar/baz", "quux/./dotdot/../tail", "http://foo.com/bar/quux/tail"}, 428 | {"http://foo.com/bar/baz", "quux/./dotdot/.././tail", "http://foo.com/bar/quux/tail"}, 429 | {"http://foo.com/bar/baz", "quux/./dotdot/./../tail", "http://foo.com/bar/quux/tail"}, 430 | {"http://foo.com/bar/baz", "quux/./dotdot/dotdot/././../../tail", "http://foo.com/bar/quux/tail"}, 431 | {"http://foo.com/bar/baz", "quux/./dotdot/dotdot/./.././../tail", "http://foo.com/bar/quux/tail"}, 432 | {"http://foo.com/bar/baz", "quux/./dotdot/dotdot/dotdot/./../../.././././tail", "http://foo.com/bar/quux/tail"}, 433 | {"http://foo.com/bar/baz", "quux/./dotdot/../dotdot/../dot/./tail/..", "http://foo.com/bar/quux/dot/"}, 434 | 435 | // Remove any dot-segments prior to forming the target URI. 
436 | // http://tools.ietf.org/html/rfc3986#section-5.2.4 437 | {"http://foo.com/dot/./dotdot/../foo/bar", "../baz", "http://foo.com/dot/baz"}, 438 | 439 | // Triple dot isn't special 440 | {"http://foo.com/bar", "...", "http://foo.com/..."}, 441 | 442 | // Fragment 443 | {"http://foo.com/bar", ".#frag", "http://foo.com/#frag"}, 444 | 445 | // RFC 3986: Normal Examples 446 | // http://tools.ietf.org/html/rfc3986#section-5.4.1 447 | {"http://a/b/c/d;p?q", "g:h", "g:h"}, 448 | {"http://a/b/c/d;p?q", "g", "http://a/b/c/g"}, 449 | {"http://a/b/c/d;p?q", "./g", "http://a/b/c/g"}, 450 | {"http://a/b/c/d;p?q", "g/", "http://a/b/c/g/"}, 451 | {"http://a/b/c/d;p?q", "/g", "http://a/g"}, 452 | {"http://a/b/c/d;p?q", "//g", "http://g"}, 453 | {"http://a/b/c/d;p?q", "?y", "http://a/b/c/d;p?y"}, 454 | {"http://a/b/c/d;p?q", "g?y", "http://a/b/c/g?y"}, 455 | {"http://a/b/c/d;p?q", "#s", "http://a/b/c/d;p?q#s"}, 456 | {"http://a/b/c/d;p?q", "g#s", "http://a/b/c/g#s"}, 457 | {"http://a/b/c/d;p?q", "g?y#s", "http://a/b/c/g?y#s"}, 458 | {"http://a/b/c/d;p?q", ";x", "http://a/b/c/;x"}, 459 | {"http://a/b/c/d;p?q", "g;x", "http://a/b/c/g;x"}, 460 | {"http://a/b/c/d;p?q", "g;x?y#s", "http://a/b/c/g;x?y#s"}, 461 | {"http://a/b/c/d;p?q", "", "http://a/b/c/d;p?q"}, 462 | {"http://a/b/c/d;p?q", ".", "http://a/b/c/"}, 463 | {"http://a/b/c/d;p?q", "./", "http://a/b/c/"}, 464 | {"http://a/b/c/d;p?q", "..", "http://a/b/"}, 465 | {"http://a/b/c/d;p?q", "../", "http://a/b/"}, 466 | {"http://a/b/c/d;p?q", "../g", "http://a/b/g"}, 467 | {"http://a/b/c/d;p?q", "../..", "http://a/"}, 468 | {"http://a/b/c/d;p?q", "../../", "http://a/"}, 469 | {"http://a/b/c/d;p?q", "../../g", "http://a/g"}, 470 | 471 | // RFC 3986: Abnormal Examples 472 | // http://tools.ietf.org/html/rfc3986#section-5.4.2 473 | {"http://a/b/c/d;p?q", "../../../g", "http://a/g"}, 474 | {"http://a/b/c/d;p?q", "../../../../g", "http://a/g"}, 475 | {"http://a/b/c/d;p?q", "/./g", "http://a/g"}, 476 | {"http://a/b/c/d;p?q", "/../g", 
"http://a/g"}, 477 | {"http://a/b/c/d;p?q", "g.", "http://a/b/c/g."}, 478 | {"http://a/b/c/d;p?q", ".g", "http://a/b/c/.g"}, 479 | {"http://a/b/c/d;p?q", "g..", "http://a/b/c/g.."}, 480 | {"http://a/b/c/d;p?q", "..g", "http://a/b/c/..g"}, 481 | {"http://a/b/c/d;p?q", "./../g", "http://a/b/g"}, 482 | {"http://a/b/c/d;p?q", "./g/.", "http://a/b/c/g/"}, 483 | {"http://a/b/c/d;p?q", "g/./h", "http://a/b/c/g/h"}, 484 | {"http://a/b/c/d;p?q", "g/../h", "http://a/b/c/h"}, 485 | {"http://a/b/c/d;p?q", "g;x=1/./y", "http://a/b/c/g;x=1/y"}, 486 | {"http://a/b/c/d;p?q", "g;x=1/../y", "http://a/b/c/y"}, 487 | {"http://a/b/c/d;p?q", "g?y/./x", "http://a/b/c/g?y/./x"}, 488 | {"http://a/b/c/d;p?q", "g?y/../x", "http://a/b/c/g?y/../x"}, 489 | {"http://a/b/c/d;p?q", "g#s/./x", "http://a/b/c/g#s/./x"}, 490 | {"http://a/b/c/d;p?q", "g#s/../x", "http://a/b/c/g#s/../x"}, 491 | 492 | // Extras. 493 | {"https://a/b/c/d;p?q", "//g?q", "https://g?q"}, 494 | {"https://a/b/c/d;p?q", "//g#s", "https://g#s"}, 495 | {"https://a/b/c/d;p?q", "//g/d/e/f?y#s", "https://g/d/e/f?y#s"}, 496 | {"https://a/b/c/d;p#s", "?y", "https://a/b/c/d;p?y"}, 497 | {"https://a/b/c/d;p?q#s", "?y", "https://a/b/c/d;p?y"}, 498 | } 499 | 500 | func TestResolveReference(t *testing.T) { 501 | mustParse := func(url_ string) *url.URL { 502 | u, err := url.Parse(url_) 503 | if err != nil { 504 | t.Fatalf("Expected URL to parse: %q, got error: %v", url_, err) 505 | } 506 | return u 507 | } 508 | opaque := &url.URL{Scheme: "scheme", Opaque: "opaque"} 509 | for _, test := range resolveReferenceTests { 510 | base := mustParse(test.base) 511 | rel := mustParse(test.rel) 512 | url := base.ResolveReference(rel) 513 | if escapeURL(url) != test.expected { 514 | t.Errorf("URL(%q).ResolveReference(%q) == %q, got %q", test.base, test.rel, test.expected, escapeURL(url)) 515 | } 516 | // Ensure that new instances are returned. 
517 | if base == url { 518 | t.Errorf("Expected URL.ResolveReference to return new URL instance.") 519 | } 520 | // Test the convenience wrapper too. 521 | url, err := base.Parse(test.rel) 522 | if err != nil { 523 | t.Errorf("URL(%q).Parse(%q) failed: %v", test.base, test.rel, err) 524 | } else if escapeURL(url) != test.expected { 525 | t.Errorf("URL(%q).Parse(%q) == %q, got %q", test.base, test.rel, test.expected, escapeURL(url)) 526 | } else if base == url { 527 | // Ensure that new instances are returned for the wrapper too. 528 | t.Errorf("Expected URL.Parse to return new URL instance.") 529 | } 530 | // Ensure Opaque resets the URL. 531 | url = base.ResolveReference(opaque) 532 | if *url != *opaque { 533 | t.Errorf("ResolveReference failed to resolve opaque URL: want %#v, got %#v", url, opaque) 534 | } 535 | // Test the convenience wrapper with an opaque URL too. 536 | url, err = base.Parse("scheme:opaque") 537 | if err != nil { 538 | t.Errorf(`URL(%q).Parse("scheme:opaque") failed: %v`, test.base, err) 539 | } else if *url != *opaque { 540 | t.Errorf("Parse failed to resolve opaque URL: want %#v, got %#v", url, opaque) 541 | } else if base == url { 542 | // Ensure that new instances are returned, again. 
543 | t.Errorf("Expected URL.Parse to return new URL instance.") 544 | } 545 | } 546 | } 547 | 548 | type shouldEscapeTest struct { 549 | in byte 550 | mode encoding 551 | escape bool 552 | } 553 | 554 | var shouldEscapeTests = []shouldEscapeTest{ 555 | // Unreserved characters (§2.3) 556 | {'a', encodePath, false}, 557 | {'a', encodeUserPassword, false}, 558 | {'a', encodeQueryComponent, false}, 559 | {'a', encodeFragment, false}, 560 | {'z', encodePath, false}, 561 | {'A', encodePath, false}, 562 | {'Z', encodePath, false}, 563 | {'0', encodePath, false}, 564 | {'9', encodePath, false}, 565 | {'-', encodePath, false}, 566 | {'-', encodeUserPassword, false}, 567 | {'-', encodeQueryComponent, false}, 568 | {'-', encodeFragment, false}, 569 | {'.', encodePath, false}, 570 | {'_', encodePath, false}, 571 | {'~', encodePath, false}, 572 | 573 | // User information (§3.2.1) 574 | {':', encodeUserPassword, true}, 575 | {'/', encodeUserPassword, true}, 576 | {'?', encodeUserPassword, true}, 577 | {'@', encodeUserPassword, true}, 578 | {'$', encodeUserPassword, false}, 579 | {'&', encodeUserPassword, false}, 580 | {'+', encodeUserPassword, false}, 581 | {',', encodeUserPassword, false}, 582 | {';', encodeUserPassword, false}, 583 | {'=', encodeUserPassword, false}, 584 | } 585 | 586 | func TestShouldEscape(t *testing.T) { 587 | for _, tt := range shouldEscapeTests { 588 | if shouldEscape(tt.in, tt.mode) != tt.escape { 589 | t.Errorf("shouldEscape(%q, %v) returned %v; expected %v", tt.in, tt.mode, !tt.escape, tt.escape) 590 | } 591 | } 592 | } 593 | -------------------------------------------------------------------------------- /urlnorm_test.go: -------------------------------------------------------------------------------- 1 | package purell 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | // Test cases merged from PR #1 8 | // Originally from https://github.com/jehiah/urlnorm/blob/master/test_urlnorm.py 9 | 10 | func assertMap(t *testing.T, cases map[string]string, f 
NormalizationFlags) { 11 | for bad, good := range cases { 12 | s, e := NormalizeURLString(bad, f) 13 | if e != nil { 14 | t.Errorf("%s normalizing %v to %v", e.Error(), bad, good) 15 | } else { 16 | if s != good { 17 | t.Errorf("source: %v expected: %v got: %v", bad, good, s) 18 | } 19 | } 20 | } 21 | } 22 | 23 | // This tests normalization to a unicode representation 24 | // percent escapes for unreserved values are unescaped to their unicode value 25 | // tests normalization to idna domains 26 | // test ip word handling, ipv6 address handling, and trailing domain periods 27 | // in general, this matches Google Chrome's unescaping for things in the address bar. 28 | // spaces are converted to '+' (perhaps controversial) 29 | // http://code.google.com/p/google-url/ probably is another good reference for this approach 30 | func TestUrlnorm(t *testing.T) { 31 | testcases := map[string]string{ 32 | "http://test.example/?a=%e3%82%82%26": "http://test.example/?a=%e3%82%82%26", 33 | //"http://test.example/?a=%e3%82%82%26": "http://test.example/?a=\xe3\x82\x82%26", //should return a unicode character 34 | "http://s.xn--q-bga.DE/": "http://s.xn--q-bga.de/", //should be in idna format 35 | "http://XBLA\u306eXbox.com": "http://xn--xblaxbox-jf4g.com", //test utf8 and unicode 36 | "http://президент.рф": "http://xn--d1abbgf6aiiy.xn--p1ai", 37 | "http://ПРЕЗИДЕНТ.РФ": "http://xn--d1abbgf6aiiy.xn--p1ai", 38 | "http://ab¥ヲ₩○.com": "http://xn--ab-ida8983azmfnvs.com", //test width folding 39 | "http://\u00e9.com": "http://xn--9ca.com", 40 | "http://e\u0301.com": "http://xn--9ca.com", 41 | "http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3": "http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3", 42 | //"http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3": 
"http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3", 43 | 44 | "http://test.example/\xe3\x82\xad": "http://test.example/%E3%82%AD", 45 | //"http://test.example/\xe3\x82\xad": "http://test.example/\xe3\x82\xad", 46 | "http://test.example/?p=%23val#test-%23-val%25": "http://test.example/?p=%23val#test-%23-val%25", //check that %23 (#) is not escaped where it shouldn't be 47 | 48 | "http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n": "http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n", 49 | //"http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n": "http://test.domain/I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xef\xbf\xbdliz\xc3\xa6ti\xc3\xb8n", 50 | } 51 | 52 | assertMap(t, testcases, FlagsSafe|FlagRemoveDotSegments) 53 | } 54 | --------------------------------------------------------------------------------