├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── LICENSE
├── README.md
├── bench_test.go
├── benchmarks
    └── v0.1.0
├── example_test.go
├── go.mod
├── go.sum
├── purell.go
├── purell_test.go
├── urlesc.go
├── urlesc_test.go
└── urlnorm_test.go


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - master
 6 |   pull_request:
 7 | 
 8 | jobs:
 9 |   ci:
10 |     runs-on: ubuntu-22.04
11 |     strategy:
12 |       matrix:
13 |         go: ['1.19.13', '1.20.10', '1.21.3']
14 | 
15 |     steps:
16 |       - uses: actions/checkout@v4
17 | 
18 |       - name: Setup Go
19 |         uses: actions/setup-go@v3
20 |         with:
21 |           go-version: ${{ matrix.go }}
22 |           cache: true
23 | 
24 |       - name: Test
25 |         run: go test -v ./...
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.sublime-*
2 | .DS_Store
3 | *.swp
4 | *.swo
5 | tags
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012-2022, The Go Authors
 2 | Copyright (c) 2012-2022, Martin Angers, Yuki Okushi & Contributors
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 8 | 
 9 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10 | 
11 | * Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Purell
  2 | 
  3 | Purell is a tiny Go library to normalize URLs. It returns a pure URL. Pure-ell. Sanitizer and all. Yeah, I know...
  4 | 
  5 | Based on the [wikipedia paper][wiki] and the [RFC 3986 document][rfc].
  6 | 
  7 | [![CI](https://github.com/PuerkitoBio/purell/actions/workflows/ci.yml/badge.svg)](https://github.com/PuerkitoBio/purell/actions/workflows/ci.yml)
  8 | 
  9 | ## Install
 10 | 
 11 | `go get github.com/PuerkitoBio/purell`
 12 | 
 13 | ## Changelog
 14 | 
 15 | *    **v1.1.1** : Fix failing test due to Go1.12 changes (thanks to @ianlancetaylor).
 16 | *    **2016-11-14 (v1.1.0)** : IDN: Conform to RFC 5895: Fold character width (thanks to @beeker1121).
 17 | *    **2016-07-27 (v1.0.0)** : Normalize IDN to ASCII (thanks to @zenovich).
 18 | *    **2015-02-08** : Add fix for relative paths issue ([PR #5][pr5]) and add fix for unnecessary encoding of reserved characters ([see issue #7][iss7]).
 19 | *    **v0.2.0** : Add benchmarks, Attempt IDN support.
 20 | *    **v0.1.0** : Initial release.
 21 | 
 22 | ## Examples
 23 | 
 24 | From `example_test.go` (note that in your code, you would import "github.com/PuerkitoBio/purell", and would prefix references to its methods and constants with "purell."):
 25 | 
 26 | ```go
 27 | package purell
 28 | 
 29 | import (
 30 |   "fmt"
 31 |   "net/url"
 32 | )
 33 | 
 34 | func ExampleNormalizeURLString() {
 35 |   if normalized, err := NormalizeURLString("hTTp://someWEBsite.com:80/Amazing%3f/url/",
 36 |     FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes); err != nil {
 37 |     panic(err)
 38 |   } else {
 39 |     fmt.Print(normalized)
 40 |   }
 41 |   // Output: http://somewebsite.com:80/Amazing%3F/url/
 42 | }
 43 | 
 44 | func ExampleMustNormalizeURLString() {
 45 |   normalized := MustNormalizeURLString("hTTpS://someWEBsite.com:443/Amazing%fa/url/",
 46 |     FlagsUnsafeGreedy)
 47 |   fmt.Print(normalized)
 48 | 
 49 |   // Output: http://somewebsite.com/Amazing%FA/url
 50 | }
 51 | 
 52 | func ExampleNormalizeURL() {
 53 |   if u, err := url.Parse("Http://SomeUrl.com:8080/a/b/.././c///g?c=3&a=1&b=9&c=0#target"); err != nil {
 54 |     panic(err)
 55 |   } else {
 56 |     normalized := NormalizeURL(u, FlagsUsuallySafeGreedy|FlagRemoveDuplicateSlashes|FlagRemoveFragment)
 57 |     fmt.Print(normalized)
 58 |   }
 59 | 
 60 |   // Output: http://someurl.com:8080/a/c/g?c=3&a=1&b=9&c=0
 61 | }
 62 | ```
 63 | 
 64 | ## API
 65 | 
 66 | As seen in the examples above, purell offers three functions: `NormalizeURLString(string, NormalizationFlags) (string, error)`, `MustNormalizeURLString(string, NormalizationFlags) (string)` and `NormalizeURL(*url.URL, NormalizationFlags) (string)`. They all normalize the provided URL based on the specified flags. Here are the available flags:
 67 | 
 68 | ```go
 69 | const (
 70 | 	// Safe normalizations
 71 | 	FlagLowercaseScheme           NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1
 72 | 	FlagLowercaseHost                                            // http://HOST -> http://host
 73 | 	FlagUppercaseEscapes                                         // http://host/t%ef -> http://host/t%EF
 74 | 	FlagDecodeUnnecessaryEscapes                                 // http://host/t%41 -> http://host/tA
 75 | 	FlagEncodeNecessaryEscapes                                   // http://host/!"#$ -> http://host/%21%22#$
 76 | 	FlagRemoveDefaultPort                                        // http://host:80 -> http://host
 77 | 	FlagRemoveEmptyQuerySeparator                                // http://host/path? -> http://host/path
 78 | 
 79 | 	// Usually safe normalizations
 80 | 	FlagRemoveTrailingSlash // http://host/path/ -> http://host/path
 81 | 	FlagAddTrailingSlash    // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags)
 82 | 	FlagRemoveDotSegments   // http://host/path/./a/b/../c -> http://host/path/a/c
 83 | 
 84 | 	// Unsafe normalizations
 85 | 	FlagRemoveDirectoryIndex   // http://host/path/index.html -> http://host/path/
 86 | 	FlagRemoveFragment         // http://host/path#fragment -> http://host/path
 87 | 	FlagForceHTTP              // https://host -> http://host
 88 | 	FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b
 89 | 	FlagRemoveWWW              // http://www.host/ -> http://host/
 90 | 	FlagAddWWW                 // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags)
 91 | 	FlagSortQuery              // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3
 92 | 
 93 | 	// Normalizations not in the wikipedia article, required to cover tests cases
 94 | 	// submitted by jehiah
 95 | 	FlagDecodeDWORDHost           // http://1113982867 -> http://66.102.7.147
 96 | 	FlagDecodeOctalHost           // http://0102.0146.07.0223 -> http://66.102.7.147
 97 | 	FlagDecodeHexHost             // http://0x42660793 -> http://66.102.7.147
 98 | 	FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path
 99 | 	FlagRemoveEmptyPortSeparator  // http://host:/path -> http://host/path
100 | 
101 | 	// Convenience set of safe normalizations
102 | 	FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator
103 | 
104 | 	// For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
105 | 	// while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".
106 | 
107 | 	// Convenience set of usually safe normalizations (includes FlagsSafe)
108 | 	FlagsUsuallySafeGreedy    NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
109 | 	FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments
110 | 
111 | 	// Convenience set of unsafe normalizations (includes FlagsUsuallySafe)
112 | 	FlagsUnsafeGreedy    NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
113 | 	FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery
114 | 
115 | 	// Convenience set of all available flags
116 | 	FlagsAllGreedy    = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
117 | 	FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
118 | )
119 | ```
120 | 
121 | For convenience, the set of flags `FlagsSafe`, `FlagsUsuallySafe[Greedy|NonGreedy]`, `FlagsUnsafe[Greedy|NonGreedy]` and `FlagsAll[Greedy|NonGreedy]` are provided for the similarly grouped normalizations on [wikipedia's URL normalization page][wiki]. You can add (using the bitwise OR `|` operator) or remove (using the bitwise AND NOT `&^` operator) individual flags from the sets if required, to build your own custom set.
122 | 
123 | The [full godoc reference is available on gopkgdoc][godoc].
124 | 
125 | Some things to note:
126 | 
127 | *    `FlagDecodeUnnecessaryEscapes`, `FlagEncodeNecessaryEscapes`, `FlagUppercaseEscapes` and `FlagRemoveEmptyQuerySeparator` are always implicitly set, because internally, the URL string is parsed as a URL object, which automatically decodes unnecessary escapes, uppercases and encodes necessary ones, and removes empty query separators (an unnecessary `?` at the end of the URL). These normalizations therefore cannot be turned off. For this reason, `FlagRemoveEmptyQuerySeparator` (as well as the other three) has been included in the `FlagsSafe` convenience set, instead of `FlagsUnsafe`, where Wikipedia puts it.
128 | 
129 | *    The `FlagDecodeUnnecessaryEscapes` decodes the following escapes (*from -> to*):
130 |     -    %24 -> $
131 |     -    %26 -> &
132 |     -    %2B-%3B -> +,-./0123456789:;
133 |     -    %3D -> =
134 |     -    %40-%5A -> @ABCDEFGHIJKLMNOPQRSTUVWXYZ
135 |     -    %5F -> _
136 |     -    %61-%7A -> abcdefghijklmnopqrstuvwxyz
137 |     -    %7E -> ~
138 | 
139 | 
140 | *    When the `NormalizeURL` function is used (passing a URL object), the source URL object is modified in place: after the call, it reflects the normalization.
141 | 
142 | *    The *replace IP with domain name* normalization (`http://208.77.188.166/ → http://www.example.com/`) is obviously not possible for a library without making some network requests. This is not implemented in purell.
143 | 
144 | *    The *remove unused query string parameters* and *remove default query parameters* normalizations are also not implemented, since they are very case-specific and quite trivial to do with a URL object.
145 | 
146 | ### Safe vs Usually Safe vs Unsafe
147 | 
148 | Purell allows you to control the level of risk you take while normalizing an URL. You can aggressively normalize, play it totally safe, or anything in between.
149 | 
150 | Consider the following URL:
151 | 
152 | `HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid`
153 | 
154 | Normalizing with the `FlagsSafe` gives:
155 | 
156 | `https://www.root.com/toto/tE%1F///a/./b/../c/?z=3&w=2&a=4&w=1#invalid`
157 | 
158 | With the `FlagsUsuallySafeGreedy`:
159 | 
160 | `https://www.root.com/toto/tE%1F///a/c?z=3&w=2&a=4&w=1#invalid`
161 | 
162 | And with `FlagsUnsafeGreedy`:
163 | 
164 | `http://root.com/toto/tE%1F/a/c?a=4&w=1&w=2&z=3`
165 | 
166 | ## TODOs
167 | 
168 | *    Add a class/default instance to allow specifying custom directory index names? At the moment, removing directory index removes `(^|/)((?:default|index)\.\w{1,4})$`.
169 | 
170 | ## Thanks / Contributions
171 | 
172 | @rogpeppe
173 | @jehiah
174 | @opennota
175 | @pchristopher1275
176 | @zenovich
177 | @beeker1121
178 | 
179 | ## License
180 | 
181 | The [BSD 3-Clause license][bsd].
182 | 
183 | [bsd]: http://opensource.org/licenses/BSD-3-Clause
184 | [wiki]: http://en.wikipedia.org/wiki/URL_normalization
185 | [rfc]: http://tools.ietf.org/html/rfc3986#section-6
186 | [godoc]: http://go.pkgdoc.org/github.com/PuerkitoBio/purell
187 | [pr5]: https://github.com/PuerkitoBio/purell/pull/5
188 | [iss7]: https://github.com/PuerkitoBio/purell/issues/7
189 | 


--------------------------------------------------------------------------------
/bench_test.go:
--------------------------------------------------------------------------------
 1 | package purell
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
// Input URLs fed to the benchmarks below. Each one is passed to
// NormalizeURLString together with the flag set named by the benchmark that
// uses it.
var (
	// Used by BenchmarkSafe.
	safeUrl        = "HttPS://..iaMHost..Test:443/paTh^A%ef//./%41PaTH/..//?"
	// Used by BenchmarkUsuallySafe.
	usuallySafeUrl = "HttPS://..iaMHost..Test:443/paTh^A%ef//./%41PaTH/../final/"
	// Used by BenchmarkUnsafe.
	unsafeUrl      = "HttPS://..www.iaMHost..Test:443/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment"
	// Used by BenchmarkAllDWORD: the host is a single decimal number.
	allDWORDUrl    = "HttPS://1113982867:/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment"
	// Used by BenchmarkAllOctal: the host is four zero-prefixed parts.
	allOctalUrl    = "HttPS://0102.0146.07.0223:/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment"
	// Used by BenchmarkAllHex: the host is a 0x-prefixed number.
	allHexUrl      = "HttPS://0x42660793:/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment"
	// Used by BenchmarkAllCombined: hex host wrapped in extra dots, empty port.
	allCombinedUrl = "HttPS://..0x42660793.:/paTh^A%ef//./%41PaTH/../final/index.html?t=val1&a=val4&z=val5&a=val1#fragment"
)
16 | 
17 | func BenchmarkSafe(b *testing.B) {
18 | 	for i := 0; i < b.N; i++ {
19 | 		NormalizeURLString(safeUrl, FlagsSafe)
20 | 	}
21 | }
22 | 
23 | func BenchmarkUsuallySafe(b *testing.B) {
24 | 	for i := 0; i < b.N; i++ {
25 | 		NormalizeURLString(usuallySafeUrl, FlagsUsuallySafeGreedy)
26 | 	}
27 | }
28 | 
29 | func BenchmarkUnsafe(b *testing.B) {
30 | 	for i := 0; i < b.N; i++ {
31 | 		NormalizeURLString(unsafeUrl, FlagsUnsafeGreedy)
32 | 	}
33 | }
34 | 
35 | func BenchmarkAllDWORD(b *testing.B) {
36 | 	for i := 0; i < b.N; i++ {
37 | 		NormalizeURLString(allDWORDUrl, FlagsAllGreedy)
38 | 	}
39 | }
40 | 
41 | func BenchmarkAllOctal(b *testing.B) {
42 | 	for i := 0; i < b.N; i++ {
43 | 		NormalizeURLString(allOctalUrl, FlagsAllGreedy)
44 | 	}
45 | }
46 | 
47 | func BenchmarkAllHex(b *testing.B) {
48 | 	for i := 0; i < b.N; i++ {
49 | 		NormalizeURLString(allHexUrl, FlagsAllGreedy)
50 | 	}
51 | }
52 | 
53 | func BenchmarkAllCombined(b *testing.B) {
54 | 	for i := 0; i < b.N; i++ {
55 | 		NormalizeURLString(allCombinedUrl, FlagsAllGreedy)
56 | 	}
57 | }
58 | 


--------------------------------------------------------------------------------
/benchmarks/v0.1.0:
--------------------------------------------------------------------------------
 1 | PASS
 2 | BenchmarkSafe	  500000	      6131 ns/op
 3 | BenchmarkUsuallySafe	  200000	      7864 ns/op
 4 | BenchmarkUnsafe	  100000	     28560 ns/op
 5 | BenchmarkAllDWORD	   50000	     38722 ns/op
 6 | BenchmarkAllOctal	   50000	     40941 ns/op
 7 | BenchmarkAllHex	   50000	     44063 ns/op
 8 | BenchmarkAllCombined	   50000	     33613 ns/op
 9 | ok  	github.com/PuerkitoBio/purell	17.404s
10 | 


--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
 1 | package purell
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"net/url"
 6 | )
 7 | 
 8 | func ExampleNormalizeURLString() {
 9 | 	if normalized, err := NormalizeURLString("hTTp://someWEBsite.com:80/Amazing%3f/url/",
10 | 		FlagLowercaseScheme|FlagLowercaseHost|FlagUppercaseEscapes); err != nil {
11 | 		panic(err)
12 | 	} else {
13 | 		fmt.Print(normalized)
14 | 	}
15 | 	// Output: http://somewebsite.com:80/Amazing%3F/url/
16 | }
17 | 
18 | func ExampleMustNormalizeURLString() {
19 | 	normalized := MustNormalizeURLString("hTTpS://someWEBsite.com:443/Amazing%fa/url/",
20 | 		FlagsUnsafeGreedy)
21 | 	fmt.Print(normalized)
22 | 
23 | 	// Output: http://somewebsite.com/Amazing%FA/url
24 | }
25 | 
26 | func ExampleNormalizeURL() {
27 | 	if u, err := url.Parse("Http://SomeUrl.com:8080/a/b/.././c///g?c=3&a=1&b=9&c=0#target"); err != nil {
28 | 		panic(err)
29 | 	} else {
30 | 		normalized := NormalizeURL(u, FlagsUsuallySafeGreedy|FlagRemoveDuplicateSlashes|FlagRemoveFragment)
31 | 		fmt.Print(normalized)
32 | 	}
33 | 
34 | 	// Output: http://someurl.com:8080/a/c/g?c=3&a=1&b=9&c=0
35 | }
36 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/PuerkitoBio/purell
2 | 
3 | go 1.21
4 | 
5 | require (
6 | 	golang.org/x/net v0.17.0
7 | 	golang.org/x/text v0.13.0
8 | )
9 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
2 | golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
3 | golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
4 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
5 | 


--------------------------------------------------------------------------------
/purell.go:
--------------------------------------------------------------------------------
  1 | /*
  2 | Package purell offers URL normalization as described on the wikipedia page:
  3 | http://en.wikipedia.org/wiki/URL_normalization
  4 | */
  5 | package purell
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"fmt"
 10 | 	"net/url"
 11 | 	"regexp"
 12 | 	"sort"
 13 | 	"strconv"
 14 | 	"strings"
 15 | 
 16 | 	"golang.org/x/net/idna"
 17 | 	"golang.org/x/text/unicode/norm"
 18 | 	"golang.org/x/text/width"
 19 | )
 20 | 
// NormalizationFlags is a bit set of normalization flags that determines how
// a URL will be normalized. Individual flags are combined with the bitwise OR
// operator.
type NormalizationFlags uint
 24 | 
// Individual normalization flags (one bit each, assigned through iota, so the
// declaration order determines each flag's value) followed by the convenience
// sets built from them.
const (
	// Safe normalizations
	FlagLowercaseScheme           NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1
	FlagLowercaseHost                                            // http://HOST -> http://host
	FlagUppercaseEscapes                                         // http://host/t%ef -> http://host/t%EF
	FlagDecodeUnnecessaryEscapes                                 // http://host/t%41 -> http://host/tA
	FlagEncodeNecessaryEscapes                                   // http://host/!"#$ -> http://host/%21%22#$
	FlagRemoveDefaultPort                                        // http://host:80 -> http://host
	FlagRemoveEmptyQuerySeparator                                // http://host/path? -> http://host/path

	// Usually safe normalizations
	FlagRemoveTrailingSlash // http://host/path/ -> http://host/path
	FlagAddTrailingSlash    // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags)
	FlagRemoveDotSegments   // http://host/path/./a/b/../c -> http://host/path/a/c

	// Unsafe normalizations
	FlagRemoveDirectoryIndex   // http://host/path/index.html -> http://host/path/
	FlagRemoveFragment         // http://host/path#fragment -> http://host/path
	FlagForceHTTP              // https://host -> http://host
	FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b
	FlagRemoveWWW              // http://www.host/ -> http://host/
	FlagAddWWW                 // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags)
	FlagSortQuery              // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3

	// Normalizations not in the wikipedia article, required to cover test cases
	// submitted by jehiah
	FlagDecodeDWORDHost           // http://1113982867 -> http://66.102.7.147
	FlagDecodeOctalHost           // http://0102.0146.07.0223 -> http://66.102.7.147
	FlagDecodeHexHost             // http://0x42660793 -> http://66.102.7.147
	FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path
	FlagRemoveEmptyPortSeparator  // http://host:/path -> http://host/path

	// Convenience set of safe normalizations
	FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator

	// For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
	// while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".

	// Convenience set of usually safe normalizations (includes FlagsSafe)
	FlagsUsuallySafeGreedy    NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
	FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments

	// Convenience set of unsafe normalizations (includes FlagsUsuallySafe)
	FlagsUnsafeGreedy    NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
	FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery

	// Convenience set of all available flags
	FlagsAllGreedy    = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
	FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
)
 75 | 
// Port suffixes, including the leading colon, that removeDefaultPort strips
// from the host for the http and https schemes respectively.
const (
	defaultHttpPort  = ":80"
	defaultHttpsPort = ":443"
)
 80 | 
 81 | // Regular expressions used by the normalizations
 82 | var rxPort = regexp.MustCompile(`(:\d+)/?$`)
 83 | var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`)
 84 | var rxDupSlashes = regexp.MustCompile(`/{2,}`)
 85 | var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)
 86 | var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`)
 87 | var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`)
 88 | var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`)
 89 | var rxHostInteriorDots = regexp.MustCompile(`\.+`)
 90 | var rxEmptyPort = regexp.MustCompile(`:+$`)
 91 | 
 92 | // Map of flags to implementation function.
 93 | // FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
 94 | // by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.
 95 | 
// flagsOrder lists the flags that have an implementation function, in the
// order NormalizeURL applies them. Since maps have undefined traversing order,
// the order is kept in this slice of keys and the functions themselves live in
// the flags map.
var flagsOrder = []NormalizationFlags{
	FlagLowercaseScheme,
	FlagLowercaseHost,
	FlagRemoveDefaultPort,
	FlagRemoveDirectoryIndex,
	FlagRemoveDotSegments,
	FlagRemoveFragment,
	FlagForceHTTP, // Must be after remove default port (because https=443/http=80)
	FlagRemoveDuplicateSlashes,
	FlagRemoveWWW,
	FlagAddWWW,
	FlagSortQuery,
	FlagDecodeDWORDHost,
	FlagDecodeOctalHost,
	FlagDecodeHexHost,
	FlagRemoveUnnecessaryHostDots,
	FlagRemoveEmptyPortSeparator,
	FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
	FlagAddTrailingSlash,
}
117 | 
// flags maps each flag to the function implementing it. Iteration order is
// unimportant here: NormalizeURL walks flagsOrder and looks each flag up in
// this map, so every flag listed in flagsOrder needs an entry here.
var flags = map[NormalizationFlags]func(*url.URL){
	FlagLowercaseScheme:           lowercaseScheme,
	FlagLowercaseHost:             lowercaseHost,
	FlagRemoveDefaultPort:         removeDefaultPort,
	FlagRemoveDirectoryIndex:      removeDirectoryIndex,
	FlagRemoveDotSegments:         removeDotSegments,
	FlagRemoveFragment:            removeFragment,
	FlagForceHTTP:                 forceHTTP,
	FlagRemoveDuplicateSlashes:    removeDuplicateSlashes,
	FlagRemoveWWW:                 removeWWW,
	FlagAddWWW:                    addWWW,
	FlagSortQuery:                 sortQuery,
	FlagDecodeDWORDHost:           decodeDWORDHost,
	FlagDecodeOctalHost:           decodeOctalHost,
	FlagDecodeHexHost:             decodeHexHost,
	FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
	FlagRemoveEmptyPortSeparator:  removeEmptyPortSeparator,
	FlagRemoveTrailingSlash:       removeTrailingSlash,
	FlagAddTrailingSlash:          addTrailingSlash,
}
139 | 
140 | // MustNormalizeURLString returns the normalized string, and panics if an error occurs.
141 | // It takes an URL string as input, as well as the normalization flags.
142 | func MustNormalizeURLString(u string, f NormalizationFlags) string {
143 | 	result, e := NormalizeURLString(u, f)
144 | 	if e != nil {
145 | 		panic(e)
146 | 	}
147 | 	return result
148 | }
149 | 
150 | // NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object.
151 | // It takes an URL string as input, as well as the normalization flags.
152 | func NormalizeURLString(u string, f NormalizationFlags) (string, error) {
153 | 	parsed, err := url.Parse(u)
154 | 	if err != nil {
155 | 		return "", err
156 | 	}
157 | 
158 | 	if f&FlagLowercaseHost == FlagLowercaseHost {
159 | 		parsed.Host = strings.ToLower(parsed.Host)
160 | 	}
161 | 
162 | 	// The idna package doesn't fully conform to RFC 5895
163 | 	// (https://tools.ietf.org/html/rfc5895), so we do it here.
164 | 	// Taken from Go 1.8 cycle source, courtesy of bradfitz.
165 | 	// TODO: Remove when (if?) idna package conforms to RFC 5895.
166 | 	parsed.Host = width.Fold.String(parsed.Host)
167 | 	parsed.Host = norm.NFC.String(parsed.Host)
168 | 	if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil {
169 | 		return "", err
170 | 	}
171 | 
172 | 	return NormalizeURL(parsed, f), nil
173 | }
174 | 
175 | // NormalizeURL returns the normalized string.
176 | // It takes a parsed URL object as input, as well as the normalization flags.
177 | func NormalizeURL(u *url.URL, f NormalizationFlags) string {
178 | 	for _, k := range flagsOrder {
179 | 		if f&k == k {
180 | 			flags[k](u)
181 | 		}
182 | 	}
183 | 	return escapeURL(u)
184 | }
185 | 
// lowercaseScheme converts the scheme of u to lower case.
func lowercaseScheme(u *url.URL) {
	if u.Scheme == "" {
		return
	}
	u.Scheme = strings.ToLower(u.Scheme)
}
191 | 
// lowercaseHost converts the host of u to lower case.
func lowercaseHost(u *url.URL) {
	if u.Host == "" {
		return
	}
	u.Host = strings.ToLower(u.Host)
}
197 | 
198 | func removeDefaultPort(u *url.URL) {
199 | 	if len(u.Host) > 0 {
200 | 		scheme := strings.ToLower(u.Scheme)
201 | 		u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string {
202 | 			if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) {
203 | 				return ""
204 | 			}
205 | 			return val
206 | 		})
207 | 	}
208 | }
209 | 
// removeTrailingSlash drops a single trailing slash from the path of u. When
// the path is empty, the slash is dropped from the host instead.
func removeTrailingSlash(u *url.URL) {
	switch {
	case u.Path != "":
		u.Path = strings.TrimSuffix(u.Path, "/")
	case u.Host != "":
		u.Host = strings.TrimSuffix(u.Host, "/")
	}
}
221 | 
// addTrailingSlash makes sure the path of u ends with a slash. When the path
// is empty, the slash is appended to the host instead.
func addTrailingSlash(u *url.URL) {
	switch {
	case u.Path != "":
		if !strings.HasSuffix(u.Path, "/") {
			u.Path += "/"
		}
	case u.Host != "":
		if !strings.HasSuffix(u.Host, "/") {
			u.Host += "/"
		}
	}
}
233 | 
// removeDotSegments resolves the "." and ".." segments of the path of u:
// "." segments are dropped and each ".." removes the segment kept before it,
// if any. An empty path is left untouched.
func removeDotSegments(u *url.URL) {
	if u.Path == "" {
		return
	}

	segments := strings.Split(u.Path, "/")
	kept := make([]string, 0, len(segments))
	for _, seg := range segments {
		switch seg {
		case ".":
			// Current directory: nothing to keep.
		case "..":
			if n := len(kept); n > 0 {
				kept = kept[:n-1]
			}
		default:
			kept = append(kept, seg)
		}
	}
	cleaned := strings.Join(kept, "/")

	// Special case if host does not end with / and new path does not begin with /
	if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(cleaned, "/") {
		cleaned = "/" + cleaned
	}

	// Special case if the last segment was a dot, make sure the path ends with a slash
	last := segments[len(segments)-1]
	if (last == "." || last == "..") && !strings.HasSuffix(cleaned, "/") {
		cleaned += "/"
	}

	u.Path = cleaned
}
261 | 
262 | func removeDirectoryIndex(u *url.URL) {
263 | 	if len(u.Path) > 0 {
264 | 		u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1")
265 | 	}
266 | }
267 | 
// removeFragment removes the fragment from u. Both the decoded Fragment and
// the encoded RawFragment hint are cleared, so that the URL object handed
// back to the caller does not keep a stale encoded fragment.
func removeFragment(u *url.URL) {
	u.Fragment = ""
	u.RawFragment = ""
}
271 | 
// forceHTTP downgrades an https scheme (compared case-insensitively) to http.
// Any other scheme is left untouched.
func forceHTTP(u *url.URL) {
	if strings.ToLower(u.Scheme) != "https" {
		return
	}
	u.Scheme = "http"
}
277 | 
278 | func removeDuplicateSlashes(u *url.URL) {
279 | 	if len(u.Path) > 0 {
280 | 		u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/")
281 | 	}
282 | }
283 | 
// removeWWW strips a leading "www." (matched case-insensitively) from the
// host of u.
func removeWWW(u *url.URL) {
	const prefix = "www."

	if u.Host == "" {
		return
	}
	if !strings.HasPrefix(strings.ToLower(u.Host), prefix) {
		return
	}
	u.Host = u.Host[len(prefix):]
}
289 | 
// addWWW prepends "www." to the host of u unless the host is empty or already
// starts with that prefix (matched case-insensitively).
func addWWW(u *url.URL) {
	const prefix = "www."

	if u.Host == "" {
		return
	}
	if strings.HasPrefix(strings.ToLower(u.Host), prefix) {
		return
	}
	u.Host = prefix + u.Host
}
295 | 
// sortQuery rebuilds the raw query of u with its parameters sorted by key
// and, for a repeated key, by value. A URL without query parameters is left
// untouched.
//
// Both keys and values are query-escaped when written back. u.Query()
// returns them decoded, so a key containing a reserved character such as '&',
// '=' or a space would otherwise change the meaning of the rebuilt query.
func sortQuery(u *url.URL) {
	q := u.Query()
	if len(q) == 0 {
		return
	}

	keys := make([]string, 0, len(q))
	for k := range q {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var buf bytes.Buffer
	for _, k := range keys {
		values := q[k]
		sort.Strings(values)

		escapedKey := url.QueryEscape(k)
		for _, v := range values {
			if buf.Len() > 0 {
				buf.WriteByte('&')
			}
			buf.WriteString(escapedKey)
			buf.WriteByte('=')
			buf.WriteString(url.QueryEscape(v))
		}
	}

	// Rebuild the raw query string
	u.RawQuery = buf.String()
}
322 | 
323 | func decodeDWORDHost(u *url.URL) {
324 | 	if len(u.Host) > 0 {
325 | 		if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 {
326 | 			var parts [4]int64
327 | 
328 | 			dword, _ := strconv.ParseInt(matches[1], 10, 0)
329 | 			for i, shift := range []uint{24, 16, 8, 0} {
330 | 				parts[i] = dword >> shift & 0xFF
331 | 			}
332 | 			u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2])
333 | 		}
334 | 	}
335 | }
336 | 
337 | func decodeOctalHost(u *url.URL) {
338 | 	if len(u.Host) > 0 {
339 | 		if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 {
340 | 			var parts [4]int64
341 | 
342 | 			for i := 1; i <= 4; i++ {
343 | 				parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0)
344 | 			}
345 | 			u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5])
346 | 		}
347 | 	}
348 | }
349 | 
350 | func decodeHexHost(u *url.URL) {
351 | 	if len(u.Host) > 0 {
352 | 		if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 {
353 | 			// Conversion is safe because of regex validation
354 | 			parsed, _ := strconv.ParseInt(matches[1], 16, 0)
355 | 			// Set host as DWORD (base 10) encoded host
356 | 			u.Host = fmt.Sprintf("%d%s", parsed, matches[2])
357 | 			// The rest is the same as decoding a DWORD host
358 | 			decodeDWORDHost(u)
359 | 		}
360 | 	}
361 | }
362 | 
363 | func removeUnncessaryHostDots(u *url.URL) {
364 | 	if len(u.Host) > 0 {
365 | 		if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 {
366 | 			// Trim the leading and trailing dots
367 | 			u.Host = strings.Trim(matches[1], ".")
368 | 			if len(matches) > 2 {
369 | 				u.Host += matches[2]
370 | 			}
371 | 		}
372 | 		u.Host = rxHostInteriorDots.ReplaceAllString(u.Host, ".")
373 | 	}
374 | }
375 | 
376 | func removeEmptyPortSeparator(u *url.URL) {
377 | 	if len(u.Host) > 0 {
378 | 		u.Host = rxEmptyPort.ReplaceAllString(u.Host, "")
379 | 	}
380 | }
381 | 


--------------------------------------------------------------------------------
/purell_test.go:
--------------------------------------------------------------------------------
  1 | package purell
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"net/url"
  6 | 	"testing"
  7 | 	"unicode"
  8 | )
  9 | 
 10 | type testCase struct { // one table-driven normalization scenario, consumed by runCase
 11 | 	nm     string // case name, printed in log and failure messages
 12 | 	src    string // raw input URL
 13 | 	flgs   NormalizationFlags // normalization flags applied to src
 14 | 	res    string // expected normalized URL string
 15 | 	parsed bool // true: url.Parse then NormalizeURL on the *url.URL; false: NormalizeURLString on src
 16 | }
 17 | 
 18 | var (
 19 | 	cases = [...]*testCase{ // table run by TestRunner; fields are positional: nm, src, flgs, res, parsed
 20 | 		{
 21 | 			"LowerScheme",
 22 | 			"HTTP://www.SRC.ca",
 23 | 			FlagLowercaseScheme,
 24 | 			"http://www.SRC.ca",
 25 | 			false,
 26 | 		},
 27 | 		{
 28 | 			"LowerScheme2",
 29 | 			"http://www.SRC.ca",
 30 | 			FlagLowercaseScheme,
 31 | 			"http://www.SRC.ca",
 32 | 			false,
 33 | 		},
 34 | 		{
 35 | 			"LowerHost",
 36 | 			"HTTP://www.SRC.ca/",
 37 | 			FlagLowercaseHost,
 38 | 			"http://www.src.ca/", // Since Go1.1, scheme is automatically lowercased
 39 | 			false,
 40 | 		},
 41 | 		{
 42 | 			"UpperEscapes",
 43 | 			`http://www.whatever.com/Some%aa%20Special%8Ecases/`,
 44 | 			FlagUppercaseEscapes,
 45 | 			"http://www.whatever.com/Some%AA%20Special%8Ecases/",
 46 | 			false,
 47 | 		},
 48 | 		{
 49 | 			"UnnecessaryEscapes",
 50 | 			`http://www.toto.com/%41%42%2E%44/%32%33%52%2D/%5f%7E`,
 51 | 			FlagDecodeUnnecessaryEscapes,
 52 | 			"http://www.toto.com/AB.D/23R-/_~",
 53 | 			false,
 54 | 		},
 55 | 		{
 56 | 			"RemoveDefaultPort",
 57 | 			"HTTP://www.SRC.ca:80/",
 58 | 			FlagRemoveDefaultPort,
 59 | 			"http://www.SRC.ca/", // Since Go1.1, scheme is automatically lowercased
 60 | 			false,
 61 | 		},
 62 | 		{
 63 | 			"RemoveDefaultPort2",
 64 | 			"HTTP://www.SRC.ca:80",
 65 | 			FlagRemoveDefaultPort,
 66 | 			"http://www.SRC.ca", // Since Go1.1, scheme is automatically lowercased
 67 | 			false,
 68 | 		},
 69 | 		{
 70 | 			"RemoveDefaultPort3",
 71 | 			"HTTP://www.SRC.ca:8080",
 72 | 			FlagRemoveDefaultPort,
 73 | 			"http://www.SRC.ca:8080", // Since Go1.1, scheme is automatically lowercased
 74 | 			false,
 75 | 		},
 76 | 		{
 77 | 			"Safe",
 78 | 			"HTTP://www.SRC.ca:80/to%1ato%8b%ee/OKnow%41%42%43%7e",
 79 | 			FlagsSafe,
 80 | 			"http://www.src.ca/to%1Ato%8B%EE/OKnowABC~",
 81 | 			false,
 82 | 		},
 83 | 		{
 84 | 			"BothLower",
 85 | 			"HTTP://www.SRC.ca:80/to%1ato%8b%ee/OKnow%41%42%43%7e",
 86 | 			FlagLowercaseHost | FlagLowercaseScheme,
 87 | 			"http://www.src.ca:80/to%1Ato%8B%EE/OKnowABC~",
 88 | 			false,
 89 | 		},
 90 | 		{
 91 | 			"RemoveTrailingSlash",
 92 | 			"HTTP://www.SRC.ca:80/",
 93 | 			FlagRemoveTrailingSlash,
 94 | 			"http://www.SRC.ca:80", // Since Go1.1, scheme is automatically lowercased
 95 | 			false,
 96 | 		},
 97 | 		{
 98 | 			"RemoveTrailingSlash2",
 99 | 			"HTTP://www.SRC.ca:80/toto/titi/",
100 | 			FlagRemoveTrailingSlash,
101 | 			"http://www.SRC.ca:80/toto/titi", // Since Go1.1, scheme is automatically lowercased
102 | 			false,
103 | 		},
104 | 		{
105 | 			"RemoveTrailingSlash3",
106 | 			"HTTP://www.SRC.ca:80/toto/titi/fin/?a=1",
107 | 			FlagRemoveTrailingSlash,
108 | 			"http://www.SRC.ca:80/toto/titi/fin?a=1", // Since Go1.1, scheme is automatically lowercased
109 | 			false,
110 | 		},
111 | 		{
112 | 			"AddTrailingSlash",
113 | 			"HTTP://www.SRC.ca:80",
114 | 			FlagAddTrailingSlash,
115 | 			"http://www.SRC.ca:80/", // Since Go1.1, scheme is automatically lowercased
116 | 			false,
117 | 		},
118 | 		{
119 | 			"AddTrailingSlash2",
120 | 			"HTTP://www.SRC.ca:80/toto/titi.html",
121 | 			FlagAddTrailingSlash,
122 | 			"http://www.SRC.ca:80/toto/titi.html/", // Since Go1.1, scheme is automatically lowercased
123 | 			false,
124 | 		},
125 | 		{
126 | 			"AddTrailingSlash3",
127 | 			"HTTP://www.SRC.ca:80/toto/titi/fin?a=1",
128 | 			FlagAddTrailingSlash,
129 | 			"http://www.SRC.ca:80/toto/titi/fin/?a=1", // Since Go1.1, scheme is automatically lowercased
130 | 			false,
131 | 		},
132 | 		{
133 | 			"RemoveDotSegments",
134 | 			"HTTP://root/a/b/./../../c/",
135 | 			FlagRemoveDotSegments,
136 | 			"http://root/c/", // Since Go1.1, scheme is automatically lowercased
137 | 			false,
138 | 		},
139 | 		{
140 | 			"RemoveDotSegments2",
141 | 			"HTTP://root/../a/b/./../c/../d",
142 | 			FlagRemoveDotSegments,
143 | 			"http://root/a/d", // Since Go1.1, scheme is automatically lowercased
144 | 			false,
145 | 		},
146 | 		{
147 | 			"UsuallySafe",
148 | 			"HTTP://www.SRC.ca:80/to%1ato%8b%ee/./c/d/../OKnow%41%42%43%7e/?a=b#test",
149 | 			FlagsUsuallySafeGreedy,
150 | 			"http://www.src.ca/to%1Ato%8B%EE/c/OKnowABC~?a=b#test",
151 | 			false,
152 | 		},
153 | 		{
154 | 			"RemoveDirectoryIndex",
155 | 			"HTTP://root/a/b/c/default.aspx",
156 | 			FlagRemoveDirectoryIndex,
157 | 			"http://root/a/b/c/", // Since Go1.1, scheme is automatically lowercased
158 | 			false,
159 | 		},
160 | 		{
161 | 			"RemoveDirectoryIndex2",
162 | 			"HTTP://root/a/b/c/default#a=b",
163 | 			FlagRemoveDirectoryIndex,
164 | 			"http://root/a/b/c/default#a=b", // Since Go1.1, scheme is automatically lowercased
165 | 			false,
166 | 		},
167 | 		{
168 | 			"RemoveFragment",
169 | 			"HTTP://root/a/b/c/default#toto=tata",
170 | 			FlagRemoveFragment,
171 | 			"http://root/a/b/c/default", // Since Go1.1, scheme is automatically lowercased
172 | 			false,
173 | 		},
174 | 		{
175 | 			"ForceHTTP",
176 | 			"https://root/a/b/c/default#toto=tata",
177 | 			FlagForceHTTP,
178 | 			"http://root/a/b/c/default#toto=tata",
179 | 			false,
180 | 		},
181 | 		{
182 | 			"RemoveDuplicateSlashes",
183 | 			"https://root/a//b///c////default#toto=tata",
184 | 			FlagRemoveDuplicateSlashes,
185 | 			"https://root/a/b/c/default#toto=tata",
186 | 			false,
187 | 		},
188 | 		{
189 | 			"RemoveDuplicateSlashes2",
190 | 			"https://root//a//b///c////default#toto=tata",
191 | 			FlagRemoveDuplicateSlashes,
192 | 			"https://root/a/b/c/default#toto=tata",
193 | 			false,
194 | 		},
195 | 		{
196 | 			"RemoveWWW",
197 | 			"https://www.root/a/b/c/",
198 | 			FlagRemoveWWW,
199 | 			"https://root/a/b/c/",
200 | 			false,
201 | 		},
202 | 		{
203 | 			"RemoveWWW2",
204 | 			"https://WwW.Root/a/b/c/",
205 | 			FlagRemoveWWW,
206 | 			"https://Root/a/b/c/",
207 | 			false,
208 | 		},
209 | 		{
210 | 			"AddWWW",
211 | 			"https://Root/a/b/c/",
212 | 			FlagAddWWW,
213 | 			"https://www.Root/a/b/c/",
214 | 			false,
215 | 		},
216 | 		{
217 | 			"SortQuery",
218 | 			"http://root/toto/?b=4&a=1&c=3&b=2&a=5",
219 | 			FlagSortQuery,
220 | 			"http://root/toto/?a=1&a=5&b=2&b=4&c=3",
221 | 			false,
222 | 		},
223 | 		{
224 | 			"RemoveEmptyQuerySeparator",
225 | 			"http://root/toto/?",
226 | 			FlagRemoveEmptyQuerySeparator,
227 | 			"http://root/toto/",
228 | 			false,
229 | 		},
230 | 		{
231 | 			"Unsafe",
232 | 			"HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid",
233 | 			FlagsUnsafeGreedy,
234 | 			"http://root.com/toto/tE%1F/a/c?a=4&w=1&w=2&z=3",
235 | 			false,
236 | 		},
237 | 		{
238 | 			"Safe2",
239 | 			"HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid",
240 | 			FlagsSafe,
241 | 			"https://www.root.com/toto/tE%1F///a/./b/../c/?z=3&w=2&a=4&w=1#invalid",
242 | 			false,
243 | 		},
244 | 		{
245 | 			"UsuallySafe2",
246 | 			"HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid",
247 | 			FlagsUsuallySafeGreedy,
248 | 			"https://www.root.com/toto/tE%1F///a/c?z=3&w=2&a=4&w=1#invalid",
249 | 			false,
250 | 		},
251 | 		{
252 | 			"AddTrailingSlashBug",
253 | 			"http://src.ca/",
254 | 			FlagsAllNonGreedy,
255 | 			"http://www.src.ca/",
256 | 			false,
257 | 		},
258 | 		{
259 | 			"SourceModified",
260 | 			"HTTPS://www.RooT.com/toto/t%45%1f///a/./b/../c/?z=3&w=2&a=4&w=1#invalid",
261 | 			FlagsUnsafeGreedy,
262 | 			"http://root.com/toto/tE%1F/a/c?a=4&w=1&w=2&z=3",
263 | 			true,
264 | 		},
265 | 		{
266 | 			"IPv6-1",
267 | 			"http://[2001:db8:1f70::999:de8:7648:6e8]/test",
268 | 			FlagsSafe | FlagRemoveDotSegments,
269 | 			"http://[2001:db8:1f70::999:de8:7648:6e8]/test",
270 | 			false,
271 | 		},
272 | 		{
273 | 			"IPv6-2",
274 | 			"http://[::ffff:192.168.1.1]/test",
275 | 			FlagsSafe | FlagRemoveDotSegments,
276 | 			"http://[::ffff:192.168.1.1]/test",
277 | 			false,
278 | 		},
279 | 		{
280 | 			"IPv6-3",
281 | 			"http://[::ffff:192.168.1.1]:80/test",
282 | 			FlagsSafe | FlagRemoveDotSegments,
283 | 			"http://[::ffff:192.168.1.1]/test",
284 | 			false,
285 | 		},
286 | 		{
287 | 			"IPv6-4",
288 | 			"htTps://[::fFff:192.168.1.1]:443/test",
289 | 			FlagsSafe | FlagRemoveDotSegments,
290 | 			"https://[::ffff:192.168.1.1]/test",
291 | 			false,
292 | 		},
293 | 		{
294 | 			"FTP",
295 | 			"ftp://user:pass@ftp.foo.net/foo/bar",
296 | 			FlagsSafe | FlagRemoveDotSegments,
297 | 			"ftp://user:pass@ftp.foo.net/foo/bar",
298 | 			false,
299 | 		},
300 | 		{
301 | 			"Standard-1",
302 | 			"http://www.foo.com:80/foo",
303 | 			FlagsSafe | FlagRemoveDotSegments,
304 | 			"http://www.foo.com/foo",
305 | 			false,
306 | 		},
307 | 		{
308 | 			"Standard-2",
309 | 			"http://www.foo.com:8000/foo",
310 | 			FlagsSafe | FlagRemoveDotSegments,
311 | 			"http://www.foo.com:8000/foo",
312 | 			false,
313 | 		},
314 | 		{
315 | 			"Standard-3",
316 | 			"http://www.foo.com/%7ebar",
317 | 			FlagsSafe | FlagRemoveDotSegments,
318 | 			"http://www.foo.com/~bar",
319 | 			false,
320 | 		},
321 | 		{
322 | 			"Standard-4",
323 | 			"http://www.foo.com/%7Ebar",
324 | 			FlagsSafe | FlagRemoveDotSegments,
325 | 			"http://www.foo.com/~bar",
326 | 			false,
327 | 		},
328 | 		{
329 | 			"Standard-5",
330 | 			"http://USER:pass@www.Example.COM/foo/bar",
331 | 			FlagsSafe | FlagRemoveDotSegments,
332 | 			"http://USER:pass@www.example.com/foo/bar",
333 | 			false,
334 | 		},
335 | 		{
336 | 			"Standard-6",
337 | 			"http://test.example/?a=%26&b=1",
338 | 			FlagsSafe | FlagRemoveDotSegments,
339 | 			"http://test.example/?a=%26&b=1",
340 | 			false,
341 | 		},
342 | 		{
343 | 			"Standard-7",
344 | 			"http://test.example/%25/?p=%20val%20%25",
345 | 			FlagsSafe | FlagRemoveDotSegments,
346 | 			"http://test.example/%25/?p=%20val%20%25",
347 | 			false,
348 | 		},
349 | 		{
350 | 			"Standard-8",
351 | 			"http://test.example/path/with a%20space+/",
352 | 			FlagsSafe | FlagRemoveDotSegments,
353 | 			"http://test.example/path/with%20a%20space+/",
354 | 			false,
355 | 		},
356 | 		{
357 | 			"Standard-9",
358 | 			"http://test.example/?",
359 | 			FlagsSafe | FlagRemoveDotSegments,
360 | 			"http://test.example/",
361 | 			false,
362 | 		},
363 | 		{
364 | 			"Standard-10",
365 | 			"http://a.COM/path/?b&a",
366 | 			FlagsSafe | FlagRemoveDotSegments,
367 | 			"http://a.com/path/?b&a",
368 | 			false,
369 | 		},
370 | 		{
371 | 			"StandardCasesAddTrailingSlash",
372 | 			"http://test.example?",
373 | 			FlagsSafe | FlagAddTrailingSlash,
374 | 			"http://test.example/",
375 | 			false,
376 | 		},
377 | 		{
378 | 			"OctalIP-1",
379 | 			"http://0123.011.0.4/",
380 | 			FlagsSafe | FlagDecodeOctalHost,
381 | 			"http://0123.011.0.4/",
382 | 			false,
383 | 		},
384 | 		{
385 | 			"OctalIP-2",
386 | 			"http://0102.0146.07.0223/",
387 | 			FlagsSafe | FlagDecodeOctalHost,
388 | 			"http://66.102.7.147/",
389 | 			false,
390 | 		},
391 | 		{
392 | 			"OctalIP-3",
393 | 			"http://0102.0146.07.0223.:23/",
394 | 			FlagsSafe | FlagDecodeOctalHost,
395 | 			"http://66.102.7.147.:23/",
396 | 			false,
397 | 		},
398 | 		{
399 | 			"OctalIP-4",
400 | 			"http://USER:pass@0102.0146.07.0223../",
401 | 			FlagsSafe | FlagDecodeOctalHost,
402 | 			"http://USER:pass@66.102.7.147../",
403 | 			false,
404 | 		},
405 | 		{
406 | 			"DWORDIP-1",
407 | 			"http://123.1113982867/",
408 | 			FlagsSafe | FlagDecodeDWORDHost,
409 | 			"http://123.1113982867/",
410 | 			false,
411 | 		},
412 | 		{
413 | 			"DWORDIP-2",
414 | 			"http://1113982867/",
415 | 			FlagsSafe | FlagDecodeDWORDHost,
416 | 			"http://66.102.7.147/",
417 | 			false,
418 | 		},
419 | 		{
420 | 			"DWORDIP-3",
421 | 			"http://1113982867.:23/",
422 | 			FlagsSafe | FlagDecodeDWORDHost,
423 | 			"http://66.102.7.147.:23/",
424 | 			false,
425 | 		},
426 | 		{
427 | 			"DWORDIP-4",
428 | 			"http://USER:pass@1113982867../",
429 | 			FlagsSafe | FlagDecodeDWORDHost,
430 | 			"http://USER:pass@66.102.7.147../",
431 | 			false,
432 | 		},
433 | 		{
434 | 			"HexIP-1",
435 | 			"http://0x123.1113982867/",
436 | 			FlagsSafe | FlagDecodeHexHost,
437 | 			"http://0x123.1113982867/",
438 | 			false,
439 | 		},
440 | 		{
441 | 			"HexIP-2",
442 | 			"http://0x42660793/",
443 | 			FlagsSafe | FlagDecodeHexHost,
444 | 			"http://66.102.7.147/",
445 | 			false,
446 | 		},
447 | 		{
448 | 			"HexIP-3",
449 | 			"http://0x42660793.:23/",
450 | 			FlagsSafe | FlagDecodeHexHost,
451 | 			"http://66.102.7.147.:23/",
452 | 			false,
453 | 		},
454 | 		{
455 | 			"HexIP-4",
456 | 			"http://USER:pass@0x42660793../",
457 | 			FlagsSafe | FlagDecodeHexHost,
458 | 			"http://USER:pass@66.102.7.147../",
459 | 			false,
460 | 		},
461 | 		{
462 | 			"UnnecessaryHostDots-1",
463 | 			"http://.www.foo.com../foo/bar.html",
464 | 			FlagsSafe | FlagRemoveUnnecessaryHostDots,
465 | 			"http://www.foo.com/foo/bar.html",
466 | 			false,
467 | 		},
468 | 		{
469 | 			"UnnecessaryHostDots-2",
470 | 			"http://www.foo.com./foo/bar.html",
471 | 			FlagsSafe | FlagRemoveUnnecessaryHostDots,
472 | 			"http://www.foo.com/foo/bar.html",
473 | 			false,
474 | 		},
475 | 		{
476 | 			"UnnecessaryHostDots-3",
477 | 			"http://www.foo.com.:81/foo",
478 | 			FlagsSafe | FlagRemoveUnnecessaryHostDots,
479 | 			"http://www.foo.com:81/foo",
480 | 			false,
481 | 		},
482 | 		{
483 | 			"UnnecessaryHostDots-4",
484 | 			"http://www.example.com./",
485 | 			FlagsSafe | FlagRemoveUnnecessaryHostDots,
486 | 			"http://www.example.com/",
487 | 			false,
488 | 		},
489 | 		{
490 | 			"UnnecessaryHostDots-5",
491 | 			"http://www..example...com/",
492 | 			FlagsSafe | FlagRemoveUnnecessaryHostDots,
493 | 			"http://www.example.com/",
494 | 			false,
495 | 		},
496 | 		{
497 | 			"EmptyPort-1",
498 | 			"http://www.thedraymin.co.uk:/main/?p=308",
499 | 			FlagsSafe | FlagRemoveEmptyPortSeparator,
500 | 			"http://www.thedraymin.co.uk/main/?p=308",
501 | 			false,
502 | 		},
503 | 		{
504 | 			"EmptyPort-2",
505 | 			"http://www.src.ca:",
506 | 			FlagsSafe | FlagRemoveEmptyPortSeparator,
507 | 			"http://www.src.ca",
508 | 			false,
509 | 		},
510 | 		{
511 | 			"Slashes-1",
512 | 			"http://test.example/foo/bar/.",
513 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
514 | 			"http://test.example/foo/bar/",
515 | 			false,
516 | 		},
517 | 		{
518 | 			"Slashes-2",
519 | 			"http://test.example/foo/bar/./",
520 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
521 | 			"http://test.example/foo/bar/",
522 | 			false,
523 | 		},
524 | 		{
525 | 			"Slashes-3",
526 | 			"http://test.example/foo/bar/..",
527 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
528 | 			"http://test.example/foo/",
529 | 			false,
530 | 		},
531 | 		{
532 | 			"Slashes-4",
533 | 			"http://test.example/foo/bar/../",
534 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
535 | 			"http://test.example/foo/",
536 | 			false,
537 | 		},
538 | 		{
539 | 			"Slashes-5",
540 | 			"http://test.example/foo/bar/../baz",
541 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
542 | 			"http://test.example/foo/baz",
543 | 			false,
544 | 		},
545 | 		{
546 | 			"Slashes-6",
547 | 			"http://test.example/foo/bar/../..",
548 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
549 | 			"http://test.example/",
550 | 			false,
551 | 		},
552 | 		{
553 | 			"Slashes-7",
554 | 			"http://test.example/foo/bar/../../",
555 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
556 | 			"http://test.example/",
557 | 			false,
558 | 		},
559 | 		{
560 | 			"Slashes-8",
561 | 			"http://test.example/foo/bar/../../baz",
562 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
563 | 			"http://test.example/baz",
564 | 			false,
565 | 		},
566 | 		{
567 | 			"Slashes-9",
568 | 			"http://test.example/foo/bar/../../../baz",
569 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
570 | 			"http://test.example/baz",
571 | 			false,
572 | 		},
573 | 		{
574 | 			"Slashes-10",
575 | 			"http://test.example/foo/bar/../../../../baz",
576 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
577 | 			"http://test.example/baz",
578 | 			false,
579 | 		},
580 | 		{
581 | 			"Slashes-11",
582 | 			"http://test.example/./foo",
583 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
584 | 			"http://test.example/foo",
585 | 			false,
586 | 		},
587 | 		{
588 | 			"Slashes-12",
589 | 			"http://test.example/../foo",
590 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
591 | 			"http://test.example/foo",
592 | 			false,
593 | 		},
594 | 		{
595 | 			"Slashes-13",
596 | 			"http://test.example/foo.",
597 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
598 | 			"http://test.example/foo.",
599 | 			false,
600 | 		},
601 | 		{
602 | 			"Slashes-14",
603 | 			"http://test.example/.foo",
604 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
605 | 			"http://test.example/.foo",
606 | 			false,
607 | 		},
608 | 		{
609 | 			"Slashes-15",
610 | 			"http://test.example/foo..",
611 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
612 | 			"http://test.example/foo..",
613 | 			false,
614 | 		},
615 | 		{
616 | 			"Slashes-16",
617 | 			"http://test.example/..foo",
618 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
619 | 			"http://test.example/..foo",
620 | 			false,
621 | 		},
622 | 		{
623 | 			"Slashes-17",
624 | 			"http://test.example/./../foo",
625 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
626 | 			"http://test.example/foo",
627 | 			false,
628 | 		},
629 | 		{
630 | 			"Slashes-18",
631 | 			"http://test.example/./foo/.",
632 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
633 | 			"http://test.example/foo/",
634 | 			false,
635 | 		},
636 | 		{
637 | 			"Slashes-19",
638 | 			"http://test.example/foo/./bar",
639 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
640 | 			"http://test.example/foo/bar",
641 | 			false,
642 | 		},
643 | 		{
644 | 			"Slashes-20",
645 | 			"http://test.example/foo/../bar",
646 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
647 | 			"http://test.example/bar",
648 | 			false,
649 | 		},
650 | 		{
651 | 			"Slashes-21",
652 | 			"http://test.example/foo//",
653 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
654 | 			"http://test.example/foo/",
655 | 			false,
656 | 		},
657 | 		{
658 | 			"Slashes-22",
659 | 			"http://test.example/foo///bar//",
660 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
661 | 			"http://test.example/foo/bar/",
662 | 			false,
663 | 		},
664 | 		{
665 | 			"Relative",
666 | 			"foo/bar",
667 | 			FlagsAllGreedy,
668 | 			"foo/bar",
669 | 			false,
670 | 		},
671 | 		{
672 | 			"Relative-1",
673 | 			"./../foo",
674 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
675 | 			"foo",
676 | 			false,
677 | 		},
678 | 		{
679 | 			"Relative-2",
680 | 			"./foo/bar/../baz/../bang/..",
681 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
682 | 			"foo/",
683 | 			false,
684 | 		},
685 | 		{
686 | 			"Relative-3",
687 | 			"foo///bar//",
688 | 			FlagsSafe | FlagRemoveDotSegments | FlagRemoveDuplicateSlashes,
689 | 			"foo/bar/",
690 | 			false,
691 | 		},
692 | 		{
693 | 			"Relative-4",
694 | 			"www.youtube.com",
695 | 			FlagsUsuallySafeGreedy,
696 | 			"www.youtube.com",
697 | 			false,
698 | 		},
699 | 		{
700 | 			"Issue-#24",
701 | 			"///foo///bar///",
702 | 			FlagRemoveDuplicateSlashes | FlagRemoveTrailingSlash,
703 | 			"/foo/bar",
704 | 			false,
705 | 		},
706 | 		/*&testCase{
707 | 			"UrlNorm-5",
708 | 			"http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3",
709 | 			FlagsSafe | FlagRemoveDotSegments,
710 | 			"http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3",
711 | 			false,
712 | 		},
713 | 		&testCase{
714 | 			"UrlNorm-1",
715 | 			"http://test.example/?a=%e3%82%82%26",
716 | 			FlagsAllGreedy,
717 | 			"http://test.example/?a=\xe3\x82\x82%26",
718 | 			false,
719 | 		},*/
720 | 	}
721 | )
722 | 
723 | func TestRunner(t *testing.T) {
724 | 	for _, tc := range cases {
725 | 		runCase(tc, t)
726 | 	}
727 | }
728 | 
729 | func runCase(tc *testCase, t *testing.T) {
730 | 	t.Logf("running %s...", tc.nm)
731 | 	if tc.parsed {
732 | 		u, e := url.Parse(tc.src)
733 | 		if e != nil {
734 | 			t.Errorf("%s - FAIL : %s", tc.nm, e)
735 | 			return
736 | 		} else {
737 | 			NormalizeURL(u, tc.flgs)
738 | 			if s := u.String(); s != tc.res {
739 | 				t.Errorf("%s - FAIL expected '%s', got '%s'", tc.nm, tc.res, s)
740 | 			}
741 | 		}
742 | 	} else {
743 | 		if s, e := NormalizeURLString(tc.src, tc.flgs); e != nil {
744 | 			t.Errorf("%s - FAIL : %s", tc.nm, e)
745 | 		} else if s != tc.res {
746 | 			t.Errorf("%s - FAIL expected '%s', got '%s'", tc.nm, tc.res, s)
747 | 		}
748 | 	}
749 | }
750 | 
751 | func TestDecodeUnnecessaryEscapesAll(t *testing.T) {
752 | 	var url = "http://host/"
753 | 
754 | 	for i := 0; i < 256; i++ {
755 | 		url += fmt.Sprintf("%%%02x", i)
756 | 	}
757 | 	s, err := NormalizeURLString(url, FlagDecodeUnnecessaryEscapes)
758 | 	if err != nil {
759 | 		t.Fatalf("parse error: %s", err)
760 | 	}
761 | 
762 | 	const want = "http://host/%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22%23$%25&'()*+,-./0123456789:;%3C=%3E%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF"
763 | 	if s != want {
764 | 		t.Errorf("DecodeUnnecessaryEscapesAll:\nwant\n%s\ngot\n%s", want, s)
765 | 	}
766 | }
767 | 
768 | func TestEncodeNecessaryEscapesAll(t *testing.T) {
769 | 	const base = "http://host/"
770 | 	var path []byte
771 | 
772 | 	for i := 0; i < 256; i++ {
773 | 		// Since go1.12, url.Parse fails if the raw URL contains ASCII control characters,
774 | 		// meaning anything < 0x20 and 0x7f (DEL), so do not add those bytes to the constructed url.
775 | 		// See https://github.com/PuerkitoBio/purell/issues/28
776 | 		if i != 0x25 && !unicode.IsControl(rune(i)) {
777 | 			path = append(path, byte(i))
778 | 		}
779 | 	}
780 | 	s, err := NormalizeURLString(base+string(path), FlagEncodeNecessaryEscapes)
781 | 	if err != nil {
782 | 		t.Fatalf("parse error: %s", err)
783 | 	}
784 | 
785 | 	const want = "http://host/%20!%22#$&'()*+,-./0123456789:;%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF"
786 | 	if s != want {
787 | 		t.Errorf("EncodeNecessaryEscapesAll:\nwant\n%s\ngot\n%s", want, s)
788 | 	}
789 | }
790 | 


--------------------------------------------------------------------------------
/urlesc.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2009 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // This file implements query escaping as per RFC 3986.
  6 | // It contains some parts of the net/url package, modified so as to allow
  7 | // some reserved characters incorrectly escaped by net/url.
  8 | // See https://github.com/golang/go/issues/5684
  9 | package purell
 10 | 
 11 | import (
 12 | 	"bytes"
 13 | 	"net/url"
 14 | 	"strings"
 15 | )
 16 | 
 17 | type encoding int // selects which URL component's escaping rules shouldEscape and escape apply
 18 | 
 19 | const (
 20 | 	encodePath encoding = 1 + iota // path rules (RFC 3986 §3.3); values start at 1, so the zero value matches no mode
 21 | 	encodeUserPassword // userinfo rules (§3.2.1)
 22 | 	encodeQueryComponent // query component rules (§3.4); escape writes ' ' as '+' in this mode
 23 | 	encodeFragment // fragment rules: only '#' among the reserved characters is escaped
 24 | )
 25 | 
 26 | // Return true if the specified character should be escaped when
 27 | // appearing in a URL string, according to RFC 3986.
 28 | func shouldEscape(c byte, mode encoding) bool {
 29 | 	// §2.3 Unreserved characters (alphanum)
 30 | 	if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' {
 31 | 		return false
 32 | 	}
 33 | 
 34 | 	switch c {
 35 | 	case '-', '.', '_', '~': // §2.3 Unreserved characters (mark)
 36 | 		return false
 37 | 
 38 | 	// §2.2 Reserved characters (reserved)
 39 | 	case ':', '/', '?', '#', '[', ']', '@', // gen-delims
 40 | 		'!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=': // sub-delims
 41 | 		// Different sections of the URL allow a few of
 42 | 		// the reserved characters to appear unescaped.
 43 | 		switch mode {
 44 | 		case encodePath: // §3.3
 45 | 			// The RFC allows sub-delims and : @.
 46 | 			// '/', '[' and ']' can be used to assign meaning to individual path
 47 | 			// segments.  This package only manipulates the path as a whole,
 48 | 			// so we allow those as well.  That leaves only ? and # to escape.
 49 | 			return c == '?' || c == '#'
 50 | 
 51 | 		case encodeUserPassword: // §3.2.1
 52 | 			// The RFC allows : and sub-delims in
 53 | 			// userinfo.  The parsing of userinfo treats ':' as special so we must escape
 54 | 			// all the gen-delims.
 55 | 			return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || c == ']' || c == '@'
 56 | 
 57 | 		case encodeQueryComponent: // §3.4
 58 | 			// The RFC allows / and ?.
 59 | 			return c != '/' && c != '?'
 60 | 
 61 | 		case encodeFragment: // §4.1
 62 | 			// The RFC text is silent but the grammar allows
 63 | 			// everything, so escape nothing but #
 64 | 			return c == '#'
 65 | 		}
 66 | 	}
 67 | 
 68 | 	// Everything else must be escaped.
 69 | 	return true
 70 | }
 71 | 
 72 | func escape(s string, mode encoding) string {
 73 | 	spaceCount, hexCount := 0, 0
 74 | 	for i := 0; i < len(s); i++ {
 75 | 		c := s[i]
 76 | 		if shouldEscape(c, mode) {
 77 | 			if c == ' ' && mode == encodeQueryComponent {
 78 | 				spaceCount++
 79 | 			} else {
 80 | 				hexCount++
 81 | 			}
 82 | 		}
 83 | 	}
 84 | 
 85 | 	if spaceCount == 0 && hexCount == 0 {
 86 | 		return s
 87 | 	}
 88 | 
 89 | 	t := make([]byte, len(s)+2*hexCount)
 90 | 	j := 0
 91 | 	for i := 0; i < len(s); i++ {
 92 | 		switch c := s[i]; {
 93 | 		case c == ' ' && mode == encodeQueryComponent:
 94 | 			t[j] = '+'
 95 | 			j++
 96 | 		case shouldEscape(c, mode):
 97 | 			t[j] = '%'
 98 | 			t[j+1] = "0123456789ABCDEF"[c>>4]
 99 | 			t[j+2] = "0123456789ABCDEF"[c&15]
100 | 			j += 3
101 | 		default:
102 | 			t[j] = s[i]
103 | 			j++
104 | 		}
105 | 	}
106 | 	return string(t)
107 | }
108 | 
// uiReplacer maps the uppercase percent-encoded forms of ! ' ( ) * back to
// the literal characters.
var uiReplacer = strings.NewReplacer(
	"%21", "!", "%27", "'", "%28", "(", "%29", ")", "%2A", "*",
)

// unescapeUserinfo undoes the percent-encoding of the characters ! ' ( ) *
// in s, which RFC 3986 does not require to be escaped. Only the uppercase
// hex spellings listed in uiReplacer are rewritten.
func unescapeUserinfo(s string) string {
	decoded := uiReplacer.Replace(s)
	return decoded
}
121 | 
122 | // Escape reassembles the URL into a valid URL string.
123 | // The general form of the result is one of:
124 | //
125 | //	scheme:opaque
126 | //	scheme://userinfo@host/path?query#fragment
127 | //
128 | // If u.Opaque is non-empty, String uses the first form;
129 | // otherwise it uses the second form.
130 | //
131 | // In the second form, the following rules apply:
132 | //	- if u.Scheme is empty, scheme: is omitted.
133 | //	- if u.User is nil, userinfo@ is omitted.
134 | //	- if u.Host is empty, host/ is omitted.
135 | //	- if u.Scheme and u.Host are empty and u.User is nil,
136 | //	   the entire scheme://userinfo@host/ is omitted.
137 | //	- if u.Host is non-empty and u.Path begins with a /,
138 | //	   the form host/path does not add its own /.
139 | //	- if u.RawQuery is empty, ?query is omitted.
140 | //	- if u.Fragment is empty, #fragment is omitted.
141 | func escapeURL(u *url.URL) string {
142 | 	var buf bytes.Buffer
143 | 	if u.Scheme != "" {
144 | 		buf.WriteString(u.Scheme)
145 | 		buf.WriteByte(':')
146 | 	}
147 | 	if u.Opaque != "" {
148 | 		buf.WriteString(u.Opaque)
149 | 	} else {
150 | 		if u.Scheme != "" || u.Host != "" || u.User != nil {
151 | 			buf.WriteString("//")
152 | 			if ui := u.User; ui != nil {
153 | 				buf.WriteString(unescapeUserinfo(ui.String()))
154 | 				buf.WriteByte('@')
155 | 			}
156 | 			if h := u.Host; h != "" {
157 | 				buf.WriteString(h)
158 | 			}
159 | 		}
160 | 		if u.Path != "" && u.Path[0] != '/' && u.Host != "" {
161 | 			buf.WriteByte('/')
162 | 		}
163 | 		buf.WriteString(escape(u.Path, encodePath))
164 | 	}
165 | 	if u.RawQuery != "" {
166 | 		buf.WriteByte('?')
167 | 		buf.WriteString(u.RawQuery)
168 | 	}
169 | 	if u.Fragment != "" {
170 | 		buf.WriteByte('#')
171 | 		buf.WriteString(escape(u.Fragment, encodeFragment))
172 | 	}
173 | 	return buf.String()
174 | }
175 | 


--------------------------------------------------------------------------------
/urlesc_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2009 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package purell
  6 | 
  7 | import (
  8 | 	"net/url"
  9 | 	"testing"
 10 | )
 11 | 
 12 | type URLTest struct { // one entry of the urltests table
 13 | 	in        string // raw URL text of the case
 14 | 	out       *url.URL // presumably the URL value expected for in; confirm against the test that consumes urltests
 15 | 	roundtrip string // expected result of reserializing the URL; empty means same as "in".
 16 | }
 17 | 
 18 | var urltests = []URLTest{ // parse/round-trip cases run by TestURLString through DoTestString
 19 | 	// no path
 20 | 	{
 21 | 		"http://www.google.com",
 22 | 		&url.URL{
 23 | 			Scheme: "http",
 24 | 			Host:   "www.google.com",
 25 | 		},
 26 | 		"",
 27 | 	},
 28 | 	// path
 29 | 	{
 30 | 		"http://www.google.com/",
 31 | 		&url.URL{
 32 | 			Scheme: "http",
 33 | 			Host:   "www.google.com",
 34 | 			Path:   "/",
 35 | 		},
 36 | 		"",
 37 | 	},
 38 | 	// path with hex escaping
 39 | 	{
 40 | 		"http://www.google.com/file%20one%26two",
 41 | 		&url.URL{
 42 | 			Scheme: "http",
 43 | 			Host:   "www.google.com",
 44 | 			Path:   "/file one&two",
 45 | 		},
 46 | 		"http://www.google.com/file%20one&two",
 47 | 	},
 48 | 	// user
 49 | 	{
 50 | 		"ftp://webmaster@www.google.com/",
 51 | 		&url.URL{
 52 | 			Scheme: "ftp",
 53 | 			User:   url.User("webmaster"),
 54 | 			Host:   "www.google.com",
 55 | 			Path:   "/",
 56 | 		},
 57 | 		"",
 58 | 	},
 59 | 	// escape sequence in username
 60 | 	{
 61 | 		"ftp://john%20doe@www.google.com/",
 62 | 		&url.URL{
 63 | 			Scheme: "ftp",
 64 | 			User:   url.User("john doe"),
 65 | 			Host:   "www.google.com",
 66 | 			Path:   "/",
 67 | 		},
 68 | 		"ftp://john%20doe@www.google.com/",
 69 | 	},
 70 | 	// query
 71 | 	{
 72 | 		"http://www.google.com/?q=go+language",
 73 | 		&url.URL{
 74 | 			Scheme:   "http",
 75 | 			Host:     "www.google.com",
 76 | 			Path:     "/",
 77 | 			RawQuery: "q=go+language",
 78 | 		},
 79 | 		"",
 80 | 	},
 81 | 	// query with hex escaping: NOT parsed
 82 | 	{
 83 | 		"http://www.google.com/?q=go%20language",
 84 | 		&url.URL{
 85 | 			Scheme:   "http",
 86 | 			Host:     "www.google.com",
 87 | 			Path:     "/",
 88 | 			RawQuery: "q=go%20language",
 89 | 		},
 90 | 		"",
 91 | 	},
 92 | 	// %20 outside query
 93 | 	{
 94 | 		"http://www.google.com/a%20b?q=c+d",
 95 | 		&url.URL{
 96 | 			Scheme:   "http",
 97 | 			Host:     "www.google.com",
 98 | 			Path:     "/a b",
 99 | 			RawQuery: "q=c+d",
100 | 		},
101 | 		"",
102 | 	},
103 | 	// path without leading /, so no parsing
104 | 	{
105 | 		"http:www.google.com/?q=go+language",
106 | 		&url.URL{
107 | 			Scheme:   "http",
108 | 			Opaque:   "www.google.com/",
109 | 			RawQuery: "q=go+language",
110 | 		},
111 | 		"http:www.google.com/?q=go+language",
112 | 	},
113 | 	// percent-escaped slashes after the scheme: no leading /, so no parsing
114 | 	{
115 | 		"http:%2f%2fwww.google.com/?q=go+language",
116 | 		&url.URL{
117 | 			Scheme:   "http",
118 | 			Opaque:   "%2f%2fwww.google.com/",
119 | 			RawQuery: "q=go+language",
120 | 		},
121 | 		"http:%2f%2fwww.google.com/?q=go+language",
122 | 	},
123 | 	// non-authority with path
124 | 	{
125 | 		"mailto:/webmaster@golang.org",
126 | 		&url.URL{
127 | 			Scheme: "mailto",
128 | 			Path:   "/webmaster@golang.org",
129 | 		},
130 | 		"mailto:///webmaster@golang.org", // unfortunate compromise
131 | 	},
132 | 	// non-authority
133 | 	{
134 | 		"mailto:webmaster@golang.org",
135 | 		&url.URL{
136 | 			Scheme: "mailto",
137 | 			Opaque: "webmaster@golang.org",
138 | 		},
139 | 		"",
140 | 	},
141 | 	// unescaped :// in query should not create a scheme
142 | 	{
143 | 		"/foo?query=http://bad",
144 | 		&url.URL{
145 | 			Path:     "/foo",
146 | 			RawQuery: "query=http://bad",
147 | 		},
148 | 		"",
149 | 	},
150 | 	// leading // without scheme should create an authority
151 | 	{
152 | 		"//foo",
153 | 		&url.URL{
154 | 			Host: "foo",
155 | 		},
156 | 		"",
157 | 	},
158 | 	// leading // without scheme, with userinfo, path, and query
159 | 	{
160 | 		"//user@foo/path?a=b",
161 | 		&url.URL{
162 | 			User:     url.User("user"),
163 | 			Host:     "foo",
164 | 			Path:     "/path",
165 | 			RawQuery: "a=b",
166 | 		},
167 | 		"",
168 | 	},
169 | 	// Three leading slashes isn't an authority, but doesn't return an error.
170 | 	// (We can't return an error, as this code is also used via
171 | 	// ServeHTTP -> ReadRequest -> Parse, which is arguably a
172 | 	// different URL parsing context, but currently shares the
173 | 	// same codepath)
174 | 	{
175 | 		"///threeslashes",
176 | 		&url.URL{
177 | 			Path: "///threeslashes",
178 | 		},
179 | 		"",
180 | 	},
181 | 	{ // username and password
182 | 		"http://user:password@google.com",
183 | 		&url.URL{
184 | 			Scheme: "http",
185 | 			User:   url.UserPassword("user", "password"),
186 | 			Host:   "google.com",
187 | 		},
188 | 		"http://user:password@google.com",
189 | 	},
190 | 	// unescaped @ in username should not confuse host
191 | 	{
192 | 		"http://j@ne:password@google.com",
193 | 		&url.URL{
194 | 			Scheme: "http",
195 | 			User:   url.UserPassword("j@ne", "password"),
196 | 			Host:   "google.com",
197 | 		},
198 | 		"http://j%40ne:password@google.com",
199 | 	},
200 | 	// unescaped @ in password should not confuse host
201 | 	{
202 | 		"http://jane:p@ssword@google.com",
203 | 		&url.URL{
204 | 			Scheme: "http",
205 | 			User:   url.UserPassword("jane", "p@ssword"),
206 | 			Host:   "google.com",
207 | 		},
208 | 		"http://jane:p%40ssword@google.com",
209 | 	},
210 | 	{ // unescaped @ in username, path and query
211 | 		"http://j@ne:password@google.com/p@th?q=@go",
212 | 		&url.URL{
213 | 			Scheme:   "http",
214 | 			User:     url.UserPassword("j@ne", "password"),
215 | 			Host:     "google.com",
216 | 			Path:     "/p@th",
217 | 			RawQuery: "q=@go",
218 | 		},
219 | 		"http://j%40ne:password@google.com/p@th?q=@go",
220 | 	},
221 | 	{ // fragment
222 | 		"http://www.google.com/?q=go+language#foo",
223 | 		&url.URL{
224 | 			Scheme:   "http",
225 | 			Host:     "www.google.com",
226 | 			Path:     "/",
227 | 			RawQuery: "q=go+language",
228 | 			Fragment: "foo",
229 | 		},
230 | 		"",
231 | 	},
232 | 	{ // fragment with hex escaping: & is expected back unescaped
233 | 		"http://www.google.com/?q=go+language#foo%26bar",
234 | 		&url.URL{
235 | 			Scheme:   "http",
236 | 			Host:     "www.google.com",
237 | 			Path:     "/",
238 | 			RawQuery: "q=go+language",
239 | 			Fragment: "foo&bar",
240 | 		},
241 | 		"http://www.google.com/?q=go+language#foo&bar",
242 | 	},
243 | 	{ // file scheme with empty host
244 | 		"file:///home/adg/rabbits",
245 | 		&url.URL{
246 | 			Scheme: "file",
247 | 			Host:   "",
248 | 			Path:   "/home/adg/rabbits",
249 | 		},
250 | 		"file:///home/adg/rabbits",
251 | 	},
252 | 	// "Windows" paths are no exception to the rule.
253 | 	// See golang.org/issue/6027, especially comment #9.
254 | 	{
255 | 		"file:///C:/FooBar/Baz.txt",
256 | 		&url.URL{
257 | 			Scheme: "file",
258 | 			Host:   "",
259 | 			Path:   "/C:/FooBar/Baz.txt",
260 | 		},
261 | 		"file:///C:/FooBar/Baz.txt",
262 | 	},
263 | 	// case-insensitive scheme
264 | 	{
265 | 		"MaIlTo:webmaster@golang.org",
266 | 		&url.URL{
267 | 			Scheme: "mailto",
268 | 			Opaque: "webmaster@golang.org",
269 | 		},
270 | 		"mailto:webmaster@golang.org",
271 | 	},
272 | 	// Relative path
273 | 	{
274 | 		"a/b/c",
275 | 		&url.URL{
276 | 			Path: "a/b/c",
277 | 		},
278 | 		"a/b/c",
279 | 	},
280 | 	// escaped '?' in username and password
281 | 	{
282 | 		"http://%3Fam:pa%3Fsword@google.com",
283 | 		&url.URL{
284 | 			Scheme: "http",
285 | 			User:   url.UserPassword("?am", "pa?sword"),
286 | 			Host:   "google.com",
287 | 		},
288 | 		"",
289 | 	},
290 | 	// escaped '?' and '#' in path
291 | 	{
292 | 		"http://example.com/%3F%23",
293 | 		&url.URL{
294 | 			Scheme: "http",
295 | 			Host:   "example.com",
296 | 			Path:   "?#",
297 | 		},
298 | 		"",
299 | 	},
300 | 	// unescaped [ ] ! ' ( ) * in path
301 | 	{
302 | 		"http://example.com/[]!'()*",
303 | 		&url.URL{
304 | 			Scheme: "http",
305 | 			Host:   "example.com",
306 | 			Path:   "[]!'()*",
307 | 		},
308 | 		"http://example.com/[]!'()*",
309 | 	},
310 | 	// escaped : / ? # [ ] @ in username and password
311 | 	{
312 | 		"http://%3A%2F%3F:%23%5B%5D%40@example.com",
313 | 		&url.URL{
314 | 			Scheme: "http",
315 | 			User:   url.UserPassword(":/?", "#[]@"),
316 | 			Host:   "example.com",
317 | 		},
318 | 		"",
319 | 	},
320 | 	// unescaped ! $ & ' ( ) * + , ; = in username and password
321 | 	{
322 | 		"http://!$&'():*+,;=@example.com",
323 | 		&url.URL{
324 | 			Scheme: "http",
325 | 			User:   url.UserPassword("!$&'()", "*+,;="),
326 | 			Host:   "example.com",
327 | 		},
328 | 		"",
329 | 	},
330 | 	// unescaped = : / . ? = in query component
331 | 	{
332 | 		"http://example.com/?q=http://google.com/?q=",
333 | 		&url.URL{
334 | 			Scheme:   "http",
335 | 			Host:     "example.com",
336 | 			Path:     "/",
337 | 			RawQuery: "q=http://google.com/?q=",
338 | 		},
339 | 		"",
340 | 	},
341 | 	// unescaped : / ? [ ] @ ! $ & ' ( ) * + , ; = in fragment
342 | 	{
343 | 		"http://example.com/#:/?%23[]@!$&'()*+,;=",
344 | 		&url.URL{
345 | 			Scheme:   "http",
346 | 			Host:     "example.com",
347 | 			Path:     "/",
348 | 			Fragment: ":/?#[]@!$&'()*+,;=",
349 | 		},
350 | 		"",
351 | 	},
352 | }
353 | 
354 | func DoTestString(t *testing.T, parse func(string) (*url.URL, error), name string, tests []URLTest) {
355 | 	// Parse each input, re-serialize it with escapeURL, and compare with roundtrip (or the input when roundtrip is empty).
356 | 	for _, tc := range tests {
357 | 		parsed, err := parse(tc.in)
358 | 		if err != nil {
359 | 			t.Errorf("%s(%q) returned error %s", name, tc.in, err)
360 | 			continue
361 | 		}
362 | 		want := tc.roundtrip
363 | 		if want == "" {
364 | 			want = tc.in
365 | 		}
366 | 		if got := escapeURL(parsed); got != want {
367 | 			t.Errorf("Escape(%s(%q)) == %q (expected %q)", name, tc.in, got, want)
368 | 		}
369 | 	}
370 | }
371 | 
372 | func TestURLString(t *testing.T) {
373 | 	DoTestString(t, url.Parse, "Parse", urltests) // parse and re-serialize every table case
374 | 
375 | 	// A hand-built URL whose path has no leading slash must get one
376 | 	// inserted between host and path by escapeURL.
377 | 	noslash := URLTest{
378 | 		"http://www.google.com/search",
379 | 		&url.URL{
380 | 			Scheme: "http",
381 | 			Host:   "www.google.com",
382 | 			Path:   "search",
383 | 		},
384 | 		"",
385 | 	}
386 | 	s := escapeURL(noslash.out)
387 | 	if s != noslash.in {
388 | 		t.Errorf("Expected %s; got %s", noslash.in, s)
389 | 	}
390 | }
391 | 
392 | var resolveReferenceTests = []struct { // base + rel -> expected escapeURL output of the resolved URL
393 | 	base, rel, expected string
394 | }{
395 | 	// Absolute URL references
396 | 	{"http://foo.com?a=b", "https://bar.com/", "https://bar.com/"},
397 | 	{"http://foo.com/", "https://bar.com/?a=b", "https://bar.com/?a=b"},
398 | 	{"http://foo.com/bar", "mailto:foo@example.com", "mailto:foo@example.com"},
399 | 
400 | 	// Path-absolute references
401 | 	{"http://foo.com/bar", "/baz", "http://foo.com/baz"},
402 | 	{"http://foo.com/bar?a=b#f", "/baz", "http://foo.com/baz"},
403 | 	{"http://foo.com/bar?a=b", "/baz?c=d", "http://foo.com/baz?c=d"},
404 | 
405 | 	// Scheme-relative
406 | 	{"https://foo.com/bar?a=b", "//bar.com/quux", "https://bar.com/quux"},
407 | 
408 | 	// Path-relative references:
409 | 
410 | 	// ... current directory
411 | 	{"http://foo.com", ".", "http://foo.com/"},
412 | 	{"http://foo.com/bar", ".", "http://foo.com/"},
413 | 	{"http://foo.com/bar/", ".", "http://foo.com/bar/"},
414 | 
415 | 	// ... going down
416 | 	{"http://foo.com", "bar", "http://foo.com/bar"},
417 | 	{"http://foo.com/", "bar", "http://foo.com/bar"},
418 | 	{"http://foo.com/bar/baz", "quux", "http://foo.com/bar/quux"},
419 | 
420 | 	// ... going up
421 | 	{"http://foo.com/bar/baz", "../quux", "http://foo.com/quux"},
422 | 	{"http://foo.com/bar/baz", "../../../../../quux", "http://foo.com/quux"},
423 | 	{"http://foo.com/bar", "..", "http://foo.com/"},
424 | 	{"http://foo.com/bar/baz", "./..", "http://foo.com/"},
425 | 	// ".." in the middle (issue 3560)
426 | 	{"http://foo.com/bar/baz", "quux/dotdot/../tail", "http://foo.com/bar/quux/tail"},
427 | 	{"http://foo.com/bar/baz", "quux/./dotdot/../tail", "http://foo.com/bar/quux/tail"},
428 | 	{"http://foo.com/bar/baz", "quux/./dotdot/.././tail", "http://foo.com/bar/quux/tail"},
429 | 	{"http://foo.com/bar/baz", "quux/./dotdot/./../tail", "http://foo.com/bar/quux/tail"},
430 | 	{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/././../../tail", "http://foo.com/bar/quux/tail"},
431 | 	{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/./.././../tail", "http://foo.com/bar/quux/tail"},
432 | 	{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/dotdot/./../../.././././tail", "http://foo.com/bar/quux/tail"},
433 | 	{"http://foo.com/bar/baz", "quux/./dotdot/../dotdot/../dot/./tail/..", "http://foo.com/bar/quux/dot/"},
434 | 
435 | 	// Remove any dot-segments prior to forming the target URI.
436 | 	// http://tools.ietf.org/html/rfc3986#section-5.2.4
437 | 	{"http://foo.com/dot/./dotdot/../foo/bar", "../baz", "http://foo.com/dot/baz"},
438 | 
439 | 	// Triple dot isn't special
440 | 	{"http://foo.com/bar", "...", "http://foo.com/..."},
441 | 
442 | 	// Fragment
443 | 	{"http://foo.com/bar", ".#frag", "http://foo.com/#frag"},
444 | 
445 | 	// RFC 3986: Normal Examples
446 | 	// http://tools.ietf.org/html/rfc3986#section-5.4.1
447 | 	{"http://a/b/c/d;p?q", "g:h", "g:h"},
448 | 	{"http://a/b/c/d;p?q", "g", "http://a/b/c/g"},
449 | 	{"http://a/b/c/d;p?q", "./g", "http://a/b/c/g"},
450 | 	{"http://a/b/c/d;p?q", "g/", "http://a/b/c/g/"},
451 | 	{"http://a/b/c/d;p?q", "/g", "http://a/g"},
452 | 	{"http://a/b/c/d;p?q", "//g", "http://g"},
453 | 	{"http://a/b/c/d;p?q", "?y", "http://a/b/c/d;p?y"},
454 | 	{"http://a/b/c/d;p?q", "g?y", "http://a/b/c/g?y"},
455 | 	{"http://a/b/c/d;p?q", "#s", "http://a/b/c/d;p?q#s"},
456 | 	{"http://a/b/c/d;p?q", "g#s", "http://a/b/c/g#s"},
457 | 	{"http://a/b/c/d;p?q", "g?y#s", "http://a/b/c/g?y#s"},
458 | 	{"http://a/b/c/d;p?q", ";x", "http://a/b/c/;x"},
459 | 	{"http://a/b/c/d;p?q", "g;x", "http://a/b/c/g;x"},
460 | 	{"http://a/b/c/d;p?q", "g;x?y#s", "http://a/b/c/g;x?y#s"},
461 | 	{"http://a/b/c/d;p?q", "", "http://a/b/c/d;p?q"},
462 | 	{"http://a/b/c/d;p?q", ".", "http://a/b/c/"},
463 | 	{"http://a/b/c/d;p?q", "./", "http://a/b/c/"},
464 | 	{"http://a/b/c/d;p?q", "..", "http://a/b/"},
465 | 	{"http://a/b/c/d;p?q", "../", "http://a/b/"},
466 | 	{"http://a/b/c/d;p?q", "../g", "http://a/b/g"},
467 | 	{"http://a/b/c/d;p?q", "../..", "http://a/"},
468 | 	{"http://a/b/c/d;p?q", "../../", "http://a/"},
469 | 	{"http://a/b/c/d;p?q", "../../g", "http://a/g"},
470 | 
471 | 	// RFC 3986: Abnormal Examples
472 | 	// http://tools.ietf.org/html/rfc3986#section-5.4.2
473 | 	{"http://a/b/c/d;p?q", "../../../g", "http://a/g"},
474 | 	{"http://a/b/c/d;p?q", "../../../../g", "http://a/g"},
475 | 	{"http://a/b/c/d;p?q", "/./g", "http://a/g"},
476 | 	{"http://a/b/c/d;p?q", "/../g", "http://a/g"},
477 | 	{"http://a/b/c/d;p?q", "g.", "http://a/b/c/g."},
478 | 	{"http://a/b/c/d;p?q", ".g", "http://a/b/c/.g"},
479 | 	{"http://a/b/c/d;p?q", "g..", "http://a/b/c/g.."},
480 | 	{"http://a/b/c/d;p?q", "..g", "http://a/b/c/..g"},
481 | 	{"http://a/b/c/d;p?q", "./../g", "http://a/b/g"},
482 | 	{"http://a/b/c/d;p?q", "./g/.", "http://a/b/c/g/"},
483 | 	{"http://a/b/c/d;p?q", "g/./h", "http://a/b/c/g/h"},
484 | 	{"http://a/b/c/d;p?q", "g/../h", "http://a/b/c/h"},
485 | 	{"http://a/b/c/d;p?q", "g;x=1/./y", "http://a/b/c/g;x=1/y"},
486 | 	{"http://a/b/c/d;p?q", "g;x=1/../y", "http://a/b/c/y"},
487 | 	{"http://a/b/c/d;p?q", "g?y/./x", "http://a/b/c/g?y/./x"},
488 | 	{"http://a/b/c/d;p?q", "g?y/../x", "http://a/b/c/g?y/../x"},
489 | 	{"http://a/b/c/d;p?q", "g#s/./x", "http://a/b/c/g#s/./x"},
490 | 	{"http://a/b/c/d;p?q", "g#s/../x", "http://a/b/c/g#s/../x"},
491 | 
492 | 	// Extras.
493 | 	{"https://a/b/c/d;p?q", "//g?q", "https://g?q"},
494 | 	{"https://a/b/c/d;p?q", "//g#s", "https://g#s"},
495 | 	{"https://a/b/c/d;p?q", "//g/d/e/f?y#s", "https://g/d/e/f?y#s"},
496 | 	{"https://a/b/c/d;p#s", "?y", "https://a/b/c/d;p?y"},
497 | 	{"https://a/b/c/d;p?q#s", "?y", "https://a/b/c/d;p?y"},
498 | }
499 | 
500 | func TestResolveReference(t *testing.T) {
501 | 	// Each case is resolved with URL.ResolveReference and with the URL.Parse wrapper, then serialized by escapeURL.
502 | 	mustParse := func(raw string) *url.URL {
503 | 		u, err := url.Parse(raw)
504 | 		if err != nil {
505 | 			t.Fatalf("Expected URL to parse: %q, got error: %v", raw, err)
506 | 		}
507 | 		return u
508 | 	}
509 | 	opaque := &url.URL{Scheme: "scheme", Opaque: "opaque"}
510 | 	for _, test := range resolveReferenceTests {
511 | 		base, rel := mustParse(test.base), mustParse(test.rel)
512 | 		got := base.ResolveReference(rel) // named got so the net/url package is not shadowed
513 | 		if escapeURL(got) != test.expected {
514 | 			t.Errorf("URL(%q).ResolveReference(%q) == %q, got %q", test.base, test.rel, test.expected, escapeURL(got))
515 | 		}
516 | 		// Ensure that new instances are returned.
517 | 		if base == got {
518 | 			t.Errorf("Expected URL.ResolveReference to return new URL instance.")
519 | 		}
520 | 		// Test the convenience wrapper too.
521 | 		got, err := base.Parse(test.rel)
522 | 		if err != nil {
523 | 			t.Errorf("URL(%q).Parse(%q) failed: %v", test.base, test.rel, err)
524 | 		} else if escapeURL(got) != test.expected {
525 | 			t.Errorf("URL(%q).Parse(%q) == %q, got %q", test.base, test.rel, test.expected, escapeURL(got))
526 | 		} else if base == got {
527 | 			// Ensure that new instances are returned for the wrapper too.
528 | 			t.Errorf("Expected URL.Parse to return new URL instance.")
529 | 		}
530 | 		// Ensure Opaque resets the URL.
531 | 		got = base.ResolveReference(opaque)
532 | 		if *got != *opaque {
533 | 			t.Errorf("ResolveReference failed to resolve opaque URL: want %#v, got %#v", opaque, got)
534 | 		}
535 | 		// Test the convenience wrapper with an opaque URL too.
536 | 		got, err = base.Parse("scheme:opaque")
537 | 		if err != nil {
538 | 			t.Errorf(`URL(%q).Parse("scheme:opaque") failed: %v`, test.base, err)
539 | 		} else if *got != *opaque {
540 | 			t.Errorf("Parse failed to resolve opaque URL: want %#v, got %#v", opaque, got)
541 | 		} else if base == got {
542 | 			// Ensure that new instances are returned, again.
543 | 			t.Errorf("Expected URL.Parse to return new URL instance.")
544 | 		}
545 | 	}
546 | }
547 | 
548 | type shouldEscapeTest struct { // one expectation for shouldEscape
549 | 	in     byte     // byte passed to shouldEscape
550 | 	mode   encoding // encoding mode passed to shouldEscape
551 | 	escape bool     // result shouldEscape is expected to return
552 | }
553 | 
554 | var shouldEscapeTests = []shouldEscapeTest{ // expectations checked by TestShouldEscape
555 | 	// Unreserved characters (§2.3)
556 | 	{'a', encodePath, false},
557 | 	{'a', encodeUserPassword, false},
558 | 	{'a', encodeQueryComponent, false},
559 | 	{'a', encodeFragment, false},
560 | 	{'z', encodePath, false},
561 | 	{'A', encodePath, false},
562 | 	{'Z', encodePath, false},
563 | 	{'0', encodePath, false},
564 | 	{'9', encodePath, false},
565 | 	{'-', encodePath, false},
566 | 	{'-', encodeUserPassword, false},
567 | 	{'-', encodeQueryComponent, false},
568 | 	{'-', encodeFragment, false},
569 | 	{'.', encodePath, false},
570 | 	{'_', encodePath, false},
571 | 	{'~', encodePath, false},
572 | 
573 | 	// User information (§3.2.1)
574 | 	{':', encodeUserPassword, true},
575 | 	{'/', encodeUserPassword, true},
576 | 	{'?', encodeUserPassword, true},
577 | 	{'@', encodeUserPassword, true},
578 | 	{'$', encodeUserPassword, false},
579 | 	{'&', encodeUserPassword, false},
580 | 	{'+', encodeUserPassword, false},
581 | 	{',', encodeUserPassword, false},
582 | 	{';', encodeUserPassword, false},
583 | 	{'=', encodeUserPassword, false},
584 | }
585 | 
586 | func TestShouldEscape(t *testing.T) {
587 | 	for _, c := range shouldEscapeTests { // every (byte, mode) pair must yield its expected flag
588 | 		if want := c.escape; shouldEscape(c.in, c.mode) != want {
589 | 			t.Errorf("shouldEscape(%q, %v) returned %v; expected %v", c.in, c.mode, !want, want)
590 | 		}
591 | 	}
592 | }
593 | 


--------------------------------------------------------------------------------
/urlnorm_test.go:
--------------------------------------------------------------------------------
 1 | package purell
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | // Test cases merged from PR #1
 8 | // Originally from https://github.com/jehiah/urlnorm/blob/master/test_urlnorm.py
 9 | 
10 | func assertMap(t *testing.T, cases map[string]string, f NormalizationFlags) {
11 | 	// Keys are the raw inputs; values are the strings NormalizeURLString must return under f.
12 | 	for input, want := range cases {
13 | 		got, err := NormalizeURLString(input, f)
14 | 		switch {
15 | 		case err != nil:
16 | 			t.Errorf("%s normalizing %v to %v", err.Error(), input, want)
17 | 		case got != want:
18 | 			t.Errorf("source: %v expected: %v got: %v", input, want, got)
19 | 		}
20 | 	}
21 | }
22 | 
23 | // This tests normalization to a unicode representation:
24 | // percent escapes for unreserved values are unescaped to their unicode value,
25 | // tests normalization to IDNA domains,
26 | // tests IP word handling, IPv6 address handling, and trailing domain periods.
27 | // In general, this matches Google Chrome's unescaping for things in the address bar.
28 | // Spaces are converted to '+' (perhaps controversial).
29 | // http://code.google.com/p/google-url/ probably is another good reference for this approach
30 | func TestUrlnorm(t *testing.T) {
31 | 	testcases := map[string]string{ // input -> expected output; commented-out entries are unicode-form expectations that are not asserted
32 | 		"http://test.example/?a=%e3%82%82%26": "http://test.example/?a=%e3%82%82%26",
33 | 		//"http://test.example/?a=%e3%82%82%26": "http://test.example/?a=\xe3\x82\x82%26", //should return a unicode character
34 | 		"http://s.xn--q-bga.DE/":    "http://s.xn--q-bga.de/",       //should be in idna format
35 | 		"http://XBLA\u306eXbox.com": "http://xn--xblaxbox-jf4g.com", //test utf8 and unicode
36 | 		"http://президент.рф":       "http://xn--d1abbgf6aiiy.xn--p1ai",
37 | 		"http://ПРЕЗИДЕНТ.РФ":       "http://xn--d1abbgf6aiiy.xn--p1ai",
38 | 		"http://ａｂ￥ヲ￦○.com":         "http://xn--ab-ida8983azmfnvs.com", //test width folding
39 | 		"http://\u00e9.com":         "http://xn--9ca.com",
40 | 		"http://e\u0301.com":        "http://xn--9ca.com",
41 | 		"http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3": "http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3",
42 | 		//"http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3": "http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3",
43 | 
44 | 		"http://test.example/\xe3\x82\xad": "http://test.example/%E3%82%AD",
45 | 		//"http://test.example/\xe3\x82\xad":              "http://test.example/\xe3\x82\xad",
46 | 		"http://test.example/?p=%23val#test-%23-val%25": "http://test.example/?p=%23val#test-%23-val%25", //check that %23 (#) is not escaped where it shouldn't be
47 | 
48 | 		"http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n": "http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n",
49 | 		//"http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n": "http://test.domain/I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xef\xbf\xbdliz\xc3\xa6ti\xc3\xb8n",
50 | 	}
51 | 
52 | 	assertMap(t, testcases, FlagsSafe|FlagRemoveDotSegments)
53 | }
54 | 


--------------------------------------------------------------------------------