├── go.mod ├── .travis.yml ├── go.sum ├── LICENSE ├── README.md ├── urlx.go └── urlx_test.go /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/goware/urlx 2 | 3 | require ( 4 | github.com/PuerkitoBio/purell v1.1.1 5 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect 6 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd 7 | golang.org/x/text v0.3.0 // indirect 8 | ) 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.11.x 5 | - tip 6 | 7 | env: 8 | global: 9 | - GO111MODULE=on 10 | 11 | before_install: 12 | - curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $GOPATH/bin latest 13 | 14 | install: 15 | - go mod download 16 | 17 | script: 18 | - go mod tidy && git diff --exit-code; code=$?; git checkout -- .; (exit $code) 19 | - go test -race -cover -coverprofile=coverage.txt -covermode=atomic ./... 
20 | - golangci-lint run 21 | 22 | after_success: 23 | - bash <(curl -s https://codecov.io/bash) 24 | 25 | matrix: 26 | fast_finish: true 27 | allow_failures: 28 | - go: tip 29 | 30 | notifications: 31 | email: false 32 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI= 2 | github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= 3 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= 4 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= 5 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd h1:HuTn7WObtcDo9uEEU7rEqL0jYthdXAmZ6PP+meazmaU= 6 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 7 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= 8 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2014 Pressly Inc. 
www.pressly.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # URLx 2 | [Golang](http://golang.org/) pkg for URL parsing and normalization. 3 | 4 | 1. [Parsing URL](#parsing-url) ([GoDoc](https://godoc.org/github.com/goware/urlx#Parse)) 5 | 2. [Normalizing URL](#normalizing-url) ([GoDoc](https://godoc.org/github.com/goware/urlx#Normalize)) 6 | 3. [Splitting host:port from URL](#splitting-hostport-from-url) ([GoDoc](https://godoc.org/github.com/goware/urlx#SplitHostPort)) 7 | 4. 
[Resolving IP address from URL](#resolving-ip-address-from-url) ([GoDoc](https://godoc.org/github.com/goware/urlx#Resolve)) 8 | 9 | [![GoDoc](https://godoc.org/github.com/goware/urlx?status.png)](https://godoc.org/github.com/goware/urlx) 10 | [![Travis](https://travis-ci.org/goware/urlx.svg?branch=master)](https://travis-ci.org/goware/urlx) 11 | 12 | ## Parsing URL 13 | 14 | The [urlx.Parse()](https://godoc.org/github.com/goware/urlx#Parse) is compatible with the same function from [net/url](https://golang.org/pkg/net/url/#Parse) pkg, but has slightly different behavior. It enforces default scheme and favors absolute URLs over relative paths. 15 | 16 | ### Difference between [urlx](https://godoc.org/github.com/goware/urlx#Parse) and [net/url](https://golang.org/pkg/net/url/#Parse) 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 37 | 48 | 49 | 50 | 62 | 74 | 75 | 76 | 88 | 100 | 101 |
github.com/goware/urlx | net/url
27 |
 28 | urlx.Parse("example.com")
 29 | 
 30 | &url.URL{
 31 |    Scheme:  "http",
 32 |    Host:    "example.com",
 33 |    Path:    "",
 34 | }
 35 | 
36 |
38 |
 39 | url.Parse("example.com")
 40 | 
 41 | &url.URL{
 42 |    Scheme:  "",
 43 |    Host:    "",
 44 |    Path:    "example.com",
 45 | }
 46 | 
47 |
51 |
 52 | urlx.Parse("localhost:8080")
 53 | 
 54 | &url.URL{
 55 |    Scheme:  "http",
 56 |    Host:    "localhost:8080",
 57 |    Path:    "",
 58 |    Opaque:  "",
 59 | }
 60 | 
61 |
63 |
 64 | url.Parse("localhost:8080")
 65 | 
 66 | &url.URL{
 67 |    Scheme:  "localhost",
 68 |    Host:    "",
 69 |    Path:    "",
 70 |    Opaque:  "8080",
 71 | }
 72 | 
73 |
77 |
 78 | urlx.Parse("user.local:8000/path")
 79 | 
 80 | &url.URL{
 81 |    Scheme:  "http",
 82 |    Host:    "user.local:8000",
 83 |    Path:    "/path",
 84 |    Opaque:  "",
 85 | }
 86 | 
87 |
89 |
 90 | url.Parse("user.local:8000/path")
 91 | 
 92 | &url.URL{
 93 |    Scheme:  "user.local",
 94 |    Host:    "",
 95 |    Path:    "",
 96 |    Opaque:  "8000/path",
 97 | }
 98 | 
99 |
102 | 103 | ### Usage 104 | 105 | ```go 106 | import "github.com/goware/urlx" 107 | 108 | func main() { 109 | url, _ := urlx.Parse("example.com") 110 | // url.Scheme == "http" 111 | // url.Host == "example.com" 112 | 113 | fmt.Print(url) 114 | // Prints http://example.com 115 | } 116 | ``` 117 | 118 | ## Normalizing URL 119 | 120 | The [urlx.Normalize()](https://godoc.org/github.com/goware/urlx#Normalize) function normalizes the URL using the predefined subset of [Purell](https://github.com/PuerkitoBio/purell) flags. 121 | 122 | ### Usage 123 | 124 | ```go 125 | import "github.com/goware/urlx" 126 | 127 | func main() { 128 | url, _ := urlx.Parse("localhost:80///x///y/z/../././index.html?b=y&a=x#t=20") 129 | normalized, _ := urlx.Normalize(url) 130 | 131 | fmt.Print(normalized) 132 | // Prints http://localhost/x/y/index.html?a=x&b=y#t=20 133 | } 134 | ``` 135 | 136 | ## Splitting host:port from URL 137 | 138 | The [urlx.SplitHostPort()](https://godoc.org/github.com/goware/urlx#SplitHostPort) is compatible with the same function from [net](https://golang.org/pkg/net/) pkg, but has slightly different behavior. It doesn't remove brackets from `[IPv6]` host. 139 | 140 | ### Usage 141 | 142 | ```go 143 | import "github.com/goware/urlx" 144 | 145 | func main() { 146 | url, _ := urlx.Parse("localhost:80") 147 | host, port, _ := urlx.SplitHostPort(url) 148 | 149 | fmt.Print(host) 150 | // Prints localhost 151 | 152 | fmt.Print(port) 153 | // Prints 80 154 | } 155 | ``` 156 | 157 | ## Resolving IP address from URL 158 | 159 | The [urlx.Resolve()](https://godoc.org/github.com/goware/urlx#Resolve) is compatible with [ResolveIPAddr()](https://golang.org/pkg/net/#ResolveIPAddr) from [net](https://golang.org/pkg/net/). 160 | 161 | ### Usage 162 | 163 | ```go 164 | url, _ := urlx.Parse("localhost") 165 | ip, _ := urlx.Resolve(url) 166 | 167 | fmt.Print(ip) 168 | // Prints 127.0.0.1 169 | ``` 170 | 171 | ## License 172 | URLx is licensed under the [MIT License](./LICENSE). 
173 | -------------------------------------------------------------------------------- /urlx.go: -------------------------------------------------------------------------------- 1 | // Package urlx parses and normalizes URLs. It can also resolve hostname to an IP address. 2 | package urlx 3 | 4 | import ( 5 | "errors" 6 | "net" 7 | "net/url" 8 | "regexp" 9 | "strconv" 10 | "strings" 11 | 12 | "github.com/PuerkitoBio/purell" 13 | "golang.org/x/net/idna" 14 | ) 15 | 16 | // Parse parses raw URL string into the net/url URL struct. 17 | // It uses the url.Parse() internally, but it slightly changes 18 | // its behavior: 19 | // 1. It forces the default scheme and port to http 20 | // 2. It favors absolute paths over relative ones, thus "example.com" 21 | // is parsed into url.Host instead of url.Path. 22 | // 3. It lowercases the Host (not only the Scheme). 23 | func Parse(rawURL string) (*url.URL, error) { 24 | return ParseWithDefaultScheme(rawURL, "http") 25 | } 26 | 27 | func ParseWithDefaultScheme(rawURL string, scheme string) (*url.URL, error) { 28 | rawURL = defaultScheme(rawURL, scheme) 29 | 30 | // Use net/url.Parse() now. 31 | u, err := url.Parse(rawURL) 32 | if err != nil { 33 | return nil, err 34 | } 35 | 36 | host, _, err := SplitHostPort(u) 37 | if err != nil { 38 | return nil, err 39 | } 40 | if err := checkHost(host); err != nil { 41 | return nil, err 42 | } 43 | 44 | u.Host = strings.ToLower(u.Host) 45 | u.Scheme = strings.ToLower(u.Scheme) 46 | 47 | return u, nil 48 | } 49 | 50 | func defaultScheme(rawURL, scheme string) string { 51 | // Force default http scheme, so net/url.Parse() doesn't 52 | // put both host and path into the (relative) path. 53 | if strings.Index(rawURL, "//") == 0 { 54 | // Leading double slashes (any scheme). Force http. 55 | rawURL = scheme + ":" + rawURL 56 | } 57 | if !strings.Contains(rawURL, "://") { 58 | // Missing scheme. Force http. 
59 | rawURL = scheme + "://" + rawURL 60 | } 61 | return rawURL 62 | } 63 | 64 | var ( 65 | domainRegexp = regexp.MustCompile(`^([a-zA-Z0-9-_]{1,63}\.)*([a-zA-Z0-9-]{1,63})$`) 66 | ipv4Regexp = regexp.MustCompile(`^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$`) 67 | ipv6Regexp = regexp.MustCompile(`^\[[a-fA-F0-9:]+\]$`) 68 | ) 69 | 70 | func checkHost(host string) error { 71 | if host == "" { 72 | return &url.Error{Op: "host", URL: host, Err: errors.New("empty host")} 73 | } 74 | 75 | host = strings.ToLower(host) 76 | if domainRegexp.MatchString(host) { 77 | return nil 78 | } 79 | 80 | if punycode, err := idna.ToASCII(host); err != nil { 81 | return err 82 | } else if domainRegexp.MatchString(punycode) { 83 | return nil 84 | } 85 | 86 | // IPv4 and IPv6. 87 | if ipv4Regexp.MatchString(host) || ipv6Regexp.MatchString(host) { 88 | return nil 89 | } 90 | 91 | return &url.Error{Op: "host", URL: host, Err: errors.New("invalid host")} 92 | } 93 | 94 | // SplitHostPort splits network address of the form "host:port" into 95 | // host and port. Unlike net.SplitHostPort(), it doesn't remove brackets 96 | // from [IPv6] host and it accepts net/url.URL struct instead of a string. 97 | func SplitHostPort(u *url.URL) (host, port string, err error) { 98 | if u == nil { 99 | return "", "", &url.Error{Op: "host", URL: host, Err: errors.New("empty url")} 100 | } 101 | host = u.Host 102 | 103 | // Find last colon. 104 | i := strings.LastIndex(host, ":") 105 | if i == -1 { 106 | // No port found. 107 | return host, "", nil 108 | } 109 | 110 | // Return if the last colon is inside [IPv6] brackets. 111 | if strings.HasPrefix(host, "[") && strings.Contains(host[i:], "]") { 112 | // No port found. 
113 | return host, "", nil 114 | } 115 | 116 | if i == len(host)-1 { 117 | return "", "", &url.Error{Op: "port", URL: u.String(), Err: errors.New("empty port")} 118 | } 119 | 120 | port = host[i+1:] 121 | host = host[:i] 122 | 123 | if _, err := strconv.Atoi(port); err != nil { 124 | return "", "", &url.Error{Op: "port", URL: u.String(), Err: err} 125 | } 126 | 127 | return host, port, nil 128 | } 129 | 130 | const normalizeFlags purell.NormalizationFlags = purell.FlagRemoveDefaultPort | 131 | purell.FlagDecodeDWORDHost | purell.FlagDecodeOctalHost | purell.FlagDecodeHexHost | 132 | purell.FlagRemoveUnnecessaryHostDots | purell.FlagRemoveDotSegments | purell.FlagRemoveDuplicateSlashes | 133 | purell.FlagUppercaseEscapes | purell.FlagDecodeUnnecessaryEscapes | purell.FlagEncodeNecessaryEscapes | 134 | purell.FlagSortQuery 135 | 136 | // Normalize returns normalized URL string. 137 | // Behavior: 138 | // 1. Remove unnecessary host dots. 139 | // 2. Remove default port (http://localhost:80 becomes http://localhost). 140 | // 3. Remove duplicate slashes. 141 | // 4. Remove unnecessary dots from path. 142 | // 5. Sort query parameters. 143 | // 6. Decode host IP into decimal numbers. 144 | // 7. Handle escape values. 145 | // 8. Decode Punycode domains into UTF8 representation. 146 | func Normalize(u *url.URL) (string, error) { 147 | host, port, err := SplitHostPort(u) 148 | if err != nil { 149 | return "", err 150 | } 151 | if err := checkHost(host); err != nil { 152 | return "", err 153 | } 154 | 155 | // Decode Punycode. 156 | host, err = idna.ToUnicode(host) 157 | if err != nil { 158 | return "", err 159 | } 160 | 161 | u.Host = strings.ToLower(host) 162 | if port != "" { 163 | u.Host += ":" + port 164 | } 165 | u.Scheme = strings.ToLower(u.Scheme) 166 | 167 | return purell.NormalizeURL(u, normalizeFlags), nil 168 | } 169 | 170 | // NormalizeString returns normalized URL string. 171 | // It's a shortcut for Parse() and Normalize() funcs. 
172 | func NormalizeString(rawURL string) (string, error) { 173 | u, err := Parse(rawURL) 174 | if err != nil { 175 | return "", err 176 | } 177 | 178 | return Normalize(u) 179 | } 180 | 181 | // Resolve resolves the URL host to its IP address. 182 | func Resolve(u *url.URL) (*net.IPAddr, error) { 183 | host, _, err := SplitHostPort(u) 184 | if err != nil { 185 | return nil, err 186 | } 187 | 188 | addr, err := net.ResolveIPAddr("ip", host) 189 | if err != nil { 190 | return nil, err 191 | } 192 | 193 | return addr, nil 194 | } 195 | 196 | // Resolve resolves the URL host to its IP address. 197 | // It's a shortcut for Parse() and Resolve() funcs. 198 | func ResolveString(rawURL string) (*net.IPAddr, error) { 199 | u, err := Parse(rawURL) 200 | if err != nil { 201 | return nil, err 202 | } 203 | return Resolve(u) 204 | } 205 | 206 | func URIEncode(uri string) (string, error) { 207 | u, err := url.Parse(uri) 208 | if err != nil { 209 | return "", err 210 | } 211 | return u.String(), nil 212 | } 213 | -------------------------------------------------------------------------------- /urlx_test.go: -------------------------------------------------------------------------------- 1 | package urlx_test 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "testing" 7 | 8 | "github.com/goware/urlx" 9 | ) 10 | 11 | func TestParse(t *testing.T) { 12 | tests := []struct { 13 | in string 14 | out string 15 | err bool 16 | }{ 17 | // Error out on missing host: 18 | {in: "", err: true}, 19 | {in: "/", err: true}, 20 | {in: "//", err: true}, 21 | 22 | // Test schemes: 23 | {in: "http://example.com", out: "http://example.com"}, 24 | {in: "HTTP://x.example.com", out: "http://x.example.com"}, 25 | {in: "http://localhost", out: "http://localhost"}, 26 | {in: "http://user.local", out: "http://user.local"}, 27 | {in: "http://kubernetes-service", out: "http://kubernetes-service"}, 28 | {in: "https://example.com", out: "https://example.com"}, 29 | {in: "HTTPS://example.com", out: 
"https://example.com"}, 30 | {in: "ssh://example.com:22", out: "ssh://example.com:22"}, 31 | {in: "jabber://example.com:5222", out: "jabber://example.com:5222"}, 32 | 33 | // Leading double slashes (any scheme) defaults to http: 34 | {in: "//example.com", out: "http://example.com"}, 35 | 36 | // Empty scheme defaults to http: 37 | {in: "localhost", out: "http://localhost"}, 38 | {in: "LOCALHOST", out: "http://localhost"}, 39 | {in: "localhost:80", out: "http://localhost:80"}, 40 | {in: "localhost:8080", out: "http://localhost:8080"}, 41 | {in: "user.local", out: "http://user.local"}, 42 | {in: "user.local:80", out: "http://user.local:80"}, 43 | {in: "user.local:8080", out: "http://user.local:8080"}, 44 | {in: "kubernetes-service", out: "http://kubernetes-service"}, 45 | {in: "kubernetes-service:80", out: "http://kubernetes-service:80"}, 46 | {in: "kubernetes-service:8080", out: "http://kubernetes-service:8080"}, 47 | {in: "127.0.0.1", out: "http://127.0.0.1"}, 48 | {in: "127.0.0.1:80", out: "http://127.0.0.1:80"}, 49 | {in: "127.0.0.1:8080", out: "http://127.0.0.1:8080"}, 50 | {in: "[2001:db8:a0b:12f0::1]", out: "http://[2001:db8:a0b:12f0::1]"}, 51 | {in: "[2001:db8:a0b:12f0::80]", out: "http://[2001:db8:a0b:12f0::80]"}, 52 | 53 | // Keep the port even on matching scheme: 54 | {in: "http://localhost:80", out: "http://localhost:80"}, 55 | {in: "http://localhost:8080", out: "http://localhost:8080"}, 56 | {in: "http://x.example.io:8080", out: "http://x.example.io:8080"}, 57 | {in: "[2001:db8:a0b:12f0::80]:80", out: "http://[2001:db8:a0b:12f0::80]:80"}, 58 | {in: "[2001:db8:a0b:12f0::1]:8080", out: "http://[2001:db8:a0b:12f0::1]:8080"}, 59 | 60 | // Test domains, subdomains etc.: 61 | {in: "example.com", out: "http://example.com"}, 62 | {in: "1.example.com", out: "http://1.example.com"}, 63 | {in: "1.example.io", out: "http://1.example.io"}, 64 | {in: "subsub.sub.example.com", out: "http://subsub.sub.example.com"}, 65 | {in: "subdomain_test.example.com", out: 
"http://subdomain_test.example.com"}, 66 | 67 | // Test userinfo: 68 | {in: "user@example.com", out: "http://user@example.com"}, 69 | {in: "user:passwd@example.com", out: "http://user:passwd@example.com"}, 70 | {in: "https://user:passwd@subsub.sub.example.com", out: "https://user:passwd@subsub.sub.example.com"}, 71 | 72 | // Lowercase scheme and host by default. Let net/url normalize URL by default: 73 | {in: "hTTp://subSUB.sub.EXAMPLE.COM/x//////y///foo.mp3?c=z&a=x&b=y#t=20", out: "http://subsub.sub.example.com/x//////y///foo.mp3?c=z&a=x&b=y#t=20"}, 74 | 75 | // IDNA Punycode domains. 76 | // TODO: net/url escapes all the fields in String() method. Should we fix it? 77 | {in: "http://www.žluťoučký-kůň.cz/úpěl-ďábelské-ódy", out: "http://www.%C5%BElu%C5%A5ou%C4%8Dk%C3%BD-k%C5%AF%C5%88.cz/%C3%BAp%C4%9Bl-%C4%8F%C3%A1belsk%C3%A9-%C3%B3dy"}, 78 | {in: "http://www.xn--luouk-k-z2a6lsyxjlexh.cz/úpěl-ďábelské-ódy", out: "http://www.xn--luouk-k-z2a6lsyxjlexh.cz/%C3%BAp%C4%9Bl-%C4%8F%C3%A1belsk%C3%A9-%C3%B3dy"}, 79 | {in: "http://żółć.pl/żółć.html", out: "http://%C5%BC%C3%B3%C5%82%C4%87.pl/%C5%BC%C3%B3%C5%82%C4%87.html"}, 80 | {in: "http://xn--kda4b0koi.pl/żółć.html", out: "http://xn--kda4b0koi.pl/%C5%BC%C3%B3%C5%82%C4%87.html"}, 81 | 82 | // IANA TLDs. 83 | // TODO: net/url escapes all the fields in String() method. Should we fix it? 
84 | {in: "https://pressly.餐厅", out: "https://pressly.%E9%A4%90%E5%8E%85"}, 85 | {in: "https://pressly.组织机构", out: "https://pressly.%E7%BB%84%E7%BB%87%E6%9C%BA%E6%9E%84"}, 86 | 87 | // Some obviously wrong data: 88 | {in: "", err: true}, 89 | {in: "javascript:evilFunction()", err: true}, 90 | {in: "otherscheme:garbage", err: true}, 91 | {in: "", err: true}, 92 | 93 | {in: "http://www.google.com", out: "http://www.google.com"}, 94 | {in: "https://www.google.com", out: "https://www.google.com"}, 95 | {in: "HTTP://WWW.GOOGLE.COM", out: "http://www.google.com"}, 96 | {in: "HTTPS://WWW.google.COM", out: "https://www.google.com"}, 97 | {in: "http:/www.google.com", err: true}, 98 | {in: "http:///www.google.com", err: true}, 99 | {in: "javascript:void(0)", err: true}, 100 | {in: "