├── go.mod
├── .travis.yml
├── go.sum
├── LICENSE
├── README.md
├── urlx.go
└── urlx_test.go
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/goware/urlx
2 |
3 | require (
4 | github.com/PuerkitoBio/purell v1.1.1
5 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
6 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd
7 | golang.org/x/text v0.3.0 // indirect
8 | )
9 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 |
3 | go:
4 | - 1.11.x
5 | - tip
6 |
7 | env:
8 | global:
9 | - GO111MODULE=on
10 |
11 | before_install:
12 | - curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $GOPATH/bin latest
13 |
14 | install:
15 | - go mod download
16 |
17 | script:
18 | - go mod tidy && git diff --exit-code; code=$?; git checkout -- .; (exit $code)
19 | - go test -race -cover -coverprofile=coverage.txt -covermode=atomic ./...
20 | - golangci-lint run
21 |
22 | after_success:
23 | - bash <(curl -s https://codecov.io/bash)
24 |
25 | matrix:
26 | fast_finish: true
27 | allow_failures:
28 | - go: tip
29 |
30 | notifications:
31 | email: false
32 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI=
2 | github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
3 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=
4 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
5 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd h1:HuTn7WObtcDo9uEEU7rEqL0jYthdXAmZ6PP+meazmaU=
6 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
7 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
8 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2014 Pressly Inc. www.pressly.com
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # URLx
2 | [Golang](http://golang.org/) pkg for URL parsing and normalization.
3 |
4 | 1. [Parsing URL](#parsing-url) ([GoDoc](https://godoc.org/github.com/goware/urlx#Parse))
5 | 2. [Normalizing URL](#normalizing-url) ([GoDoc](https://godoc.org/github.com/goware/urlx#Normalize))
6 | 3. [Splitting host:port from URL](#splitting-hostport-from-url) ([GoDoc](https://godoc.org/github.com/goware/urlx#SplitHostPort))
7 | 4. [Resolving IP address from URL](#resolving-ip-address-from-url) ([GoDoc](https://godoc.org/github.com/goware/urlx#Resolve))
8 |
9 | [GoDoc reference](https://godoc.org/github.com/goware/urlx)
10 | [Travis CI build status](https://travis-ci.org/goware/urlx)
11 |
12 | ## Parsing URL
13 |
14 | The [urlx.Parse()](https://godoc.org/github.com/goware/urlx#Parse) is compatible with the same function from [net/url](https://golang.org/pkg/net/url/#Parse) pkg, but has slightly different behavior. It enforces default scheme and favors absolute URLs over relative paths.
15 |
16 | ### Difference between [urlx](https://godoc.org/github.com/goware/urlx#Parse) and [net/url](https://golang.org/pkg/net/url/#Parse)
17 |
18 |
19 |
20 |
21 | | github.com/goware/urlx |
22 | net/url |
23 |
24 |
25 |
26 |
27 |
28 | urlx.Parse("example.com")
29 |
30 | &url.URL{
31 | Scheme: "http",
32 | Host: "example.com",
33 | Path: "",
34 | }
35 |
36 | |
37 |
38 |
39 | url.Parse("example.com")
40 |
41 | &url.URL{
42 | Scheme: "",
43 | Host: "",
44 | Path: "example.com",
45 | }
46 |
47 | |
48 |
49 |
50 |
51 |
52 | urlx.Parse("localhost:8080")
53 |
54 | &url.URL{
55 | Scheme: "http",
56 | Host: "localhost:8080",
57 | Path: "",
58 | Opaque: "",
59 | }
60 |
61 | |
62 |
63 |
64 | url.Parse("localhost:8080")
65 |
66 | &url.URL{
67 | Scheme: "localhost",
68 | Host: "",
69 | Path: "",
70 | Opaque: "8080",
71 | }
72 |
73 | |
74 |
75 |
76 |
77 |
78 | urlx.Parse("user.local:8000/path")
79 |
80 | &url.URL{
81 | Scheme: "http",
82 | Host: "user.local:8000",
83 | Path: "/path",
84 | Opaque: "",
85 | }
86 |
87 | |
88 |
89 |
90 | url.Parse("user.local:8000/path")
91 |
92 | &url.URL{
93 | Scheme: "user.local",
94 | Host: "",
95 | Path: "",
96 | Opaque: "8000/path",
97 | }
98 |
99 | |
100 |
101 |
102 |
103 | ### Usage
104 |
105 | ```go
106 | import "github.com/goware/urlx"
107 |
108 | func main() {
109 | url, _ := urlx.Parse("example.com")
110 | // url.Scheme == "http"
111 | // url.Host == "example.com"
112 |
113 | fmt.Print(url)
114 | // Prints http://example.com
115 | }
116 | ```
117 |
118 | ## Normalizing URL
119 |
120 | The [urlx.Normalize()](https://godoc.org/github.com/goware/urlx#Normalize) function normalizes the URL using the predefined subset of [Purell](https://github.com/PuerkitoBio/purell) flags.
121 |
122 | ### Usage
123 |
124 | ```go
125 | import "github.com/goware/urlx"
126 |
127 | func main() {
128 | url, _ := urlx.Parse("localhost:80///x///y/z/../././index.html?b=y&a=x#t=20")
129 | normalized, _ := urlx.Normalize(url)
130 |
131 | fmt.Print(normalized)
132 | // Prints http://localhost/x/y/index.html?a=x&b=y#t=20
133 | }
134 | ```
135 |
136 | ## Splitting host:port from URL
137 |
138 | The [urlx.SplitHostPort()](https://godoc.org/github.com/goware/urlx#SplitHostPort) is compatible with the same function from [net](https://golang.org/pkg/net/) pkg, but has slightly different behavior. It doesn't remove brackets from `[IPv6]` host.
139 |
140 | ### Usage
141 |
142 | ```go
143 | import "github.com/goware/urlx"
144 |
145 | func main() {
146 | url, _ := urlx.Parse("localhost:80")
147 | host, port, _ := urlx.SplitHostPort(url)
148 |
149 | fmt.Print(host)
150 | // Prints localhost
151 |
152 | fmt.Print(port)
153 | // Prints 80
154 | }
155 | ```
156 |
157 | ## Resolving IP address from URL
158 |
159 | The [urlx.Resolve()](https://godoc.org/github.com/goware/urlx#Resolve) is compatible with [ResolveIPAddr()](https://golang.org/pkg/net/#ResolveIPAddr) from [net](https://golang.org/pkg/net/).
160 |
161 | ### Usage
162 |
163 | ```go
164 | url, _ := urlx.Parse("localhost")
165 | ip, _ := urlx.Resolve(url)
166 |
167 | fmt.Print(ip)
168 | // Prints 127.0.0.1
169 | ```
170 |
171 | ## License
172 | URLx is licensed under the [MIT License](./LICENSE).
173 |
--------------------------------------------------------------------------------
/urlx.go:
--------------------------------------------------------------------------------
1 | // Package urlx parses and normalizes URLs. It can also resolve hostname to an IP address.
2 | package urlx
3 |
4 | import (
5 | "errors"
6 | "net"
7 | "net/url"
8 | "regexp"
9 | "strconv"
10 | "strings"
11 |
12 | "github.com/PuerkitoBio/purell"
13 | "golang.org/x/net/idna"
14 | )
15 |
16 | // Parse parses raw URL string into the net/url URL struct.
17 | // It uses the url.Parse() internally, but it slightly changes
18 | // its behavior:
19 | // 1. It forces the default scheme and port to http
20 | // 2. It favors absolute paths over relative ones, thus "example.com"
21 | // is parsed into url.Host instead of url.Path.
22 | // 3. It lowercases the Host (not only the Scheme).
23 | func Parse(rawURL string) (*url.URL, error) {
24 | return ParseWithDefaultScheme(rawURL, "http")
25 | }
26 |
27 | func ParseWithDefaultScheme(rawURL string, scheme string) (*url.URL, error) {
28 | rawURL = defaultScheme(rawURL, scheme)
29 |
30 | // Use net/url.Parse() now.
31 | u, err := url.Parse(rawURL)
32 | if err != nil {
33 | return nil, err
34 | }
35 |
36 | host, _, err := SplitHostPort(u)
37 | if err != nil {
38 | return nil, err
39 | }
40 | if err := checkHost(host); err != nil {
41 | return nil, err
42 | }
43 |
44 | u.Host = strings.ToLower(u.Host)
45 | u.Scheme = strings.ToLower(u.Scheme)
46 |
47 | return u, nil
48 | }
49 |
// defaultScheme returns rawURL with the given scheme prepended when rawURL
// does not carry a scheme of its own, so that net/url.Parse() doesn't put
// both host and path into the (relative) path.
//
// A scheme counts as present only when "://" appears before any path, query
// or fragment delimiter. A "://" that merely occurs later in the URL (for
// example inside a query value such as "example.com/?next=http://x") does
// not suppress the default scheme.
func defaultScheme(rawURL, scheme string) string {
	if strings.HasPrefix(rawURL, "//") {
		// Leading double slashes (scheme-relative URL). Use the default scheme.
		return scheme + ":" + rawURL
	}

	if i := strings.Index(rawURL, "://"); i >= 0 && !strings.ContainsAny(rawURL[:i], "/?#") {
		// The URL already starts with its own scheme separator.
		return rawURL
	}

	// Missing scheme. Use the default scheme.
	return scheme + "://" + rawURL
}
63 |
// domainRegexp matches dot-separated labels of 1-63 characters each; the
// underscore is allowed in every label except the last one.
var domainRegexp = regexp.MustCompile(`^([a-zA-Z0-9-_]{1,63}\.)*([a-zA-Z0-9-]{1,63})$`)

// ipv4Regexp matches four dot-separated groups of one to three digits.
var ipv4Regexp = regexp.MustCompile(`^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$`)

// ipv6Regexp matches a bracketed run of hex digits and colons.
var ipv6Regexp = regexp.MustCompile(`^\[[a-fA-F0-9:]+\]$`)
69 |
70 | func checkHost(host string) error {
71 | if host == "" {
72 | return &url.Error{Op: "host", URL: host, Err: errors.New("empty host")}
73 | }
74 |
75 | host = strings.ToLower(host)
76 | if domainRegexp.MatchString(host) {
77 | return nil
78 | }
79 |
80 | if punycode, err := idna.ToASCII(host); err != nil {
81 | return err
82 | } else if domainRegexp.MatchString(punycode) {
83 | return nil
84 | }
85 |
86 | // IPv4 and IPv6.
87 | if ipv4Regexp.MatchString(host) || ipv6Regexp.MatchString(host) {
88 | return nil
89 | }
90 |
91 | return &url.Error{Op: "host", URL: host, Err: errors.New("invalid host")}
92 | }
93 |
// SplitHostPort splits the network address stored in u.Host, of the form
// "host:port", into host and port. Unlike net.SplitHostPort(), it doesn't
// remove brackets from [IPv6] host and it accepts net/url.URL struct instead
// of a string.
//
// The returned port is empty when u.Host carries none. An error is returned
// when u is nil, when the last colon is not followed by a port, or when the
// port is not an unsigned decimal number in the range 0-65535.
func SplitHostPort(u *url.URL) (host, port string, err error) {
	if u == nil {
		return "", "", &url.Error{Op: "host", URL: "", Err: errors.New("empty url")}
	}
	host = u.Host

	// Find last colon.
	i := strings.LastIndex(host, ":")
	if i == -1 {
		// No port found.
		return host, "", nil
	}

	// Return if the last colon is inside [IPv6] brackets.
	if strings.HasPrefix(host, "[") && strings.Contains(host[i:], "]") {
		// No port found.
		return host, "", nil
	}

	if i == len(host)-1 {
		return "", "", &url.Error{Op: "port", URL: u.String(), Err: errors.New("empty port")}
	}

	host, port = host[:i], host[i+1:]

	// A port is an unsigned 16-bit number. ParseUint rejects sign prefixes
	// ("-80", "+80") and out-of-range values that Atoi would let through.
	if _, err := strconv.ParseUint(port, 10, 16); err != nil {
		return "", "", &url.Error{Op: "port", URL: u.String(), Err: err}
	}

	return host, port, nil
}
129 |
130 | const normalizeFlags purell.NormalizationFlags = purell.FlagRemoveDefaultPort |
131 | purell.FlagDecodeDWORDHost | purell.FlagDecodeOctalHost | purell.FlagDecodeHexHost |
132 | purell.FlagRemoveUnnecessaryHostDots | purell.FlagRemoveDotSegments | purell.FlagRemoveDuplicateSlashes |
133 | purell.FlagUppercaseEscapes | purell.FlagDecodeUnnecessaryEscapes | purell.FlagEncodeNecessaryEscapes |
134 | purell.FlagSortQuery
135 |
136 | // Normalize returns normalized URL string.
137 | // Behavior:
138 | // 1. Remove unnecessary host dots.
139 | // 2. Remove default port (http://localhost:80 becomes http://localhost).
140 | // 3. Remove duplicate slashes.
141 | // 4. Remove unnecessary dots from path.
142 | // 5. Sort query parameters.
143 | // 6. Decode host IP into decimal numbers.
144 | // 7. Handle escape values.
145 | // 8. Decode Punycode domains into UTF8 representation.
146 | func Normalize(u *url.URL) (string, error) {
147 | host, port, err := SplitHostPort(u)
148 | if err != nil {
149 | return "", err
150 | }
151 | if err := checkHost(host); err != nil {
152 | return "", err
153 | }
154 |
155 | // Decode Punycode.
156 | host, err = idna.ToUnicode(host)
157 | if err != nil {
158 | return "", err
159 | }
160 |
161 | u.Host = strings.ToLower(host)
162 | if port != "" {
163 | u.Host += ":" + port
164 | }
165 | u.Scheme = strings.ToLower(u.Scheme)
166 |
167 | return purell.NormalizeURL(u, normalizeFlags), nil
168 | }
169 |
170 | // NormalizeString returns normalized URL string.
171 | // It's a shortcut for Parse() and Normalize() funcs.
172 | func NormalizeString(rawURL string) (string, error) {
173 | u, err := Parse(rawURL)
174 | if err != nil {
175 | return "", err
176 | }
177 |
178 | return Normalize(u)
179 | }
180 |
181 | // Resolve resolves the URL host to its IP address.
182 | func Resolve(u *url.URL) (*net.IPAddr, error) {
183 | host, _, err := SplitHostPort(u)
184 | if err != nil {
185 | return nil, err
186 | }
187 |
188 | addr, err := net.ResolveIPAddr("ip", host)
189 | if err != nil {
190 | return nil, err
191 | }
192 |
193 | return addr, nil
194 | }
195 |
196 | // Resolve resolves the URL host to its IP address.
197 | // It's a shortcut for Parse() and Resolve() funcs.
198 | func ResolveString(rawURL string) (*net.IPAddr, error) {
199 | u, err := Parse(rawURL)
200 | if err != nil {
201 | return nil, err
202 | }
203 | return Resolve(u)
204 | }
205 |
// URIEncode parses uri with net/url.Parse() and returns the string form of
// the parsed URL, as produced by its String() method. If uri cannot be
// parsed, the parse error is returned together with an empty string.
func URIEncode(uri string) (string, error) {
	parsed, parseErr := url.Parse(uri)
	if parseErr == nil {
		return parsed.String(), nil
	}
	return "", parseErr
}
213 |
--------------------------------------------------------------------------------
/urlx_test.go:
--------------------------------------------------------------------------------
1 | package urlx_test
2 |
3 | import (
4 | "fmt"
5 | "strings"
6 | "testing"
7 |
8 | "github.com/goware/urlx"
9 | )
10 |
11 | func TestParse(t *testing.T) {
12 | tests := []struct {
13 | in string
14 | out string
15 | err bool
16 | }{
17 | // Error out on missing host:
18 | {in: "", err: true},
19 | {in: "/", err: true},
20 | {in: "//", err: true},
21 |
22 | // Test schemes:
23 | {in: "http://example.com", out: "http://example.com"},
24 | {in: "HTTP://x.example.com", out: "http://x.example.com"},
25 | {in: "http://localhost", out: "http://localhost"},
26 | {in: "http://user.local", out: "http://user.local"},
27 | {in: "http://kubernetes-service", out: "http://kubernetes-service"},
28 | {in: "https://example.com", out: "https://example.com"},
29 | {in: "HTTPS://example.com", out: "https://example.com"},
30 | {in: "ssh://example.com:22", out: "ssh://example.com:22"},
31 | {in: "jabber://example.com:5222", out: "jabber://example.com:5222"},
32 |
33 | // Leading double slashes (any scheme) defaults to http:
34 | {in: "//example.com", out: "http://example.com"},
35 |
36 | // Empty scheme defaults to http:
37 | {in: "localhost", out: "http://localhost"},
38 | {in: "LOCALHOST", out: "http://localhost"},
39 | {in: "localhost:80", out: "http://localhost:80"},
40 | {in: "localhost:8080", out: "http://localhost:8080"},
41 | {in: "user.local", out: "http://user.local"},
42 | {in: "user.local:80", out: "http://user.local:80"},
43 | {in: "user.local:8080", out: "http://user.local:8080"},
44 | {in: "kubernetes-service", out: "http://kubernetes-service"},
45 | {in: "kubernetes-service:80", out: "http://kubernetes-service:80"},
46 | {in: "kubernetes-service:8080", out: "http://kubernetes-service:8080"},
47 | {in: "127.0.0.1", out: "http://127.0.0.1"},
48 | {in: "127.0.0.1:80", out: "http://127.0.0.1:80"},
49 | {in: "127.0.0.1:8080", out: "http://127.0.0.1:8080"},
50 | {in: "[2001:db8:a0b:12f0::1]", out: "http://[2001:db8:a0b:12f0::1]"},
51 | {in: "[2001:db8:a0b:12f0::80]", out: "http://[2001:db8:a0b:12f0::80]"},
52 |
53 | // Keep the port even on matching scheme:
54 | {in: "http://localhost:80", out: "http://localhost:80"},
55 | {in: "http://localhost:8080", out: "http://localhost:8080"},
56 | {in: "http://x.example.io:8080", out: "http://x.example.io:8080"},
57 | {in: "[2001:db8:a0b:12f0::80]:80", out: "http://[2001:db8:a0b:12f0::80]:80"},
58 | {in: "[2001:db8:a0b:12f0::1]:8080", out: "http://[2001:db8:a0b:12f0::1]:8080"},
59 |
60 | // Test domains, subdomains etc.:
61 | {in: "example.com", out: "http://example.com"},
62 | {in: "1.example.com", out: "http://1.example.com"},
63 | {in: "1.example.io", out: "http://1.example.io"},
64 | {in: "subsub.sub.example.com", out: "http://subsub.sub.example.com"},
65 | {in: "subdomain_test.example.com", out: "http://subdomain_test.example.com"},
66 |
67 | // Test userinfo:
68 | {in: "user@example.com", out: "http://user@example.com"},
69 | {in: "user:passwd@example.com", out: "http://user:passwd@example.com"},
70 | {in: "https://user:passwd@subsub.sub.example.com", out: "https://user:passwd@subsub.sub.example.com"},
71 |
72 | // Lowercase scheme and host by default. Let net/url normalize URL by default:
73 | {in: "hTTp://subSUB.sub.EXAMPLE.COM/x//////y///foo.mp3?c=z&a=x&b=y#t=20", out: "http://subsub.sub.example.com/x//////y///foo.mp3?c=z&a=x&b=y#t=20"},
74 |
75 | // IDNA Punycode domains.
76 | // TODO: net/url escapes all the fields in String() method. Should we fix it?
77 | {in: "http://www.žluťoučký-kůň.cz/úpěl-ďábelské-ódy", out: "http://www.%C5%BElu%C5%A5ou%C4%8Dk%C3%BD-k%C5%AF%C5%88.cz/%C3%BAp%C4%9Bl-%C4%8F%C3%A1belsk%C3%A9-%C3%B3dy"},
78 | {in: "http://www.xn--luouk-k-z2a6lsyxjlexh.cz/úpěl-ďábelské-ódy", out: "http://www.xn--luouk-k-z2a6lsyxjlexh.cz/%C3%BAp%C4%9Bl-%C4%8F%C3%A1belsk%C3%A9-%C3%B3dy"},
79 | {in: "http://żółć.pl/żółć.html", out: "http://%C5%BC%C3%B3%C5%82%C4%87.pl/%C5%BC%C3%B3%C5%82%C4%87.html"},
80 | {in: "http://xn--kda4b0koi.pl/żółć.html", out: "http://xn--kda4b0koi.pl/%C5%BC%C3%B3%C5%82%C4%87.html"},
81 |
82 | // IANA TLDs.
83 | // TODO: net/url escapes all the fields in String() method. Should we fix it?
84 | {in: "https://pressly.餐厅", out: "https://pressly.%E9%A4%90%E5%8E%85"},
85 | {in: "https://pressly.组织机构", out: "https://pressly.%E7%BB%84%E7%BB%87%E6%9C%BA%E6%9E%84"},
86 |
87 | // Some obviously wrong data:
88 | {in: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==", err: true},
89 | {in: "javascript:evilFunction()", err: true},
90 | {in: "otherscheme:garbage", err: true},
91 | {in: "", err: true},
92 |
93 | {in: "http://www.google.com", out: "http://www.google.com"},
94 | {in: "https://www.google.com", out: "https://www.google.com"},
95 | {in: "HTTP://WWW.GOOGLE.COM", out: "http://www.google.com"},
96 | {in: "HTTPS://WWW.google.COM", out: "https://www.google.com"},
97 | {in: "http:/www.google.com", err: true},
98 | {in: "http:///www.google.com", err: true},
99 | {in: "javascript:void(0)", err: true},
100 | {in: "