├── README.md ├── clean.go └── clean_test.go /README.md: -------------------------------------------------------------------------------- 1 | # clean-url 2 | 3 | A tiny library for cleaning up the pollution around URLs, so we can figure out if two different URLs might actually point to same page. 4 | 5 | Examples: 6 | 7 | ```go 8 | Clean("https://foobar.com/?") 9 | // => foobar.com 10 | 11 | Clean("foobar.com/?utm_source=span&utm_content=eggs") 12 | // => foobar.com 13 | 14 | Clean("http://foobar.com/?yo=lo#heythere") 15 | // => foobar.com/?yo=lo 16 | ``` 17 | 18 | # Usage 19 | 20 | ```go 21 | import ( 22 | "github.com/kozmos/clean-url" 23 | ) 24 | 25 | func main () { 26 | cleanurl.Clean("foobar.com/?utm_source=span&utm_content=eggs") 27 | // => foobar.com 28 | } 29 | ``` 30 | 31 | # Kozmos' Usage 32 | 33 | This library is vital for [Kozmos](https://getkozmos.com), as it has to avoid polluted URLs to provide more 34 | reliable and accurate information. 35 | -------------------------------------------------------------------------------- /clean.go: -------------------------------------------------------------------------------- 1 | package cleanurl 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "regexp" 7 | "strings" 8 | ) 9 | 10 | func Clean(raw string) string { 11 | parsed, err := url.Parse(raw) 12 | if err != nil { 13 | return raw 14 | } 15 | 16 | return strings.TrimSpace(fmt.Sprintf("%s%s%s", 17 | CleanHost(parsed.Host), 18 | CleanPath(parsed.Path), 19 | CleanQuery(parsed.RawQuery))) 20 | } 21 | 22 | func CleanHost(host string) string { 23 | return regexp.MustCompile(`^www\.`).ReplaceAllString(host, "") 24 | } 25 | 26 | func CleanPath(path string) string { 27 | if len(path) == 0 { 28 | return "" 29 | } 30 | 31 | if path[len(path)-1:] == "/" { 32 | path = path[:len(path)-1] 33 | } 34 | 35 | return path 36 | } 37 | 38 | func CleanQuery(query string) string { 39 | if len(query) == 0 { 40 | return "" 41 | } 42 | 43 | values, err := url.ParseQuery(query) 44 | if err != nil { 45 | return "" 46 | } 47 | 48 | result := []string{} 49 | for k, v := range values { 50 | if k != "ref" && !strings.HasPrefix(k, "utm_") && len(v) > 0 && len(v[0]) > 0 { 51 | result = append(result, fmt.Sprintf("%s=%s", k, v[0])) 52 | } 53 | } 54 | 55 | if len(result) == 0 { 56 | return "" 57 | } 58 | 59 | return fmt.Sprintf("?%s", strings.Join(result, "&")) 60 | } 61 | -------------------------------------------------------------------------------- /clean_test.go: -------------------------------------------------------------------------------- 1 | package cleanurl_test 2 | 3 | import ( 4 | "github.com/kozmos/clean-url" 5 | "github.com/stretchr/testify/assert" 6 | "testing" 7 | ) 8 | 9 | func TestCleanURL(t *testing.T) { 10 | assert.Equal(t, "foobar.com", cleanurl.Clean("https://foobar.com")) 11 | assert.Equal(t, "foobar.com", cleanurl.Clean("http://foobar.com/")) 12 | assert.Equal(t, "foobar.com", cleanurl.Clean("http://foobar.com#")) 13 | assert.Equal(t, "foobar.com", cleanurl.Clean("http://www.foobar.com?")) 14 | assert.Equal(t, "foobar.com", cleanurl.Clean("http://www.foobar.com/?")) 15 | assert.Equal(t, "foobar.com", cleanurl.Clean("http://www.foobar.com/?#")) 16 | assert.Equal(t, "foobar.com/yo/lo?span=eggs&hey=there", cleanurl.Clean("http://foobar.com/yo/lo?span=eggs&hey=there#top")) 17 | assert.Equal(t, "foobar.com/yo/lo", cleanurl.Clean("http://foobar.com/yo/lo?utm_source=hi")) 18 | assert.Equal(t, "yolo.com/blog?reffoo=bar&span=eggs", cleanurl.Clean("http://www.yolo.com/blog/?span=eggs&utm_content=bufferc60dd&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer&ref=asd&reffoo=bar&empty")) 19 | } 20 | --------------------------------------------------------------------------------