├── .gitignore
├── doc.go
├── LICENSE
├── README.md
├── tldextract_test.go
└── tldextract.go

/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
 2 | *.o
 3 | *.a
 4 | *.so
 5 | 
 6 | # Folders
 7 | _obj
 8 | _test
 9 | 
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 | 
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 | 
20 | _testmain.go
21 | 
22 | *.exe
23 | 
--------------------------------------------------------------------------------
/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 | Package tldextract extracts the gTLD or ccTLD (generic or country-code top-level domain), the registered root domain, and the subdomain from a URL, according to the Public Suffix List.
 4 | 
 5 | A simple usage:
 6 |     package main
 7 | 
 8 |     import (
 9 |         "fmt"
10 |         "github.com/joeguo/tldextract"
11 |     )
12 |     func main() {
13 |         urls := []string{"git+ssh://www.github.com:8443/", "http://media.forums.theregister.co.uk", "http://218.15.32.76", "http://google.com?q=cats"}
14 |         cache := "/tmp/tld.cache"
15 |         extract, _ := tldextract.New(cache, false)
16 | 
17 |         for _, u := range urls {
18 |             result := extract.Extract(u)
19 |             fmt.Printf("%+v;%s\n", result, u)
20 |         }
21 |     }
22 | 
23 | */
24 | package tldextract
25 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2012-2022 JoeGuo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | tldextract
 2 | ==========
 3 | 
 4 | Extract the root domain, subdomain, and TLD from a URL, using [the Public Suffix List](http://www.publicsuffix.org).
 5 | 
 6 | Installation
 7 | ==========
 8 | Install tldextract:
 9 | ```sh
10 | go get github.com/joeguo/tldextract
11 | 
12 | ```
13 | To run unit tests, run this command in tldextract's source directory ($GOPATH/src/github.com/joeguo/tldextract):
14 | 
15 | ```sh
16 | go test
17 | ```
18 | 
19 | Example
20 | ==========
21 | ```go
22 | package main
23 | 
24 | import (
25 |     "fmt"
26 |     "github.com/joeguo/tldextract"
27 | )
28 | 
29 | 
30 | func main() {
31 |     urls := []string{"git+ssh://www.github.com:8443/", "http://media.forums.theregister.co.uk", "http://218.15.32.76", "http://google.com?q=cats"}
32 |     cache := "/tmp/tld.cache"
33 |     extract, _ := tldextract.New(cache, false)
34 | 
35 |     for _, u := range urls {
36 |         result := extract.Extract(u)
37 |         fmt.Printf("%+v;%s\n", result, u)
38 |     }
39 | }
40 | 
41 | ```
42 | Output will look like:
43 | ```plain
44 | &{Flag:1 Sub:www Root:github Tld:com};git+ssh://www.github.com:8443/
45 | &{Flag:1 Sub:media.forums Root:theregister Tld:co.uk};http://media.forums.theregister.co.uk
46 | &{Flag:2 Sub: Root:218.15.32.76 Tld:};http://218.15.32.76
47 | &{Flag:1 Sub: Root:google Tld:com};http://google.com?q=cats
48 | ```
49 | Flag value meaning:
50 | ```go
51 | const (
52 |     Malformed = iota
53 |     Domain
54 |     Ip4
55 |     Ip6
56 | )
57 | ```
58 | 
59 | 
--------------------------------------------------------------------------------
/tldextract_test.go:
--------------------------------------------------------------------------------
 1 | package tldextract
 2 | 
 3 | import (
 4 |     "fmt"
 5 |     "log"
 6 |     "testing"
 7 | )
 8 | 
 9 | var (
10 |     cache      = "/tmp/tld.cache"
11 |     tldExtract *TLDExtract
12 |     err        error
13 | )
14 | 
15 | func init() {
16 |     tldExtract, err = New(cache, true)
17 |     if err != nil {
18 |         log.Fatal(err)
19 |     }
20 | }
21 | 
22 | func assert(url string, expected *Result, returned *Result, t *testing.T) {
23 |     if (expected.Flag == returned.Flag) && (expected.Root == returned.Root) && (expected.Sub == returned.Sub) && (expected.Tld == returned.Tld) {
24 |         return
25 |     }
26 |     t.Errorf("%s;expected:%+v;returned:%+v", url, expected, returned)
27 | }
28 | func aTestA(t *testing.T) { // not picked up by go test (lowercase name); rename to TestA for ad-hoc debugging
29 |     result := tldExtract.Extract("9down.cc.html&sa=u&ei=4sfsul-ximsb4ateiicaag&ved=0cbkqfjac&usg=afqjcnfmetjm8-gpgyszv9l1h6_5p369yg/wp-content/themes/airfolio/scripts/timthumb.php")
30 |     fmt.Println(result)
31 | }
32 | 
33 | func TestAll(t *testing.T) {
34 |     cases := map[string]*Result{"http://www.google.com": &Result{Flag: Domain, Sub: "www", Root: "google", Tld: "com"},
35 |         "https://www.google.com.hk/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CDQQFjAA&url=%68%74%74%70%3a%2f%2f%67%72%6f%75%70%73%2e%67%6f%6f%67%6c%65%2e%63%6f%6d%2f%67%72%6f%75%70%2f%67%6f%6c%61%6e%67%2d%6e%75%74%73%2f%62%72%6f%77%73%65%5f%74%68%72%65%61%64%2f%74%68%72%65%61%64%2f%62%31%61%36%65%31%66%38%37%30%32%62%33%31%31%62&ei=okjQULibA9GYiAfk3IDYDw&usg=AFQjCNFVxgAwHXnmEJWVURboSTiygUMTaQ&sig2=3AIxkh4TR5QYWGXCJtBSZg": &Result{Flag: Domain, Sub: "www", Root: "google", Tld: "com.hk"},
36 |         "http://joe.blogspot.co.uk": &Result{Flag: Domain, Sub: "", Root: "joe", Tld: "blogspot.co.uk"},
37 |         "ftp://johndoe:5cr1p7k1dd13@1337.warez.com:2501/": &Result{Flag: Domain, Sub: "1337", Root: "warez", Tld: "com"},
38 |         "git+ssh://www.github.com:8443/": &Result{Flag: Domain, Sub: "www", Root: "github", Tld: "com"},
39 |         "http://www.!github.com:8443/": &Result{Flag: Malformed},
40 |         "http://www.theregister.co.uk": &Result{Flag: Domain, Sub: "www", Root: "theregister", Tld: "co.uk"},
41 |         "http://media.forums.theregister.co.uk": &Result{Flag: Domain, Sub: "media.forums", Root: "theregister", Tld: "co.uk"},
42 |         "192.168.0.103": &Result{Flag: Ip4, Root: "192.168.0.103"},
43 |         "http://192.168.0.103": &Result{Flag: Ip4, Root: "192.168.0.103"},
44 |         "http://216.22.project.coop/": &Result{Flag: Domain, Sub: "216.22", Root: "project", Tld: "coop"},
45 |         "http://Gmail.org/": &Result{Flag: Domain, Root: "gmail", Tld: "org"},
46 |         "http://wiki.info/": &Result{Flag: Domain, Root: "wiki", Tld: "info"},
47 |         "http://wiki.information/": &Result{Flag: Malformed},
48 |         "http://wiki/": &Result{Flag: Malformed},
49 |         "http://258.15.32.876": &Result{Flag: Malformed},
50 |         "http://www.cgs.act.edu.au/": &Result{Flag: Domain, Sub: "www", Root: "cgs", Tld: "act.edu.au"},
51 |         "http://www.metp.net.cn": &Result{Flag: Domain, Sub: "www", Root: "metp", Tld: "net.cn"},
52 |         "http://net.cn": &Result{Flag: Malformed},
53 |         "http://google.com?q=cats": &Result{Flag: Domain, Sub: "", Root: "google", Tld: "com"},
54 |         "https://mail.google.com/mail": &Result{Flag: Domain, Sub: "mail", Root: "google", Tld: "com"},
55 |         "ssh://mail.google.com/mail": &Result{Flag: Domain, Sub: "mail", Root: "google", Tld: "com"},
56 |         "//mail.google.com/mail": &Result{Flag: Domain, Sub: "mail", Root: "google", Tld: "com"},
57 |         "mail.google.com/mail": &Result{Flag: Domain, Sub: "mail", Root: "google", Tld: "com"},
58 |         "9down.cc.html&sa=u&ei=4sfsul-ximsb4ateiicaag&ved=0cbkqfjac&usg=afqjcnfmetjm8-gpgyszv9l1h6_5p369yg/wp-content/themes/airfolio/scripts/timthumb.php": &Result{Flag: Domain, Sub: "", Root: "9down", Tld: "cc"},
59 |         "cy": &Result{Flag: Malformed},
60 |         "c.cy": &Result{Flag: Domain, Sub: "", Root: "c", Tld: "cy"},
61 |         "b.c.cy": &Result{Flag: Domain, Sub: "b", Root: "c", Tld: "cy"},
62 |         "a.b.c.cy": &Result{Flag: Domain, Sub: "a.b", Root: "c", Tld: "cy"},
63 |         "b.ide.kyoto.jp": &Result{Flag: Domain, Sub: "", Root: "b", Tld: "ide.kyoto.jp"},
64 |         "a.b.ide.kyoto.jp": &Result{Flag: Domain, Sub: "a", Root: "b", Tld: "ide.kyoto.jp"},
65 |         "b.c.kobe.jp": &Result{Flag: Domain, Sub: "", Root: "b", Tld: "c.kobe.jp"},
66 |         "a.b.c.kobe.jp": &Result{Flag: Domain, Sub: "a", Root: "b", Tld: "c.kobe.jp"},
67 |         "city.kobe.jp": &Result{Flag: Domain, Sub: "", Root: "city", Tld: "kobe.jp"},
68 |         "city.a.kobe.jp": &Result{Flag: Domain, Sub: "", Root: "city", Tld: "a.kobe.jp"},
69 |         "blogspot.co.uk": &Result{Flag: Malformed},
70 |         "blah.blogspot.co.uk": &Result{Flag: Domain, Sub: "", Root: "blah", Tld: "blogspot.co.uk"},
71 |     }
72 |     for url, expected := range cases {
73 |         returned := tldExtract.Extract(url)
74 |         assert(url, expected, returned, t)
75 |     }
76 | }
77 | 
--------------------------------------------------------------------------------
/tldextract.go:
--------------------------------------------------------------------------------
  1 | package tldextract
  2 | 
  3 | import (
  4 |     "bytes"
  5 |     "fmt"
  6 |     "io/ioutil"
  7 |     "net"
  8 |     "net/http"
  9 |     "regexp"
 10 |     "strings"
 11 | )
 12 | 
 13 | // Values used for Result.Flag.
 14 | const (
 15 |     Malformed = iota
 16 |     Domain
 17 |     Ip4
 18 |     Ip6
 19 | )
 20 | 
 21 | type Result struct {
 22 |     Flag int
 23 |     Sub  string
 24 |     Root string
 25 |     Tld  string
 26 | }
 27 | 
 28 | type TLDExtract struct {
 29 |     CacheFile  string
 30 |     rootNode   *Trie
 31 |     debug      bool
 32 |     noValidate bool // do not validate the URL scheme
 33 |     noStrip    bool // do not strip the .html suffix from the URL
 34 | }
 35 | 
 36 | type Trie struct {
 37 |     ExceptRule bool
 38 |     ValidTld   bool
 39 |     matches    map[string]*Trie
 40 | }
 41 | 
 42 | var (
 43 |     schemaregex = regexp.MustCompile(`^([abcdefghijklmnopqrstuvwxyz0123456789\+\-\.]+:)?//`)
 44 |     domainregex = regexp.MustCompile(`^[a-z0-9-\p{L}]{1,63}$`)
 45 |     ip4regex    = regexp.MustCompile(`(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])`)
 46 | )
 47 | 
 48 | // New creates a new *TLDExtract. It may be shared between goroutines; a single instance per application is usually enough.
 49 | func New(cacheFile string, debug bool) (*TLDExtract, error) {
 50 |     data, err := ioutil.ReadFile(cacheFile)
 51 |     if err != nil {
 52 |         data, err = download()
 53 |         if err != nil {
 54 |             return &TLDExtract{}, err
 55 |         }
 56 |         if err = ioutil.WriteFile(cacheFile, data, 0644); err != nil {
 57 |             return &TLDExtract{}, err
 58 |         }
 59 |     }
 60 |     ts := strings.Split(string(data), "\n")
 61 |     newMap := make(map[string]*Trie)
 62 |     rootNode := &Trie{ExceptRule: false, ValidTld: false, matches: newMap}
 63 |     for _, t := range ts {
 64 |         if t != "" && !strings.HasPrefix(t, "//") {
 65 |             t = strings.TrimSpace(t)
 66 |             exceptionRule := t[0] == '!'
 67 |             if exceptionRule {
 68 |                 t = t[1:]
 69 |             }
 70 |             addTldRule(rootNode, strings.Split(t, "."), exceptionRule)
 71 |         }
 72 |     }
 73 | 
 74 |     return &TLDExtract{CacheFile: cacheFile, rootNode: rootNode, debug: debug}, nil
 75 | }
 76 | 
 77 | // SetNoValidate disables the URL scheme check in order to increase performance.
 78 | func (extract *TLDExtract) SetNoValidate() {
 79 |     extract.noValidate = true
 80 | }
 81 | 
 82 | // SetNoStrip disables stripping of the .html suffix in order to increase performance.
 83 | func (extract *TLDExtract) SetNoStrip() {
 84 |     extract.noStrip = true
 85 | }
 86 | 
 87 | func addTldRule(rootNode *Trie, labels []string, ex bool) {
 88 |     numlabs := len(labels)
 89 |     t := rootNode
 90 |     for i := numlabs - 1; i >= 0; i-- {
 91 |         lab := labels[i]
 92 |         m, found := t.matches[lab]
 93 |         if !found {
 94 |             except := ex
 95 |             valid := !ex && i == 0
 96 |             newMap := make(map[string]*Trie)
 97 |             t.matches[lab] = &Trie{ExceptRule: except, ValidTld: valid, matches: newMap}
 98 |             m = t.matches[lab]
 99 |         }
100 |         t = m
101 |     }
102 | }
103 | 
104 | func (extract *TLDExtract) Extract(u string) *Result {
105 |     input := u
106 |     u = strings.ToLower(u)
107 |     if !extract.noValidate {
108 |         u = schemaregex.ReplaceAllString(u, "")
109 |         i := strings.Index(u, "@")
110 |         if i != -1 {
111 |             u = u[i+1:]
112 |         }
113 | 
114 |         index := strings.IndexFunc(u, func(r rune) bool {
115 |             switch r {
116 |             case '&', '/', '?', ':', '#':
117 |                 return true
118 |             }
119 |             return false
120 |         })
121 |         if index != -1 {
122 |             u = u[0:index]
123 |         }
124 |     }
125 |     if !extract.noStrip {
126 |         if strings.HasSuffix(u, ".html") {
127 |             u = u[0 : len(u)-len(".html")]
128 |         }
129 |     }
130 |     if extract.debug {
131 |         fmt.Printf("%s;%s\n", u, input)
132 |     }
133 |     return extract.extract(u)
134 | }
135 | 
136 | func (extract *TLDExtract) extract(url string) *Result {
137 |     domain, tld := extract.extractTld(url)
138 |     if tld == "" {
139 |         ip := net.ParseIP(url)
140 |         if ip != nil {
141 |             if ip4regex.MatchString(url) {
142 |                 return &Result{Flag: Ip4, Root: url}
143 |             }
144 |             return &Result{Flag: Ip6, Root: url}
145 |         }
146 |         return &Result{Flag: Malformed}
147 |     }
148 |     sub, root := subdomain(domain)
149 |     if domainregex.MatchString(root) {
150 |         return &Result{Flag: Domain, Root: root, Sub: sub, Tld: tld}
151 |     }
152 |     return &Result{Flag: Malformed}
153 | }
154 | 
155 | func (extract *TLDExtract) extractTld(url string) (domain, tld string) {
156 |     spl := strings.Split(url, ".")
157 |     tldIndex, validTld := extract.getTldIndex(spl)
158 |     if validTld {
159 |         domain = strings.Join(spl[:tldIndex], ".")
160 |         tld = strings.Join(spl[tldIndex:], ".")
161 |     } else {
162 |         domain = url
163 |     }
164 |     return
165 | }
166 | 
167 | func (extract *TLDExtract) getTldIndex(labels []string) (int, bool) {
168 |     t := extract.rootNode
169 |     parentValid := false
170 |     for i := len(labels) - 1; i >= 0; i-- {
171 |         lab := labels[i]
172 |         n, found := t.matches[lab]
173 |         _, starfound := t.matches["*"]
174 | 
175 |         switch {
176 |         case found && !n.ExceptRule:
177 |             parentValid = n.ValidTld
178 |             t = n
179 |         // Found an exception rule: this label is not part of the public suffix
180 |         case found:
181 |             fallthrough
182 |         case parentValid:
183 |             return i + 1, true
184 |         case starfound:
185 |             parentValid = true
186 |         default:
187 |             return -1, false
188 |         }
189 |     }
190 |     return -1, false
191 | }
192 | 
193 | // subdomain returns the subdomain and the root domain.
194 | func subdomain(d string) (string, string) {
195 |     ps := strings.Split(d, ".")
196 |     l := len(ps)
197 |     if l == 1 {
198 |         return "", d
199 |     }
200 |     return strings.Join(ps[0:l-1], "."), ps[l-1]
201 | }
202 | 
203 | func download() ([]byte, error) {
204 |     u := "https://publicsuffix.org/list/public_suffix_list.dat"
205 |     resp, err := http.Get(u)
206 |     if err != nil {
207 |         return nil, err
208 |     }
209 |     defer resp.Body.Close()
210 |     body, err := ioutil.ReadAll(resp.Body)
211 |     if err != nil {
212 |         return nil, err
213 |     }
214 | 
215 |     lines := strings.Split(string(body), "\n")
216 |     var buffer bytes.Buffer
217 | 
218 |     for _, line := range lines {
219 |         line = strings.TrimSpace(line)
220 |         if line != "" && !strings.HasPrefix(line, "//") {
221 |             buffer.WriteString(line)
222 |             buffer.WriteString("\n")
223 |         }
224 |     }
225 | 
226 |     return buffer.Bytes(), nil
227 | }
228 | 
--------------------------------------------------------------------------------
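
The README example only shows the default configuration. As a supplementary illustration (not part of the repository), here is a minimal sketch of how the Flag constants and the SetNoValidate/SetNoStrip switches from tldextract.go might be combined; the cache path and sample inputs are arbitrary choices for the sketch.

```go
package main

import (
	"fmt"

	"github.com/joeguo/tldextract"
)

func main() {
	// New downloads and caches the Public Suffix List on first use.
	extract, err := tldextract.New("/tmp/tld.cache", false)
	if err != nil {
		panic(err)
	}

	// Optional: skip scheme validation and .html stripping when the inputs
	// are already bare host names (trades robustness for speed).
	extract.SetNoValidate()
	extract.SetNoStrip()

	for _, u := range []string{"media.forums.theregister.co.uk", "192.168.0.103", "not_a_domain"} {
		r := extract.Extract(u)
		switch r.Flag {
		case tldextract.Domain:
			fmt.Printf("%s -> sub=%q root=%q tld=%q\n", u, r.Sub, r.Root, r.Tld)
		case tldextract.Ip4, tldextract.Ip6:
			fmt.Printf("%s -> IP address %s\n", u, r.Root)
		default: // tldextract.Malformed
			fmt.Printf("%s -> malformed input\n", u)
		}
	}
}
```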