├── .gitignore
├── doc.go
├── LICENSE
├── README.md
├── tldextract_test.go
└── tldextract.go

/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
 2 | *.o
 3 | *.a
 4 | *.so
 5 | 
 6 | # Folders
 7 | _obj
 8 | _test
 9 | 
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 | 
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 | 
20 | _testmain.go
21 | 
22 | *.exe
23 | 
--------------------------------------------------------------------------------
/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 | Package tldextract extracts the gTLD or ccTLD (generic or country-code top-level domain), the registered root domain, and the subdomain from a URL, according to the Public Suffix List.
 4 | 
 5 | A simple usage:
 6 |     package main
 7 | 
 8 |     import (
 9 |         "fmt"
10 |         "github.com/joeguo/tldextract"
11 |     )
12 |     func main() {
13 |         urls := []string{"git+ssh://www.github.com:8443/", "http://media.forums.theregister.co.uk", "http://218.15.32.76", "http://google.com?q=cats"}
14 |         cache := "/tmp/tld.cache"
15 |         extract, _ := tldextract.New(cache, false)
16 | 
17 |         for _, u := range urls {
18 |             result := extract.Extract(u)
19 |             fmt.Printf("%+v;%s\n", result, u)
20 |         }
21 |     }
22 | 
23 | */
24 | package tldextract
25 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2012-2022 JoeGuo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | tldextract
 2 | ==========
 3 | 
 4 | Extract the root domain, subdomain, and TLD from a URL, using [the Public Suffix List](http://www.publicsuffix.org).
 5 | 
 6 | Installation
 7 | ==========
 8 | Install tldextract:
 9 | ```sh
10 | go get github.com/joeguo/tldextract
11 | 
12 | ```
13 | To run unit tests, run this command in tldextract's source directory ($GOPATH/src/github.com/joeguo/tldextract):
14 | 
15 | ```sh
16 | go test
17 | ```
18 | 
19 | Example
20 | ==========
21 | ```go
22 | package main
23 | 
24 | import (
25 |     "fmt"
26 |     "github.com/joeguo/tldextract"
27 | )
28 | 
29 | 
30 | func main() {
31 |     urls := []string{"git+ssh://www.github.com:8443/", "http://media.forums.theregister.co.uk", "http://218.15.32.76", "http://google.com?q=cats"}
32 |     cache := "/tmp/tld.cache"
33 |     extract, _ := tldextract.New(cache, false)
34 | 
35 |     for _, u := range urls {
36 |         result := extract.Extract(u)
37 |         fmt.Printf("%+v;%s\n", result, u)
38 |     }
39 | }
40 | 
41 | ```
42 | Output will look like:
43 | ```plain
44 | &{Flag:1 Sub:www Root:github Tld:com};git+ssh://www.github.com:8443/
45 | &{Flag:1 Sub:media.forums Root:theregister Tld:co.uk};http://media.forums.theregister.co.uk
46 | &{Flag:2 Sub: Root:218.15.32.76 Tld:};http://218.15.32.76
47 | &{Flag:1 Sub: Root:google Tld:com};http://google.com?q=cats
48 | ```
49 | Flag value meaning:
50 | ```go
51 | const (
52 |     Malformed = iota
53 |     Domain
54 |     Ip4
55 |     Ip6
56 | )
57 | ```
58 | 
59 | 
--------------------------------------------------------------------------------
/tldextract_test.go:
--------------------------------------------------------------------------------
 1 | package tldextract
 2 | 
 3 | import (
 4 |     "fmt"
 5 |     "log"
 6 |     "testing"
 7 | )
 8 | 
 9 | var (
10 |     cache      = "/tmp/tld.cache"
11 |     tldExtract *TLDExtract
12 |     err        error
13 | )
14 | 
15 | func init() {
16 |     tldExtract, err = New(cache, true)
17 |     if err != nil {
18 |         log.Fatal(err)
19 |     }
20 | }
21 | 
22 | func assert(url string, expected *Result, returned *Result, t *testing.T) {
23 |     if (expected.Flag == returned.Flag) && (expected.Root == returned.Root) && (expected.Sub == returned.Sub) && (expected.Tld == returned.Tld) {
24 |         return
25 |     }
26 |     t.Errorf("%s;expected:%+v;returned:%+v", url, expected, returned)
27 | }
28 | func aTestA(t *testing.T) { // not picked up by go test (lowercase name); rename to TestA for ad-hoc debugging
29 |     result := tldExtract.Extract("9down.cc.html&sa=u&ei=4sfsul-ximsb4ateiicaag&ved=0cbkqfjac&usg=afqjcnfmetjm8-gpgyszv9l1h6_5p369yg/wp-content/themes/airfolio/scripts/timthumb.php")
30 |     fmt.Println(result)
31 | }
32 | 
33 | func TestAll(t *testing.T) {
34 |     cases := map[string]*Result{"http://www.google.com": &Result{Flag: Domain, Sub: "www", Root: "google", Tld: "com"},
35 |         "https://www.google.com.hk/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CDQQFjAA&url=%68%74%74%70%3a%2f%2f%67%72%6f%75%70%73%2e%67%6f%6f%67%6c%65%2e%63%6f%6d%2f%67%72%6f%75%70%2f%67%6f%6c%61%6e%67%2d%6e%75%74%73%2f%62%72%6f%77%73%65%5f%74%68%72%65%61%64%2f%74%68%72%65%61%64%2f%62%31%61%36%65%31%66%38%37%30%32%62%33%31%31%62&ei=okjQULibA9GYiAfk3IDYDw&usg=AFQjCNFVxgAwHXnmEJWVURboSTiygUMTaQ&sig2=3AIxkh4TR5QYWGXCJtBSZg": &Result{Flag: Domain, Sub: "www", Root: "google", Tld: "com.hk"},
36 |         "http://joe.blogspot.co.uk": &Result{Flag: Domain, Sub: "", Root: "joe", Tld: "blogspot.co.uk"},
37 |         "ftp://johndoe:5cr1p7k1dd13@1337.warez.com:2501/": &Result{Flag: Domain, Sub: "1337", Root: "warez", Tld: "com"},
38 |         "git+ssh://www.github.com:8443/": &Result{Flag: Domain, Sub: "www", Root: "github", Tld: "com"},
39 |         "http://www.!github.com:8443/": &Result{Flag: Malformed},
40 |         "http://www.theregister.co.uk": &Result{Flag: Domain, Sub: "www", Root: "theregister", Tld: "co.uk"},
41 |         "http://media.forums.theregister.co.uk": &Result{Flag: Domain, Sub: "media.forums", Root: "theregister", Tld: "co.uk"},
42 |         "192.168.0.103": &Result{Flag: Ip4, Root: "192.168.0.103"},
43 |         "http://192.168.0.103": &Result{Flag: Ip4, Root: "192.168.0.103"},
44 |         "http://216.22.project.coop/": &Result{Flag: Domain, Sub: "216.22", Root: "project", Tld: "coop"},
45 |         "http://Gmail.org/": &Result{Flag: Domain, Root: "gmail", Tld: "org"},
46 |         "http://wiki.info/": &Result{Flag: Domain, Root: "wiki", Tld: "info"},
47 |         "http://wiki.information/": &Result{Flag: Malformed},
48 |         "http://wiki/": &Result{Flag: Malformed},
49 |         "http://258.15.32.876": &Result{Flag: Malformed},
50 |         "http://www.cgs.act.edu.au/": &Result{Flag: Domain, Sub: "www", Root: "cgs", Tld: "act.edu.au"},
51 |         "http://www.metp.net.cn": &Result{Flag: Domain, Sub: "www", Root: "metp", Tld: "net.cn"},
52 |         "http://net.cn": &Result{Flag: Malformed},
53 |         "http://google.com?q=cats": &Result{Flag: Domain, Sub: "", Root: "google", Tld: "com"},
54 |         "https://mail.google.com/mail": &Result{Flag: Domain, Sub: "mail", Root: "google", Tld: "com"},
55 |         "ssh://mail.google.com/mail": &Result{Flag: Domain, Sub: "mail", Root: "google", Tld: "com"},
56 |         "//mail.google.com/mail": &Result{Flag: Domain, Sub: "mail", Root: "google", Tld: "com"},
57 |         "mail.google.com/mail": &Result{Flag: Domain, Sub: "mail", Root: "google", Tld: "com"},
58 |         "9down.cc.html&sa=u&ei=4sfsul-ximsb4ateiicaag&ved=0cbkqfjac&usg=afqjcnfmetjm8-gpgyszv9l1h6_5p369yg/wp-content/themes/airfolio/scripts/timthumb.php": &Result{Flag: Domain, Sub: "", Root: "9down", Tld: "cc"},
59 |         "cy": &Result{Flag: Malformed},
60 |         "c.cy": &Result{Flag: Domain, Sub: "", Root: "c", Tld: "cy"},
61 |         "b.c.cy": &Result{Flag: Domain, Sub: "b", Root: "c", Tld: "cy"},
62 |         "a.b.c.cy": &Result{Flag: Domain, Sub: "a.b", Root: "c", Tld: "cy"},
63 |         "b.ide.kyoto.jp": &Result{Flag: Domain, Sub: "", Root: "b", Tld: "ide.kyoto.jp"},
64 |         "a.b.ide.kyoto.jp": &Result{Flag: Domain, Sub: "a", Root: "b", Tld: "ide.kyoto.jp"},
65 |         "b.c.kobe.jp": &Result{Flag: Domain, Sub: "", Root: "b", Tld: "c.kobe.jp"},
66 |         "a.b.c.kobe.jp": &Result{Flag: Domain, Sub: "a", Root: "b", Tld: "c.kobe.jp"},
67 |         "city.kobe.jp": &Result{Flag: Domain, Sub: "", Root: "city", Tld: "kobe.jp"},
68 |         "city.a.kobe.jp": &Result{Flag: Domain, Sub: "", Root: "city", Tld: "a.kobe.jp"},
69 |         "blogspot.co.uk": &Result{Flag: Malformed},
70 |         "blah.blogspot.co.uk": &Result{Flag: Domain, Sub: "", Root: "blah", Tld: "blogspot.co.uk"},
71 |     }
72 |     for url, expected := range cases {
73 |         returned := tldExtract.Extract(url)
74 |         assert(url, expected, returned, t)
75 |     }
76 | }
77 | 
--------------------------------------------------------------------------------
/tldextract.go:
--------------------------------------------------------------------------------
  1 | package tldextract
  2 | 
  3 | import (
  4 |     "bytes"
  5 |     "fmt"
  6 |     "io/ioutil"
  7 |     "net"
  8 |     "net/http"
  9 |     "regexp"
 10 |     "strings"
 11 | )
 12 | 
 13 | // Values used for Result.Flag.
 14 | const (
 15 |     Malformed = iota
 16 |     Domain
 17 |     Ip4
 18 |     Ip6
 19 | )
 20 | 
 21 | type Result struct {
 22 |     Flag int
 23 |     Sub  string
 24 |     Root string
 25 |     Tld  string
 26 | }
 27 | 
 28 | type TLDExtract struct {
 29 |     CacheFile  string
 30 |     rootNode   *Trie
 31 |     debug      bool
 32 |     noValidate bool // do not validate the URL scheme
 33 |     noStrip    bool // do not strip the .html suffix from the URL
 34 | }
 35 | 
 36 | type Trie struct {
 37 |     ExceptRule bool
 38 |     ValidTld   bool
 39 |     matches    map[string]*Trie
 40 | }
 41 | 
 42 | var (
 43 |     schemaregex = regexp.MustCompile(`^([abcdefghijklmnopqrstuvwxyz0123456789\+\-\.]+:)?//`)
 44 |     domainregex = regexp.MustCompile(`^[a-z0-9-\p{L}]{1,63}$`)
 45 |     ip4regex    = regexp.MustCompile(`(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])`)
 46 | )
 47 | 
 48 | // New creates a new *TLDExtract. It may be shared between goroutines; a single instance per application is usually enough.
 49 | func New(cacheFile string, debug bool) (*TLDExtract, error) {
 50 |     data, err := ioutil.ReadFile(cacheFile)
 51 |     if err != nil {
 52 |         data, err = download()
 53 |         if err != nil {
 54 |             return &TLDExtract{}, err
 55 |         }
 56 |         if err = ioutil.WriteFile(cacheFile, data, 0644); err != nil {
 57 |             return &TLDExtract{}, err
 58 |         }
 59 |     }
 60 |     ts := strings.Split(string(data), "\n")
 61 |     newMap := make(map[string]*Trie)
 62 |     rootNode := &Trie{ExceptRule: false, ValidTld: false, matches: newMap}
 63 |     for _, t := range ts {
 64 |         if t != "" && !strings.HasPrefix(t, "//") {
 65 |             t = strings.TrimSpace(t)
 66 |             exceptionRule := t[0] == '!'
 67 |             if exceptionRule {
 68 |                 t = t[1:]
 69 |             }
 70 |             addTldRule(rootNode, strings.Split(t, "."), exceptionRule)
 71 |         }
 72 |     }
 73 | 
 74 |     return &TLDExtract{CacheFile: cacheFile, rootNode: rootNode, debug: debug}, nil
 75 | }
 76 | 
 77 | // SetNoValidate disables the URL scheme check in order to increase performance.
 78 | func (extract *TLDExtract) SetNoValidate() {
 79 |     extract.noValidate = true
 80 | }
 81 | 
 82 | // SetNoStrip disables stripping of the .html suffix in order to increase performance.
 83 | func (extract *TLDExtract) SetNoStrip() {
 84 |     extract.noStrip = true
 85 | }
 86 | 
 87 | func addTldRule(rootNode *Trie, labels []string, ex bool) {
 88 |     numlabs := len(labels)
 89 |     t := rootNode
 90 |     for i := numlabs - 1; i >= 0; i-- {
 91 |         lab := labels[i]
 92 |         m, found := t.matches[lab]
 93 |         if !found {
 94 |             except := ex
 95 |             valid := !ex && i == 0
 96 |             newMap := make(map[string]*Trie)
 97 |             t.matches[lab] = &Trie{ExceptRule: except, ValidTld: valid, matches: newMap}
 98 |             m = t.matches[lab]
 99 |         }
100 |         t = m
101 |     }
102 | }
103 | 
104 | func (extract *TLDExtract) Extract(u string) *Result {
105 |     input := u
106 |     u = strings.ToLower(u)
107 |     if !extract.noValidate {
108 |         u = schemaregex.ReplaceAllString(u, "")
109 |         i := strings.Index(u, "@")
110 |         if i != -1 {
111 |             u = u[i+1:]
112 |         }
113 | 
114 |         index := strings.IndexFunc(u, func(r rune) bool {
115 |             switch r {
116 |             case '&', '/', '?', ':', '#':
117 |                 return true
118 |             }
119 |             return false
120 |         })
121 |         if index != -1 {
122 |             u = u[0:index]
123 |         }
124 |     }
125 |     if !extract.noStrip {
126 |         if strings.HasSuffix(u, ".html") {
127 |             u = u[0 : len(u)-len(".html")]
128 |         }
129 |     }
130 |     if extract.debug {
131 |         fmt.Printf("%s;%s\n", u, input)
132 |     }
133 |     return extract.extract(u)
134 | }
135 | 
136 | func (extract *TLDExtract) extract(url string) *Result {
137 |     domain, tld := extract.extractTld(url)
138 |     if tld == "" {
139 |         ip := net.ParseIP(url)
140 |         if ip != nil {
141 |             if ip4regex.MatchString(url) {
142 |                 return &Result{Flag: Ip4, Root: url}
143 |             }
144 |             return &Result{Flag: Ip6, Root: url}
145 |         }
146 |         return &Result{Flag: Malformed}
147 |     }
148 |     sub, root := subdomain(domain)
149 |     if domainregex.MatchString(root) {
150 |         return &Result{Flag: Domain, Root: root, Sub: sub, Tld: tld}
151 |     }
152 |     return &Result{Flag: Malformed}
153 | }
154 | 
155 | func (extract *TLDExtract) extractTld(url string) (domain, tld string) {
156 |     spl := strings.Split(url, ".")
157 |     tldIndex, validTld := extract.getTldIndex(spl)
158 |     if validTld {
159 |         domain = strings.Join(spl[:tldIndex], ".")
160 |         tld = strings.Join(spl[tldIndex:], ".")
161 |     } else {
162 |         domain = url
163 |     }
164 |     return
165 | }
166 | 
167 | func (extract *TLDExtract) getTldIndex(labels []string) (int, bool) {
168 |     t := extract.rootNode
169 |     parentValid := false
170 |     for i := len(labels) - 1; i >= 0; i-- {
171 |         lab := labels[i]
172 |         n, found := t.matches[lab]
173 |         _, starfound := t.matches["*"]
174 | 
175 |         switch {
176 |         case found && !n.ExceptRule:
177 |             parentValid = n.ValidTld
178 |             t = n
179 |         // Found an exception rule: this label is not part of the public suffix
180 |         case found:
181 |             fallthrough
182 |         case parentValid:
183 |             return i + 1, true
184 |         case starfound:
185 |             parentValid = true
186 |         default:
187 |             return -1, false
188 |         }
189 |     }
190 |     return -1, false
191 | }
192 | 
193 | // subdomain returns the subdomain and the root domain.
194 | func subdomain(d string) (string, string) {
195 |     ps := strings.Split(d, ".")
196 |     l := len(ps)
197 |     if l == 1 {
198 |         return "", d
199 |     }
200 |     return strings.Join(ps[0:l-1], "."), ps[l-1]
201 | }
202 | 
203 | func download() ([]byte, error) {
204 |     u := "https://publicsuffix.org/list/public_suffix_list.dat"
205 |     resp, err := http.Get(u)
206 |     if err != nil {
207 |         return nil, err
208 |     }
209 |     defer resp.Body.Close()
210 |     body, err := ioutil.ReadAll(resp.Body)
211 |     if err != nil {
212 |         return nil, err
213 |     }
214 | 
215 |     lines := strings.Split(string(body), "\n")
216 |     var buffer bytes.Buffer
217 | 
218 |     for _, line := range lines {
219 |         line = strings.TrimSpace(line)
220 |         if line != "" && !strings.HasPrefix(line, "//") {
221 |             buffer.WriteString(line)
222 |             buffer.WriteString("\n")
223 |         }
224 |     }
225 | 
226 |     return buffer.Bytes(), nil
227 | }
228 | 
--------------------------------------------------------------------------------
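
The README example only shows the default configuration. As a supplementary illustration (not part of the repository), here is a minimal sketch of how the Flag constants and the SetNoValidate/SetNoStrip switches from tldextract.go might be combined; the cache path and sample inputs are arbitrary choices for the sketch.

```go
package main

import (
	"fmt"

	"github.com/joeguo/tldextract"
)

func main() {
	// New downloads and caches the Public Suffix List on first use.
	extract, err := tldextract.New("/tmp/tld.cache", false)
	if err != nil {
		panic(err)
	}

	// Optional: skip scheme validation and .html stripping when the inputs
	// are already bare host names (trades robustness for speed).
	extract.SetNoValidate()
	extract.SetNoStrip()

	for _, u := range []string{"media.forums.theregister.co.uk", "192.168.0.103", "not_a_domain"} {
		r := extract.Extract(u)
		switch r.Flag {
		case tldextract.Domain:
			fmt.Printf("%s -> sub=%q root=%q tld=%q\n", u, r.Sub, r.Root, r.Tld)
		case tldextract.Ip4, tldextract.Ip6:
			fmt.Printf("%s -> IP address %s\n", u, r.Root)
		default: // tldextract.Malformed
			fmt.Printf("%s -> malformed input\n", u)
		}
	}
}
```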