├── .gitattributes
├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── .licrc
├── CODE_OF_CONDUCT.md
├── CREDITS.md
├── LICENSE
├── Makefile
├── README.md
├── Trie_example.svg
├── benchmark_test.go
├── cmd
    ├── fasttld
    │   ├── extract.go
    │   └── root.go
    └── main.go
├── data
    └── gen.go
├── data_test.go
├── demo.gif
├── examples
    └── demo.go
├── fallback.go
├── fasttld.go
├── fasttld_test.go
├── go.mod
├── go.sum
├── net.go
├── net_test.go
├── print.go
├── print_test.go
├── psl.go
├── psl_test.go
├── renovate.json
├── strings.go
├── strings_test.go
└── test
    ├── mini_public_suffix_list.dat
    └── public_suffix_list.dat


/.gitattributes:
--------------------------------------------------------------------------------
1 | # To prevent CRLF breakages on Windows for fragile files, like testdata.
2 | * -text


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | permissions:
 3 |   contents: read
 4 |   pull-requests: write
 5 | on: [push, pull_request, workflow_dispatch]
 6 | jobs:
 7 |   format-markdown:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - name: Checkout
11 |         uses: actions/checkout@v4
12 | 
13 |       - name: Format Markdown with markdownlint
14 |         run: |
15 |           npm install -g markdownlint-cli
16 |           markdownlint --disable MD013 MD033 --fix . --ignore CODE_OF_CONDUCT.md
17 |           git add -A
18 |           git diff --cached --exit-code
19 |   test-and-coverage:
20 |     strategy:
21 |       matrix:
22 |         os: [macos-latest, windows-latest, ubuntu-latest]
23 |     runs-on: ${{ matrix.os }}
24 |     steps:
25 |     - name: Check out repository
26 |       uses: actions/checkout@v4
27 |       with:
28 |         fetch-depth: 0 # to support `git describe`
29 |     - name: Setup go
30 |       uses: actions/setup-go@v5
31 |       with:
32 |         go-version-file: './go.mod'
33 |     - name: Test
34 |       run: make tests
35 |     - name: Build CLI app
36 |       run: make build_cli
37 |     - name: |
38 |         If HEAD is not tagged, CLI app version tag should be newer than latest git version tag
39 |         If HEAD is tagged, CLI app version tag should be equal to latest git version tag
40 |       shell: bash
41 |       run: |
42 |         LATEST_TAG=$(git describe --tags `git rev-list --tags --max-count=1`)
43 |         CLI_TAG=$(dist/fasttld -v | awk '{print $NF}')
44 |         if [[ $( printf $LATEST_TAG"\n"$CLI_TAG ) != $( printf $LATEST_TAG"\n"$CLI_TAG | sort -V ) ]]
45 |         then
46 |           echo "Expected CLI app version number $CLI_TAG to be newer than or equal to latest git version number $LATEST_TAG. Check Makefile."
47 |           exit 1
48 |         fi
49 |         if [[ $(git describe --exact-match --tags HEAD 2>&1) =~ .*"no tag exactly matches".* ]]; then
50 |           if [[ $LATEST_TAG == $CLI_TAG ]]
51 |           then
52 |             echo "HEAD is not tagged. Expected CLI app version number $CLI_TAG to be newer than latest git version number $LATEST_TAG. Check Makefile."
53 |             exit 1
54 |           fi
55 |         else
56 |           if [[ $LATEST_TAG != $CLI_TAG ]]
57 |           then
58 |             echo "HEAD is tagged. Expected CLI app version number $CLI_TAG to equal to latest git version number $LATEST_TAG. Check Makefile."
59 |             exit 1
60 |           fi
61 |         fi
62 |     - name: Convert Go coverage to lcov
63 |       if: matrix.os == 'ubuntu-latest'
64 |       run: |
65 |         go install github.com/jandelgado/gcov2lcov@latest
66 |         gcov2lcov -infile=coverage.out -outfile=coverage.lcov
67 |     - name: Upload coverage to Coveralls
68 |       if: matrix.os == 'ubuntu-latest'
69 |       uses: coverallsapp/github-action@v2
70 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.prof
 2 | 
 3 | # Binaries for programs and plugins
 4 | *.exe
 5 | *.exe~
 6 | *.dll
 7 | *.so
 8 | *.dylib
 9 | 
10 | # Test binary, built with `go test -c`
11 | *.test
12 | 
13 | # Output of the go coverage tool, specifically when used with LiteIDE
14 | *.out
15 | *.html
16 | 
17 | # Dependency directories (remove the comment below to include it)
18 | # vendor/
19 | 
20 | public_suffix_list.dat
21 | dist/


--------------------------------------------------------------------------------
/.licrc:
--------------------------------------------------------------------------------
 1 | # IMPORTANT!: ALL SECTIONS ARE MANDATORY
 2 | [licenses]
 3 | unaccepted = ["CC0", "EPL", "MPL" , "OSL", "RPL", "LGPL", "GPL", "AGPL"]
 4 | 
 5 | [dependencies]
 6 | # This will allow users to flag some dependencies so that Licensebat will not check for their license.
 7 | ignored=[]
 8 | 
 9 | [behavior]
10 | # False by default, if true, it will only run the checks when one of the dependency files or the .licrc file has been modified.
11 | run_only_on_dependency_modification = true
12 | # False by default, if true, it will never block the build.
13 | do_not_block_pr = false


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | # Contributor Covenant Code of Conduct
  2 | 
  3 | ## Our Pledge
  4 | 
  5 | We as members, contributors, and leaders pledge to make participation in our
  6 | community a harassment-free experience for everyone, regardless of age, body
  7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  8 | identity and expression, level of experience, education, socio-economic status,
  9 | nationality, personal appearance, race, religion, or sexual identity
 10 | and orientation.
 11 | 
 12 | We pledge to act and interact in ways that contribute to an open, welcoming,
 13 | diverse, inclusive, and healthy community.
 14 | 
 15 | ## Our Standards
 16 | 
 17 | Examples of behavior that contributes to a positive environment for our
 18 | community include:
 19 | 
 20 | * Demonstrating empathy and kindness toward other people
 21 | * Being respectful of differing opinions, viewpoints, and experiences
 22 | * Giving and gracefully accepting constructive feedback
 23 | * Accepting responsibility and apologizing to those affected by our mistakes,
 24 |   and learning from the experience
 25 | * Focusing on what is best not just for us as individuals, but for the
 26 |   overall community
 27 | 
 28 | Examples of unacceptable behavior include:
 29 | 
 30 | * The use of sexualized language or imagery, and sexual attention or
 31 |   advances of any kind
 32 | * Trolling, insulting or derogatory comments, and personal or political attacks
 33 | * Public or private harassment
 34 | * Publishing others' private information, such as a physical or email
 35 |   address, without their explicit permission
 36 | * Other conduct which could reasonably be considered inappropriate in a
 37 |   professional setting
 38 | 
 39 | ## Enforcement Responsibilities
 40 | 
 41 | Community leaders are responsible for clarifying and enforcing our standards of
 42 | acceptable behavior and will take appropriate and fair corrective action in
 43 | response to any behavior that they deem inappropriate, threatening, offensive,
 44 | or harmful.
 45 | 
 46 | Community leaders have the right and responsibility to remove, edit, or reject
 47 | comments, commits, code, wiki edits, issues, and other contributions that are
 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 49 | decisions when appropriate.
 50 | 
 51 | ## Scope
 52 | 
 53 | This Code of Conduct applies within all community spaces, and also applies when
 54 | an individual is officially representing the community in public spaces.
 55 | Examples of representing our community include using an official e-mail address,
 56 | posting via an official social media account, or acting as an appointed
 57 | representative at an online or offline event.
 58 | 
 59 | ## Enforcement
 60 | 
 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 62 | reported to the community leaders responsible for enforcement at
 63 | wutingfeng@outlook.com.
 64 | All complaints will be reviewed and investigated promptly and fairly.
 65 | 
 66 | All community leaders are obligated to respect the privacy and security of the
 67 | reporter of any incident.
 68 | 
 69 | ## Enforcement Guidelines
 70 | 
 71 | Community leaders will follow these Community Impact Guidelines in determining
 72 | the consequences for any action they deem in violation of this Code of Conduct:
 73 | 
 74 | ### 1. Correction
 75 | 
 76 | **Community Impact**: Use of inappropriate language or other behavior deemed
 77 | unprofessional or unwelcome in the community.
 78 | 
 79 | **Consequence**: A private, written warning from community leaders, providing
 80 | clarity around the nature of the violation and an explanation of why the
 81 | behavior was inappropriate. A public apology may be requested.
 82 | 
 83 | ### 2. Warning
 84 | 
 85 | **Community Impact**: A violation through a single incident or series
 86 | of actions.
 87 | 
 88 | **Consequence**: A warning with consequences for continued behavior. No
 89 | interaction with the people involved, including unsolicited interaction with
 90 | those enforcing the Code of Conduct, for a specified period of time. This
 91 | includes avoiding interactions in community spaces as well as external channels
 92 | like social media. Violating these terms may lead to a temporary or
 93 | permanent ban.
 94 | 
 95 | ### 3. Temporary Ban
 96 | 
 97 | **Community Impact**: A serious violation of community standards, including
 98 | sustained inappropriate behavior.
 99 | 
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 | 
106 | ### 4. Permanent Ban
107 | 
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior,  harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 | 
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 | 
115 | ## Attribution
116 | 
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 | 
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 | 
124 | [homepage]: https://www.contributor-covenant.org
125 | 
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 | 


--------------------------------------------------------------------------------
/CREDITS.md:
--------------------------------------------------------------------------------
 1 | # Credits
 2 | 
 3 | This application uses code from other open-source projects. The copyright statements of these open-source projects are listed below.
 4 | 
 5 | ## Go
 6 | 
 7 | Source: <https://github.com/golang/go>
 8 | 
 9 | ```markdown
10 | Copyright (c) 2009 The Go Authors. All rights reserved.
11 | 
12 | Redistribution and use in source and binary forms, with or without
13 | modification, are permitted provided that the following conditions are
14 | met:
15 | 
16 |    * Redistributions of source code must retain the above copyright
17 | notice, this list of conditions and the following disclaimer.
18 |    * Redistributions in binary form must reproduce the above
19 | copyright notice, this list of conditions and the following disclaimer
20 | in the documentation and/or other materials provided with the
21 | distribution.
22 |    * Neither the name of Google Inc. nor the names of its
23 | contributors may be used to endorse or promote products derived from
24 | this software without specific prior written permission.
25 | 
26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 | ```
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2022, Wu Tingfeng
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice, this
 9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its
16 |    contributors may be used to endorse or promote products derived from
17 |    this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | tests:
 2 | 	go test -v -race -covermode atomic -coverprofile coverage.out && go tool cover -html coverage.out -o coverage.html
 3 | 
 4 | tests_without_race:
 5 | 	go test -v -covermode atomic -coverprofile coverage.out && go tool cover -html coverage.out -o coverage.html
 6 | 
 7 | format:
 8 | 	go fmt . ./cmd/... ./cmd/fasttld/... ./examples/...
 9 | 
10 | bench:
11 | 	go test -bench . -benchmem -cpu 1
12 | 
13 | report_bench:
14 | 	go test -cpuprofile cpu.prof -memprofile mem.prof -bench . -cpu 1
15 | 
16 | cpu_report:
17 | 	go tool pprof cpu.prof
18 | 
19 | mem_report:
20 | 	go tool pprof mem.prof
21 | 
22 | build_cli:
23 | 	go build -o ./dist/fasttld -ldflags "-X 'github.com/elliotwutingfeng/go-fasttld/cmd/fasttld.version=v0.4.5'" ./cmd/main.go
24 | 
25 | demo:
26 | 	go run ./examples/demo.go
27 | 
28 | update_psl:
29 | 	go generate data/gen.go
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # go-fasttld
  2 | 
  3 | [![Go Reference](https://img.shields.io/badge/go-reference-blue?logo=go&logoColor=white&style=for-the-badge)](https://pkg.go.dev/github.com/elliotwutingfeng/go-fasttld)
  4 | [![Go Report Card](https://goreportcard.com/badge/github.com/elliotwutingfeng/go-fasttld?style=for-the-badge)](https://goreportcard.com/report/github.com/elliotwutingfeng/go-fasttld)
  5 | [![Coveralls](https://img.shields.io/coverallsCoverage/github/elliotwutingfeng/go-fasttld?logo=coveralls&style=for-the-badge)](https://coveralls.io/github/elliotwutingfeng/go-fasttld?branch=main)
  6 | [![Mentioned in Awesome Go](https://img.shields.io/static/v1?logo=awesomelists&label=&labelColor=CCA6C4&logoColor=261120&message=Mentioned%20in%20awesome&color=494368&style=for-the-badge)](https://github.com/avelino/awesome-go)
  7 | 
  8 | [![GitHub license](https://img.shields.io/badge/LICENSE-BSD--3--CLAUSE-GREEN?style=for-the-badge)](LICENSE)
  9 | 
 10 | ## Summary
 11 | 
 12 | **go-fasttld** is a high-performance [effective top-level domain (eTLD)](https://wiki.mozilla.org/Public_Suffix_List) extraction module that extracts subcomponents from [URLs](https://en.wikipedia.org/wiki/URL).
 13 | 
 14 | URLs can contain hostnames, IPv4 addresses, or IPv6 addresses. eTLD extraction is based on the [Mozilla Public Suffix List](http://www.publicsuffix.org). Private domains listed in the [Mozilla Public Suffix List](http://www.publicsuffix.org) like 'blogspot.co.uk' and 'sinaapp.com' are also supported.
 15 | 
 16 | ![Demo](demo.gif)
 17 | 
 18 | Spot any bugs? Report them [here](https://github.com/elliotwutingfeng/go-fasttld/issues)
 19 | 
 20 | ## Installation
 21 | 
 22 | ```sh
 23 | go get github.com/elliotwutingfeng/go-fasttld
 24 | ```
 25 | 
 26 | ## Try the CLI
 27 | 
 28 | First, build the CLI application.
 29 | 
 30 | ```sh
 31 | # `git clone` and `cd` to the go-fasttld repository folder first
 32 | make build_cli
 33 | ```
 34 | 
 35 | Afterwards, try extracting subcomponents from a URL.
 36 | 
 37 | ```sh
 38 | # `git clone` and `cd` to the go-fasttld repository folder first
 39 | ./dist/fasttld extract https://user@a.subdomain.example.a%63.uk:5000/a/b\?id\=42
 40 | ```
 41 | 
 42 | ## Try the example code
 43 | 
 44 | All of the following examples can be found at `examples/demo.go`. To run the demo, use the following command:
 45 | 
 46 | ```sh
 47 | # `git clone` and `cd` to the go-fasttld repository folder first
 48 | make demo
 49 | ```
 50 | 
 51 | ### Hostname
 52 | 
 53 | ```go
 54 | // Initialise fasttld extractor
 55 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
 56 | 
 57 | // Extract URL subcomponents
 58 | url := "https://user@a.subdomain.example.a%63.uk:5000/a/b?id=42"
 59 | res, _ := extractor.Extract(fasttld.URLParams{URL: url})
 60 | 
 61 | // Display results
 62 | fasttld.PrintRes(url, res) // Pretty-prints res.Scheme, res.UserInfo, res.SubDomain etc.
 63 | ```
 64 | 
 65 | | Scheme   | UserInfo | SubDomain   | Domain  | Suffix | RegisteredDomain | Port | Path       | HostType     |
 66 | |----------|----------|-------------|---------|--------|------------------|------|------------|--------------|
 67 | | https:// | user     | a.subdomain | example | a%63.uk  | example.a%63.uk    | 5000 | /a/b?id=42 | hostname     |
 68 | 
 69 | ### IPv4 Address
 70 | 
 71 | ```go
 72 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
 73 | url := "https://127.0.0.1:5000"
 74 | res, _ := extractor.Extract(fasttld.URLParams{URL: url})
 75 | ```
 76 | 
 77 | | Scheme   | UserInfo | SubDomain | Domain    | Suffix | RegisteredDomain | Port | Path | HostType     |
 78 | |----------|----------|-----------|-----------|--------|------------------|------|------|--------------|
 79 | | https:// |          |           | 127.0.0.1 |        | 127.0.0.1        | 5000 |      | ipv4 address |
 80 | 
 81 | ### IPv6 Address
 82 | 
 83 | ```go
 84 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
 85 | url := "https://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]:5000"
 86 | res, _ := extractor.Extract(fasttld.URLParams{URL: url})
 87 | ```
 88 | 
 89 | | Scheme   | UserInfo | SubDomain | Domain                                  | Suffix | RegisteredDomain                        | Port | Path | HostType     |
 90 | |----------|----------|-----------|-----------------------------------------|--------|-----------------------------------------|------|------|--------------|
 91 | | https:// |          |           | aBcD:ef01:2345:6789:aBcD:ef01:2345:6789 |        | aBcD:ef01:2345:6789:aBcD:ef01:2345:6789 | 5000 |      | ipv6 address |
 92 | 
 93 | ### Internationalised label separators
 94 | 
 95 | **go-fasttld** supports the following internationalised label separators (IETF RFC 3490):
 96 | 
 97 | | Full Stop  | Ideographic Full Stop | Fullwidth Full Stop | Halfwidth Ideographic Full Stop |
 98 | |------------|-----------------------|---------------------|---------------------------------|
 99 | | U+002E `.` | U+3002 `。`           | U+FF0E `．`         | U+FF61 `｡`                      |
100 | 
101 | ```go
102 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
103 | url := "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk"
104 | res, _ := extractor.Extract(fasttld.URLParams{URL: url})
105 | ```
106 | 
107 | | Scheme   | UserInfo | SubDomain                             | Domain | Suffix    | RegisteredDomain  | Port | Path | HostType     |
108 | |----------|----------|---------------------------------------|--------|-----------|-------------------|------|------|--------------|
109 | | https:// |          | brb\u002ei\u3002am\uff0egoing\uff61to | be     | a\uff61fk | be\u3002a\uff61fk |      |      | hostname     |
110 | 
111 | ## Public Suffix List options
112 | 
113 | ### Specify custom public suffix list file
114 | 
115 | You can use a custom public suffix list file by setting `CacheFilePath` in `fasttld.SuffixListParams{}` to its absolute path.
116 | 
117 | ```go
118 | cacheFilePath := "/absolute/path/to/file.dat"
119 | extractor, err := fasttld.New(fasttld.SuffixListParams{CacheFilePath: cacheFilePath})
120 | ```
121 | 
122 | ### Updating the default Public Suffix List cache
123 | 
124 | Whenever `fasttld.New` is called without specifying `CacheFilePath` in `fasttld.SuffixListParams{}`, the local cache of the default Public Suffix List is updated automatically if it is more than 3 days old. You can also manually update the cache by using `Update()`.
125 | 
126 | ```go
127 | // Automatic update performed if `CacheFilePath` is not specified
128 | // and local cache is more than 3 days old
129 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
130 | 
131 | // Manually update local cache
132 | if err := extractor.Update(); err != nil {
133 |     log.Println(err)
134 | }
135 | ```
136 | 
137 | ### Private domains
138 | 
139 | According to the [Mozilla.org wiki](https://wiki.mozilla.org/Public_Suffix_List/Uses), the Mozilla Public Suffix List contains private domains like `blogspot.com` and `sinaapp.com`.
140 | 
141 | By default, these private domains are excluded (i.e. `IncludePrivateSuffix = false`).
142 | 
143 | ```go
144 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
145 | url := "https://google.blogspot.com"
146 | res, _ := extractor.Extract(fasttld.URLParams{URL: url})
147 | ```
148 | 
149 | | Scheme   | UserInfo | SubDomain | Domain   | Suffix | RegisteredDomain | Port | Path | HostType     |
150 | |----------|----------|-----------|----------|--------|------------------|------|------|--------------|
151 | | https:// |          | google    | blogspot | com    | blogspot.com     |      |      | hostname     |
152 | 
153 | You can _include_ private domains by setting `IncludePrivateSuffix = true`.
154 | 
155 | ```go
156 | extractor, _ := fasttld.New(fasttld.SuffixListParams{IncludePrivateSuffix: true})
157 | url := "https://google.blogspot.com"
158 | res, _ := extractor.Extract(fasttld.URLParams{URL: url})
159 | ```
160 | 
161 | | Scheme   | UserInfo | SubDomain | Domain | Suffix       | RegisteredDomain    | Port | Path | HostType     |
162 | |----------|----------|-----------|--------|--------------|---------------------|------|------|--------------|
163 | | https:// |          |           | google | blogspot.com | google.blogspot.com |      |      | hostname     |
164 | 
165 | ## Extraction options
166 | 
167 | ### Ignore Subdomains
168 | 
169 | You can ignore subdomains by setting `IgnoreSubDomains = true`. By default, subdomains are extracted.
170 | 
171 | ```go
172 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
173 | url := "https://maps.google.com"
174 | res, _ := extractor.Extract(fasttld.URLParams{URL: url, IgnoreSubDomains: true})
175 | ```
176 | 
177 | | Scheme   | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType     |
178 | |----------|----------|-----------|--------|--------|------------------|------|------|--------------|
179 | | https:// |          |           | google | com    | google.com       |      |      | hostname     |
180 | 
181 | ### Punycode
182 | 
183 | By default, internationalised URLs are not converted to punycode before extraction.
184 | 
185 | ```go
186 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
187 | url := "https://hello.世界.com"
188 | res, _ := extractor.Extract(fasttld.URLParams{URL: url})
189 | ```
190 | 
191 | | Scheme   | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType     |
192 | |----------|----------|-----------|--------|--------|------------------|------|------|--------------|
193 | | https:// |          | hello     | 世界   | com    | 世界.com         |      |      | hostname     |
194 | 
195 | You can convert internationalised URLs to [punycode](https://en.wikipedia.org/wiki/Punycode) before extraction by setting `ConvertURLToPunyCode = true`.
196 | 
197 | ```go
198 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
199 | url := "https://hello.世界.com"
200 | res, _ := extractor.Extract(fasttld.URLParams{URL: url, ConvertURLToPunyCode: true})
201 | ```
202 | 
203 | | Scheme   | UserInfo | SubDomain | Domain      | Suffix | RegisteredDomain | Port | Path | HostType     |
204 | |----------|----------|-----------|-------------|--------|------------------|------|------|--------------|
205 | | https:// |          | hello     | xn--rhqv96g | com    | xn--rhqv96g.com  |      |      | hostname     |
206 | 
207 | ## Parsing errors
208 | 
209 | If the URL is invalid, the second value returned by `Extract()`, **error**, will be non-nil. Partially extracted subcomponents can still be retrieved from the first value returned, **ExtractResult**.
210 | 
211 | ```go
212 | extractor, _ := fasttld.New(fasttld.SuffixListParams{})
213 | url := "https://example!.com" // invalid characters in hostname
214 | color.New().Println("The following line should be an error message")
215 | if res, err := extractor.Extract(fasttld.URLParams{URL: url}); err != nil {
216 |     color.New(color.FgHiRed, color.Bold).Print("Error: ")
217 |     color.New(color.FgHiWhite).Println(err)
218 | }
219 | fasttld.PrintRes(url, res) // Partially extracted subcomponents can still be retrieved
220 | ```
221 | 
222 | | Scheme   | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType |
223 | |----------|----------|-----------|--------|--------|------------------|------|------|----------|
224 | | https:// |          |           |        |        |                  |      |      |          |
225 | 
226 | ## Testing
227 | 
228 | ```sh
229 | # `git clone` and `cd` to the go-fasttld repository folder first
230 | make tests
231 | 
232 | # Alternatively, run tests without race detection
233 | # Useful for systems that do not support the -race flag like windows/386
234 | # See https://tip.golang.org/src/cmd/dist/test.go
235 | make tests_without_race
236 | ```
237 | 
238 | ## Benchmarks
239 | 
240 | ```sh
241 | # `git clone` and `cd` to the go-fasttld repository folder first
242 | make bench
243 | ```
244 | 
245 | ### Modules used
246 | 
247 | | Benchmark Name       | Source                           |
248 | |----------------------|----------------------------------|
249 | | GoFastTld            | go-fasttld (this module)         |
250 | | JPilloraGoTld        | github.com/jpillora/go-tld       |
251 | | JoeGuoTldExtract     | github.com/joeguo/tldextract     |
252 | | Mjd2021USATldExtract | github.com/mjd2021usa/tldextract |
253 | 
254 | ### Results
255 | 
256 | Benchmarks performed on AMD Ryzen 7 5800X, Manjaro Linux.
257 | 
258 | **go-fasttld** performs especially well on longer URLs.
259 | 
260 | ---
261 | 
262 | #### #1
263 | 
264 | <code>https://iupac.org/iupac-announces-the-2021-top-ten-emerging-technologies-in-chemistry/</code>
265 | 
266 | | Benchmark Name       | Iterations | ns/op       | B/op     | allocs/op   | Fastest            |
267 | |----------------------|------------|-------------|----------|-------------|--------------------|
268 | | GoFastTld            | 8037906    | 150.8 ns/op | 0 B/op   | 0 allocs/op | :heavy_check_mark: |
269 | | JPilloraGoTld        | 1675113    | 716.1 ns/op | 224 B/op | 2 allocs/op |                    |
270 | | JoeGuoTldExtract     | 2204854    | 515.1 ns/op | 272 B/op | 5 allocs/op |                    |
271 | | Mjd2021USATldExtract | 1676722    | 712.0 ns/op | 288 B/op | 6 allocs/op |                    |
272 | 
273 | ---
274 | 
275 | #### #2
276 | 
277 | <code>https://www.google.com/maps/dir/Parliament+Place,+Parliament+House+Of+Singapore,+Singapore/Parliament+St,+London,+UK/@25.2440033,33.6721455,4z/data=!3m1!4b1!4m14!4m13!1m5!1m1!1s0x31da19a0abd4d71d:0xeda26636dc4ea1dc!2m2!1d103.8504863!2d1.2891543!1m5!1m1!1s0x487604c5aaa7da5b:0xf13a2197d7e7dd26!2m2!1d-0.1260826!2d51.5017061!3e4</code>
278 | 
279 | | Benchmark Name       | Iterations | ns/op       | B/op      | allocs/op   | Fastest            |
280 | |----------------------|------------|-------------|-----------|-------------|--------------------|
281 | | GoFastTld            | 6381516    | 181.9 ns/op | 0 B/op    | 0 allocs/op | :heavy_check_mark: |
282 | | JPilloraGoTld        | 431671     | 2603 ns/op  | 928 B/op  | 4 allocs/op |                    |
283 | | JoeGuoTldExtract     | 893347     | 1176 ns/op  | 1120 B/op | 6 allocs/op |                    |
284 | | Mjd2021USATldExtract | 1030250    | 1165 ns/op  | 1120 B/op | 6 allocs/op |                    |
285 | 
286 | ---
287 | 
288 | #### #3
289 | 
290 | <code>https://a.b.c.d.e.f.g.h.i.j.k.l.m.n.oo.pp.qqq.rrrr.ssssss.tttttttt.uuuuuuuuuuu.vvvvvvvvvvvvvvv.wwwwwwwwwwwwwwwwwwwwww.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy.zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.cc</code>
291 | 
292 | | Benchmark Name       | Iterations | ns/op      | B/op      | allocs/op   | Fastest            |
293 | |----------------------|------------|------------|-----------|-------------|--------------------|
294 | | GoFastTld            | 833682     | 1424 ns/op | 0 B/op    | 0 allocs/op | :heavy_check_mark: |
295 | | JPilloraGoTld        | 734790     | 1640 ns/op | 304 B/op  | 3 allocs/op |                    |
296 | | JoeGuoTldExtract     | 695475     | 1452 ns/op | 1040 B/op | 5 allocs/op |                    |
297 | | Mjd2021USATldExtract | 330717     | 3628 ns/op | 1904 B/op | 8 allocs/op |                    |
298 | 
299 | ---
300 | 
301 | ## Implementation details
302 | 
303 | ### Why not split on "." and take the last element instead?
304 | 
305 | Splitting on "." and taking the last element works only for single-label eTLDs like `com`; it fails for multi-label eTLDs like `oseto.nagasaki.jp`, where it would return just `jp`.
306 | 
307 | ### eTLD tries
308 | 
309 | ![Trie](Trie_example.svg)
310 | 
311 | **go-fasttld** stores eTLDs in [compressed tries](https://en.wikipedia.org/wiki/Trie).
312 | 
313 | Valid eTLDs from the [Mozilla Public Suffix List](http://www.publicsuffix.org) are inserted into the compressed trie label by label, in reverse order (rightmost label first).
314 | 
315 | ```sh
316 | Given the following eTLDs
317 | au
318 | nsw.edu.au
319 | com.ac
320 | edu.ac
321 | gov.ac
322 | 
323 | and the example URL host `example.nsw.edu.au`
324 | 
325 | The compressed trie will be structured as follows:
326 | 
327 | START
328 |  ╠═ au 🚩 ✅
329 |  ║  ╚═ edu ✅
330 |  ║     ╚═ nsw 🚩 ✅
331 |  ╚═ ac
332 |     ╠═ com 🚩
333 |     ╠═ edu 🚩
334 |     ╚═ gov 🚩
335 | 
336 | === Symbol meanings ===
337 | 🚩 : path to this node is a valid eTLD
338 | ✅ : path to this node found in example URL host `example.nsw.edu.au`
339 | ```
340 | 
341 | The URL host subcomponents are parsed from right to left until no more matching nodes can be found. In this example, the path of matching nodes is `au -> edu -> nsw`. Reversing the nodes gives the extracted eTLD `nsw.edu.au`.
342 | 
343 | ## Acknowledgements
344 | 
345 | This module is a port of the Python [fasttld](https://github.com/jophy/fasttld) module, with additional modifications to support extraction of subcomponents from full URLs, IPv4 addresses, and IPv6 addresses.
346 | 
347 | - [fasttld (Python)](https://github.com/jophy/fasttld)
348 | - [tldextract (Python)](https://github.com/john-kurkowski/tldextract)
349 | - [ICANN IDN Character Validation Guidance](https://www.icann.org/resources/pages/idna-protocol-2012-02-25-en)
350 | - [IETF RFC 2396](https://www.ietf.org/rfc/rfc2396.txt)
351 | - [IETF RFC 3490](https://www.ietf.org/rfc/rfc3490.txt)
352 | - [IETF RFC 3986](https://www.ietf.org/rfc/rfc3986.txt)
353 | - [IETF RFC 6874](https://www.ietf.org/rfc/rfc6874.txt)
354 | 


--------------------------------------------------------------------------------
/Trie_example.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!-- Created with Inkscape (http://www.inkscape.org/) -->
  3 | 
  4 | <svg
  5 |    version="1.0"
  6 |    width="400"
  7 |    height="375"
  8 |    id="svg2"
  9 |    sodipodi:docname="Trie_example.svg"
 10 |    inkscape:version="1.1.1 (3bf5ae0d25, 2021-09-20)"
 11 |    xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
 12 |    xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
 13 |    xmlns:xlink="http://www.w3.org/1999/xlink"
 14 |    xmlns="http://www.w3.org/2000/svg"
 15 |    xmlns:svg="http://www.w3.org/2000/svg">
 16 |   <sodipodi:namedview
 17 |      id="namedview92"
 18 |      pagecolor="#ffffff"
 19 |      bordercolor="#666666"
 20 |      borderopacity="1.0"
 21 |      inkscape:pageshadow="2"
 22 |      inkscape:pageopacity="0.0"
 23 |      inkscape:pagecheckerboard="0"
 24 |      showgrid="false"
 25 |      inkscape:zoom="1.5146667"
 26 |      inkscape:cx="148.87764"
 27 |      inkscape:cy="170.66461"
 28 |      inkscape:window-width="1827"
 29 |      inkscape:window-height="1099"
 30 |      inkscape:window-x="269"
 31 |      inkscape:window-y="166"
 32 |      inkscape:window-maximized="0"
 33 |      inkscape:current-layer="svg2" />
 34 |   <defs
 35 |      id="defs5">
 36 |     <marker
 37 |        refX="0"
 38 |        refY="0"
 39 |        orient="auto"
 40 |        id="TriangleOutM"
 41 |        style="overflow:visible">
 42 |       <path
 43 |          d="M 5.77,0 L -2.88,5 L -2.88,-5 L 5.77,0 z"
 44 |          transform="scale(0.4,0.4)"
 45 |          id="path3383"
 46 |          style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none" />
 47 |     </marker>
 48 |     <marker
 49 |        refX="0"
 50 |        refY="0"
 51 |        orient="auto"
 52 |        id="TriangleOutL"
 53 |        style="overflow:visible">
 54 |       <path
 55 |          d="M 5.77,0 L -2.88,5 L -2.88,-5 L 5.77,0 z"
 56 |          transform="scale(0.8,0.8)"
 57 |          id="path3380"
 58 |          style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;marker-start:none" />
 59 |     </marker>
 60 |   </defs>
 61 |   <path
 62 |      d="M 225,37 A 26,26 0 1 1 173,37 A 26,26 0 1 1 225,37 z"
 63 |      transform="translate(1,0)"
 64 |      id="path1891"
 65 |      style="opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:1.875;stroke-linecap:butt;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" />
 66 |   <use
 67 |      transform="translate(-100,90)"
 68 |      id="use2779"
 69 |      x="0"
 70 |      y="0"
 71 |      width="400"
 72 |      height="375"
 73 |      xlink:href="#path1891" />
 74 |   <use
 75 |      transform="translate(-40,180)"
 76 |      id="use2781"
 77 |      x="0"
 78 |      y="0"
 79 |      width="400"
 80 |      height="375"
 81 |      xlink:href="#path1891" />
 82 |   <use
 83 |      transform="translate(30,272)"
 84 |      id="use2783"
 85 |      x="0"
 86 |      y="0"
 87 |      width="400"
 88 |      height="375"
 89 |      xlink:href="#path1891" />
 90 |   <use
 91 |      transform="translate(100,90)"
 92 |      id="use2785"
 93 |      x="0"
 94 |      y="0"
 95 |      width="400"
 96 |      height="375"
 97 |      xlink:href="#path1891" />
 98 |   <use
 99 |      transform="translate(160,180)"
100 |      id="use2787"
101 |      x="0"
102 |      y="0"
103 |      width="400"
104 |      height="375"
105 |      xlink:href="#path1891" />
106 |   <use
107 |      transform="translate(-160,180)"
108 |      id="use2789"
109 |      x="0"
110 |      y="0"
111 |      width="400"
112 |      height="375"
113 |      xlink:href="#path1891" />
114 |   <use
115 |      transform="translate(-112,272.0005)"
116 |      id="use2791"
117 |      x="0"
118 |      y="0"
119 |      width="400"
120 |      height="375"
121 |      xlink:href="#path1891" />
122 |   <use
123 |      transform="translate(105,278)"
124 |      id="use2793"
125 |      x="0"
126 |      y="0"
127 |      width="400"
128 |      height="375"
129 |      xlink:href="#path1891" />
130 |   <text
131 |      x="262"
132 |      y="77"
133 |      id="text2803"
134 |      xml:space="preserve"
135 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
136 |        x="262"
137 |        y="77"
138 |        id="tspan2805">i</tspan></text>
139 |   <text
140 |      x="140"
141 |      y="77"
142 |      id="text2807"
143 |      xml:space="preserve"
144 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
145 |        x="140"
146 |        y="77"
147 |        id="tspan2809">t</tspan></text>
148 |   <text
149 |      x="141"
150 |      y="171"
151 |      id="text2811"
152 |      xml:space="preserve"
153 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
154 |        x="141"
155 |        y="171"
156 |        id="tspan2813">e</tspan></text>
157 |   <text
158 |      x="56"
159 |      y="171"
160 |      id="text2815"
161 |      xml:space="preserve"
162 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
163 |        x="56"
164 |        y="171"
165 |        id="tspan2817">o</tspan></text>
166 |   <text
167 |      x="343"
168 |      y="171"
169 |      id="text2819"
170 |      xml:space="preserve"
171 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
172 |        x="343"
173 |        y="171"
174 |        id="tspan2821">n</tspan></text>
175 |   <text
176 |      x="318"
177 |      y="273"
178 |      id="text2823"
179 |      xml:space="preserve"
180 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
181 |        x="318"
182 |        y="273"
183 |        id="tspan2825">n</tspan></text>
184 |   <text
185 |      x="218"
186 |      y="265"
187 |      id="text2827"
188 |      xml:space="preserve"
189 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
190 |        x="218"
191 |        y="265"
192 |        id="tspan2829">n</tspan></text>
193 |   <text
194 |      x="100"
195 |      y="265"
196 |      id="text2831"
197 |      xml:space="preserve"
198 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
199 |        x="100"
200 |        y="265"
201 |        id="tspan2833">a</tspan></text>
202 |   <text
203 |      x="99.900391"
204 |      y="134.66406"
205 |      id="text2835"
206 |      xml:space="preserve"
207 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
208 |        x="99.900391"
209 |        y="134.66406"
210 |        id="tspan2837">t</tspan></text>
211 |   <text
212 |      x="300.01172"
213 |      y="135.69531"
214 |      id="text2839"
215 |      xml:space="preserve"
216 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
217 |        x="300.01172"
218 |        y="135.69531"
219 |        id="tspan2841">i</tspan></text>
220 |   <text
221 |      x="359.98242"
222 |      y="225.69531"
223 |      id="text2843"
224 |      xml:space="preserve"
225 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
226 |        x="359.98242"
227 |        y="225.69531"
228 |        id="tspan2845">in</tspan></text>
229 |   <text
230 |      x="304.98242"
231 |      y="323.69531"
232 |      id="text2847"
233 |      xml:space="preserve"
234 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
235 |        x="304.98242"
236 |        y="323.69531"
237 |        id="tspan2849">inn</tspan></text>
238 |   <text
239 |      x="160.35742"
240 |      y="224.64062"
241 |      id="text2851"
242 |      xml:space="preserve"
243 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
244 |        x="160.35742"
245 |        y="224.64062"
246 |        id="tspan2853">te</tspan></text>
247 |   <text
248 |      x="87.824219"
249 |      y="316.64111"
250 |      id="text2855"
251 |      xml:space="preserve"
252 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
253 |        x="87.824219"
254 |        y="316.64111"
255 |        id="tspan2857">tea</tspan></text>
256 |   <text
257 |      x="230.60352"
258 |      y="316.64062"
259 |      id="text2859"
260 |      xml:space="preserve"
261 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
262 |        x="230.60352"
263 |        y="316.64062"
264 |        id="tspan2861">ten</tspan></text>
265 |   <text
266 |      x="40.328125"
267 |      y="224.64062"
268 |      id="text2863"
269 |      xml:space="preserve"
270 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
271 |        x="40.328125"
272 |        y="224.64062"
273 |        id="tspan2865">to</tspan></text>
274 |   <text
275 |      x="90.054688"
276 |      y="359.02148"
277 |      id="text2867"
278 |      xml:space="preserve"
279 |      style="font-size:28px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000094;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
280 |        x="90.054688"
281 |        y="359.02148"
282 |        id="tspan2869">3</tspan></text>
283 |   <text
284 |      x="225.2207"
285 |      y="359.02148"
286 |      id="text2871"
287 |      xml:space="preserve"
288 |      style="font-size:28px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000094;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
289 |        x="225.2207"
290 |        y="359.02148"
291 |        id="tspan2873">12</tspan></text>
292 |   <text
293 |      x="322.04102"
294 |      y="361.02148"
295 |      id="text2875"
296 |      xml:space="preserve"
297 |      style="font-size:28px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000094;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
298 |        x="322.04102"
299 |        y="361.02148"
300 |        id="tspan2877">9</tspan></text>
301 |   <text
302 |      x="39.979492"
303 |      y="269.02148"
304 |      id="text2879"
305 |      xml:space="preserve"
306 |      style="font-size:28px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000094;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
307 |        x="39.979492"
308 |        y="269.02148"
309 |        id="tspan2881">7</tspan></text>
310 |   <text
311 |      x="358.04102"
312 |      y="269.02148"
313 |      id="text2883"
314 |      xml:space="preserve"
315 |      style="font-size:28px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000094;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
316 |        x="358.04102"
317 |        y="269.02148"
318 |        id="tspan2885">5</tspan></text>
319 |   <text
320 |      x="299.98633"
321 |      y="179.02148"
322 |      id="text2889"
323 |      xml:space="preserve"
324 |      style="font-size:28px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000094;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
325 |        x="299.98633"
326 |        y="179.02148"
327 |        id="tspan2891">11</tspan></text>
328 |   <path
329 |      d="M 175.36581,55.643928 L 126.46251,105.26464"
330 |      id="path2893"
331 |      style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.59399998;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
332 |   <path
333 |      d="M 124.36133,107.4679 L 126.57751,98.578283 L 133.25094,105.25172 L 124.36133,107.4679 z"
334 |      id="path2895"
335 |      style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.27539158px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
336 |   <g
337 |      transform="matrix(-0.901838,0.901838,0.901838,0.901838,58.5094,-17.26388)"
338 |      id="use2901">
339 |     <path
340 |        d="M -51.618795,132.46237 L -51.221046,187.08638"
341 |        id="path2450"
342 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
343 |     <path
344 |        d="M -51.164452,189.47286 L -54.864361,183.31555 L -47.464542,183.31555 L -51.164452,189.47286 z"
345 |        id="path2452"
346 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
347 |   </g>
348 |   <g
349 |      transform="matrix(0.7268237,0.4961859,-0.4961859,0.7268237,187.09824,81.053123)"
350 |      id="use2909">
351 |     <path
352 |        d="M -51.618795,132.46237 L -51.221046,187.08638"
353 |        id="path2456"
354 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
355 |     <path
356 |        d="M -51.164452,189.47286 L -54.864361,183.31555 L -47.464542,183.31555 L -51.164452,189.47286 z"
357 |        id="path2458"
358 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
359 |   </g>
360 |   <g
361 |      transform="matrix(-0.7268237,0.4961859,0.4961859,0.7268237,12.76326,81.053123)"
362 |      id="use2911">
363 |     <path
364 |        d="m -51.618795,132.46237 0.397749,54.62401"
365 |        id="path951"
366 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
367 |     <path
368 |        d="m -51.164452,189.47286 -3.699909,-6.15731 h 7.399819 z"
369 |        id="path953"
370 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
371 |   </g>
372 |   <g
373 |      transform="matrix(-0.7268237,0.4961859,0.4961859,0.7268237,212.05616,81.053123)"
374 |      id="use2913">
375 |     <path
376 |        d="m -51.618795,132.46237 0.397749,54.62401"
377 |        id="path945"
378 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
379 |     <path
380 |        d="m -51.164452,189.47286 -3.699909,-6.15731 h 7.399819 z"
381 |        id="path947"
382 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
383 |   </g>
384 |   <g
385 |      transform="matrix(0.88210682,0.57070657,-0.60219413,0.83598317,261.83771,154.85943)"
386 |      id="use2915">
387 |     <path
388 |        d="m -51.618795,132.46237 0.397749,54.62401"
389 |        id="path933"
390 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
391 |     <path
392 |        d="m -51.164452,189.47286 -3.699909,-6.15731 h 7.399819 z"
393 |        id="path935"
394 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
395 |   </g>
396 |   <g
397 |      transform="matrix(0.88210682,0.57070657,-0.60219413,0.83598317,476.83771,160.85943)"
398 |      id="use2917">
399 |     <path
400 |        d="m -51.618795,132.46237 0.397749,54.62401"
401 |        id="path927"
402 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
403 |     <path
404 |        d="m -51.164452,189.47286 -3.699909,-6.15731 h 7.399819 z"
405 |        id="path929"
406 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
407 |   </g>
408 |   <use
409 |      transform="translate(-40.0005,284.0005)"
410 |      id="use2442"
411 |      x="0"
412 |      y="0"
413 |      width="400"
414 |      height="375"
415 |      xlink:href="#path1891" />
416 |   <g
417 |      transform="matrix(-0.88210682,0.57070657,0.60219413,0.83598317,57.43439,154.85943)"
418 |      id="use2444">
419 |     <path
420 |        d="m -51.618795,132.46237 0.397749,54.62401"
421 |        id="path921"
422 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
423 |     <path
424 |        d="m -51.164452,189.47286 -3.699909,-6.15731 h 7.399819 z"
425 |        id="path923"
426 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
427 |   </g>
428 |   <g
429 |      transform="matrix(1.1967009,1.5299249e-2,-2.1458364e-2,0.8532162,224.55253,132.02506)"
430 |      id="g4537">
431 |     <path
432 |        d="M -51.618795,132.46237 L -51.221046,187.08638"
433 |        id="path4539"
434 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.08848143;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
435 |     <path
436 |        d="M -51.164452,189.47286 L -54.864361,183.31555 L -47.464542,183.31555 L -51.164452,189.47286 z"
437 |        id="path4541"
438 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.87078512px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
439 |   </g>
440 |   <text
441 |      x="160.59715"
442 |      y="329.57861"
443 |      id="text4543"
444 |      xml:space="preserve"
445 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
446 |        x="160.59715"
447 |        y="329.57861"
448 |        id="tspan4545">ted</tspan></text>
449 |   <text
450 |      x="149.49609"
451 |      y="268.10455"
452 |      id="text4547"
453 |      xml:space="preserve"
454 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
455 |        x="149.49609"
456 |        y="268.10455"
457 |        id="tspan4549">d</tspan></text>
458 |   <text
459 |      x="188.51068"
460 |      y="87.989876"
461 |      id="text4553"
462 |      xml:space="preserve"
463 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
464 |        x="188.51068"
465 |        y="87.989876"
466 |        id="tspan4555">A</tspan></text>
467 |   <g
468 |      transform="matrix(1.5157767,0.01937848,-0.0271798,1.0807088,282.53636,-79.781334)"
469 |      id="use4559">
470 |     <path
471 |        d="m -51.618795,132.46237 0.397749,54.62401"
472 |        id="path939"
473 |        style="fill:none;fill-opacity:0.75;fill-rule:evenodd;stroke:#000000;stroke-width:1.08848;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
474 |     <path
475 |        d="m -51.164452,189.47286 -3.699909,-6.15731 h 7.399819 z"
476 |        id="path941"
477 |        style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.870785px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1" />
478 |   </g>
479 |   <use
480 |      transform="translate(0,118)"
481 |      id="use4561"
482 |      x="0"
483 |      y="0"
484 |      width="400"
485 |      height="375"
486 |      xlink:href="#path1891" />
487 |   <text
488 |      x="200.01172"
489 |      y="163.25586"
490 |      id="text4563"
491 |      xml:space="preserve"
492 |      style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
493 |        x="200.01172"
494 |        y="163.25586"
495 |        id="tspan4565">A</tspan></text>
496 |   <text
497 |      x="228.30049"
498 |      y="200.07301"
499 |      id="text4587"
500 |      xml:space="preserve"
501 |      style="font-size:28px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000094;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
502 |        x="228.30049"
503 |        y="200.07301"
504 |        id="tspan4589">15</tspan></text>
505 |   <text
506 |      x="174.62402"
507 |      y="366.24948"
508 |      id="text4591"
509 |      xml:space="preserve"
510 |      style="font-size:28px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000094;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;font-family:Arial"><tspan
511 |        x="174.62402"
512 |        y="366.24948"
513 |        id="tspan4593">4</tspan></text>
514 | </svg>
515 | 


--------------------------------------------------------------------------------
/benchmark_test.go:
--------------------------------------------------------------------------------
 1 | package fasttld
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"testing"
 6 | 
 7 | 	"github.com/fatih/color"
 8 | 	joeguotldextract "github.com/joeguo/tldextract"
 9 | 	tld "github.com/jpillora/go-tld"
10 | 	mjd2021usatldextract "github.com/mjd2021usa/tldextract"
11 | )
12 | 
13 | func BenchmarkComparison(b *testing.B) {
14 | 	var benchmarkURLs = []string{
15 | 		"https://iupac.org/iupac-announces-the-2021-top-ten-emerging-technologies-in-chemistry/",
16 | 		"https://www.google.com/maps/dir/Parliament+Place,+Parliament+House+Of+Singapore,+" +
17 | 			"Singapore/Parliament+St,+London,+UK/@25.2440033,33.6721455,4z/data=!3m1!4b1!4m14!4m13!1m5!1m1!1s0x31d" +
18 | 			"a19a0abd4d71d:0xeda26636dc4ea1dc!2m2!1d103.8504863!2d1.2891543!1m5!1m1!1s0x487604c5aaa7da5b:0xf13a2" +
19 | 			"197d7e7dd26!2m2!1d-0.1260826!2d51.5017061!3e4",
20 | 		"https://a.b.c.d.e.f.g.h.i.j.k.l.m.n.oo.pp.qqq.rrrr.ssssss.tttttttt.uuuuuuuuuuu.vvvvvvvvvvvvvvv.wwwwwwwwwwwwwwwwwwwwww.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy.zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.cc",
21 | 	}
22 | 
23 | 	benchmarks := []struct {
24 | 		name string
25 | 	}{
26 | 		{"GoFastTld"},            // this module
27 | 		{"JPilloraGoTld"},        // github.com/jpillora/go-tld
28 | 		{"JoeGuoTldExtract"},     // github.com/joeguo/tldextract
29 | 		{"Mjd2021USATldExtract"}, // github.com/mjd2021usa/tldextract
30 | 	}
31 | 
32 | 	cache := "/tmp/tld.cache"
33 | 
34 | 	for _, benchmarkURL := range benchmarkURLs {
35 | 		for _, bm := range benchmarks {
36 | 			if bm.name == "GoFastTld" {
37 | 				testPSLFilePath, _ := getTestPSLFilePath()
38 | 				GoFastTld, _ := New(SuffixListParams{
39 | 					CacheFilePath:        testPSLFilePath,
40 | 					IncludePrivateSuffix: false,
41 | 				})
42 | 				b.Run(fmt.Sprint(bm.name), func(b *testing.B) {
43 | 					for i := 0; i < b.N; i++ {
44 | 						GoFastTld.Extract(URLParams{URL: benchmarkURL})
45 | 					}
46 | 				})
47 | 			} else if bm.name == "JPilloraGoTld" {
48 | 				// Provides the Port and Path subcomponents
49 | 				// Cannot handle "+://google.com" and IP addresses
50 | 				// Cannot handle urls without Scheme subcomponent
51 | 				// Cannot handle trailing whitespace
52 | 				b.Run(fmt.Sprint(bm.name), func(b *testing.B) {
53 | 					for i := 0; i < b.N; i++ {
54 | 						tld.Parse(benchmarkURL)
55 | 					}
56 | 				})
57 | 			} else if bm.name == "JoeGuoTldExtract" {
58 | 				JoeGuoTldExtract, _ := joeguotldextract.New(cache, false)
59 | 				b.Run(fmt.Sprint(bm.name), func(b *testing.B) {
60 | 					for i := 0; i < b.N; i++ {
61 | 						JoeGuoTldExtract.Extract(benchmarkURL)
62 | 					}
63 | 				})
64 | 
65 | 			} else if bm.name == "Mjd2021USATldExtract" {
66 | 				Mjd2021USATldExtract, _ := mjd2021usatldextract.New(cache, false)
67 | 				b.Run(fmt.Sprint(bm.name), func(b *testing.B) {
68 | 					for i := 0; i < b.N; i++ {
69 | 						Mjd2021USATldExtract.Extract(benchmarkURL)
70 | 					}
71 | 				})
72 | 			}
73 | 		}
74 | 		color.New().Println()
75 | 		color.New(color.FgHiGreen, color.Bold).Print("Benchmarks completed for URL : ")
76 | 		color.New(color.FgHiBlue).Println(benchmarkURL)
77 | 		color.New(color.FgHiWhite).Println("=======")
78 | 	}
79 | }
80 | 
81 | /*
82 | 
83 | Omitted modules
84 | 
85 | github.com/M507/tlde | Almost exactly the same as github.com/joeguo/tldextract
86 | 
87 | github.com/ImVexed/fasturl | Fast, but cannot extract eTLDs
88 | 
89 | github.com/weppos/publicsuffix-go | Cannot handle full URLs with scheme (i.e. https:// ftp:// etc.)
90 | 
91 | github.com/forease/gotld | Does not extract subdomain properly and cannot handle ip addresses
92 | 
93 | */
94 | 


--------------------------------------------------------------------------------
/cmd/fasttld/extract.go:
--------------------------------------------------------------------------------
 1 | package fasttld
 2 | 
 3 | import (
 4 | 	"log"
 5 | 
 6 | 	"github.com/elliotwutingfeng/go-fasttld"
 7 | 	"github.com/fatih/color"
 8 | 	"github.com/spf13/cobra"
 9 | )
10 | 
11 | var includePrivateSuffix, ignoreSubDomains, toPunyCode bool
12 | 
13 | var extractCmd = &cobra.Command{
14 | 	Use:     "extract",
15 | 	Aliases: []string{"ext"},
16 | 	Short:   "Extracts subcomponents from a URL.",
17 | 	Long: `Extracts subcomponents from a URL.
18 | 
19 | For Example
20 | ---
21 | fasttld extract abc.example.com:5000/a/path
22 | ---
23 | 	`,
24 | 	Args: cobra.ExactArgs(1),
25 | 	Run: func(cmd *cobra.Command, args []string) {
26 | 		extractor, err := fasttld.New(fasttld.SuffixListParams{IncludePrivateSuffix: includePrivateSuffix})
27 | 		if err != nil {
28 | 			log.Fatal(err)
29 | 		}
30 | 		res, err := extractor.Extract(fasttld.URLParams{URL: args[0], IgnoreSubDomains: ignoreSubDomains, ConvertURLToPunyCode: toPunyCode})
31 | 		if err != nil {
32 | 			color.New(color.FgHiRed, color.Bold).Print("Error: ")
33 | 			color.New(color.FgHiWhite).Println(err)
34 | 		}
35 | 		fasttld.PrintRes(args[0], res)
36 | 	},
37 | }
38 | 
39 | func init() {
40 | 	extractCmd.Flags().BoolVarP(&includePrivateSuffix, "private-suffix", "p", false, "Include private suffix")
41 | 	extractCmd.Flags().BoolVarP(&ignoreSubDomains, "ignore-subdomains", "i", false, "Ignore subdomains")
42 | 	extractCmd.Flags().BoolVarP(&toPunyCode, "to-punycode", "t", false, "Convert to punycode")
43 | 	rootCmd.AddCommand(extractCmd)
44 | }
45 | 


--------------------------------------------------------------------------------
/cmd/fasttld/root.go:
--------------------------------------------------------------------------------
 1 | package fasttld
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 
 7 | 	"github.com/spf13/cobra"
 8 | )
 9 | 
10 | var version string = ""
11 | 
12 | var rootCmd = &cobra.Command{
13 | 	Use:     "fasttld",
14 | 	Version: version,
15 | 	Short:   `fasttld is a high performance effective top level domains (eTLD) extraction module.`,
16 | 	Long:    `fasttld is a high performance effective top level domains (eTLD) extraction module.`,
17 | 	Run:     func(cmd *cobra.Command, args []string) {},
18 | }
19 | 
20 | // Execute runs the cobra.Command CLI
21 | func Execute() {
22 | 	if err := rootCmd.Execute(); err != nil {
23 | 		fmt.Fprintf(os.Stderr, "Whoops. There was an error while executing your CLI '%s'", err)
24 | 		os.Exit(1)
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/cmd/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import "github.com/elliotwutingfeng/go-fasttld/cmd/fasttld"
4 | 
// main is the entry point of the fasttld command-line tool.
// All work is delegated to fasttld.Execute.
func main() {
	fasttld.Execute()
}
9 | 


--------------------------------------------------------------------------------
/data/gen.go:
--------------------------------------------------------------------------------
 1 | // The following directive is necessary to make the package coherent:
 2 | 
 3 | //go:build ignore
 4 | // +build ignore
 5 | 
 6 | // This program generates fallback.go. It can be invoked by running
 7 | // go generate
 8 | 
 9 | //go:generate go run gen.go
10 | 
11 | package main
12 | 
13 | import (
14 | 	"log"
15 | 	"net/http"
16 | 	"os"
17 | 	"text/template"
18 | 	"time"
19 | 
20 | 	"github.com/spf13/afero"
21 | )
22 | 
23 | func main() {
24 | 	const url = "https://publicsuffix.org/list/public_suffix_list.dat"
25 | 
26 | 	rsp, err := http.Get(url)
27 | 	fail(err)
28 | 	defer rsp.Body.Close()
29 | 
30 | 	b, err := afero.ReadAll(rsp.Body)
31 | 	fail(err)
32 | 	content := string(b)
33 | 
34 | 	f, err := os.Create("../fallback.go")
35 | 	f.Seek(0, 0)
36 | 	fail(err)
37 | 	defer f.Close()
38 | 
39 | 	pslTemplate.Execute(f, struct {
40 | 		Timestamp time.Time
41 | 		URL       string
42 | 		Content   string
43 | 	}{
44 | 		Timestamp: time.Now(),
45 | 		URL:       url,
46 | 		Content:   content,
47 | 	})
48 | }
49 | 
// fail terminates the program through log.Fatal when err is non-nil and is a
// no-op otherwise.
func fail(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
55 | 
// pslTemplate renders the source text of the generated file: a package
// clause, a "Code generated" header recording .Timestamp and .URL, and a
// hardcodedPSL string constant whose value is .Content wrapped in a
// backquoted raw string literal.
//
// NOTE(review): .Content is inserted verbatim, so a backtick inside the
// downloaded list would end the raw string early — confirm the list never
// contains one.
var pslTemplate = template.Must(template.New("").Parse(`package fasttld

// Code generated by go generate; DO NOT EDIT.
// This file was generated by robots at
// {{ .Timestamp }}
// using data from
// {{ .URL }}

const hardcodedPSL string = ` + "`{{ .Content }}`\n"))
65 | 


--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elliotwutingfeng/go-fasttld/bcce76cf9926fe015be8ef1f5cb354bbbd9f5165/demo.gif


--------------------------------------------------------------------------------
/examples/demo.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"log"
 5 | 
 6 | 	"github.com/elliotwutingfeng/go-fasttld"
 7 | 	"github.com/fatih/color"
 8 | )
 9 | 
10 | func main() {
11 | 	var fontStyle = []color.Attribute{color.FgHiWhite, color.Bold}
12 | 
13 | 	// Hostname
14 | 	url := "https://user@a.subdomain.example.a%63.uk:5000/a/b?id=42"
15 | 
16 | 	extractor, err := fasttld.New(fasttld.SuffixListParams{})
17 | 	// or instead, specify a custom public suffix list file via SuffixListParams
18 | 
19 | 	if err != nil {
20 | 		log.Fatal(err)
21 | 	}
22 | 
23 | 	res, _ := extractor.Extract(fasttld.URLParams{URL: url})
24 | 	color.New(fontStyle...).Println("Hostname")
25 | 	fasttld.PrintRes(url, res)
26 | 
27 | 	// IPv4 Address
28 | 	url = "https://127.0.0.1:5000"
29 | 
30 | 	res, _ = extractor.Extract(fasttld.URLParams{URL: url})
31 | 	color.New(fontStyle...).Println("IPv4 Address")
32 | 	fasttld.PrintRes(url, res)
33 | 
34 | 	// IPv6 Address
35 | 	url = "https://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]:5000"
36 | 
37 | 	res, _ = extractor.Extract(fasttld.URLParams{URL: url})
38 | 	color.New(fontStyle...).Println("IPv6 Address")
39 | 	fasttld.PrintRes(url, res)
40 | 
41 | 	// Internationalised label separators
42 | 	url = "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk"
43 | 
44 | 	res, _ = extractor.Extract(fasttld.URLParams{URL: url})
45 | 	color.New(fontStyle...).Println("Internationalised label separators")
46 | 	fasttld.PrintRes(url, res)
47 | 
48 | 	// Manually update local cache
49 | 	if err := extractor.Update(); err != nil {
50 | 		log.Println(err)
51 | 	}
52 | 
53 | 	// Private domains
54 | 	url = "https://google.blogspot.com"
55 | 
56 | 	extractor, _ = fasttld.New(fasttld.SuffixListParams{})
57 | 	res, _ = extractor.Extract(fasttld.URLParams{URL: url})
58 | 	color.New(fontStyle...).Println("Exclude Private Domains")
59 | 	fasttld.PrintRes(url, res)
60 | 
61 | 	extractor, _ = fasttld.New(fasttld.SuffixListParams{IncludePrivateSuffix: true})
62 | 	res, _ = extractor.Extract(fasttld.URLParams{URL: url})
63 | 	color.New(fontStyle...).Println("Include Private Domains")
64 | 	fasttld.PrintRes(url, res)
65 | 
66 | 	// Ignore Subdomains
67 | 	url = "https://maps.google.com"
68 | 
69 | 	extractor, _ = fasttld.New(fasttld.SuffixListParams{})
70 | 	res, _ = extractor.Extract(fasttld.URLParams{URL: url, IgnoreSubDomains: true})
71 | 	color.New(fontStyle...).Println("Ignore Subdomains")
72 | 	fasttld.PrintRes(url, res)
73 | 
74 | 	// Punycode
75 | 	url = "https://hello.世界.com"
76 | 
77 | 	res, _ = extractor.Extract(fasttld.URLParams{URL: url})
78 | 	color.New(fontStyle...).Println("No Punycode")
79 | 	fasttld.PrintRes(url, res)
80 | 
81 | 	res, _ = extractor.Extract(fasttld.URLParams{URL: url, ConvertURLToPunyCode: true})
82 | 	color.New(fontStyle...).Println("Punycode")
83 | 	fasttld.PrintRes(url, res)
84 | 
85 | 	// Parsing errors
86 | 	url = "https://example!.com" // invalid characters in hostname
87 | 
88 | 	color.New(fontStyle...).Println("Parsing errors")
89 | 	color.New().Println("The following line should be an error message")
90 | 	if res, err = extractor.Extract(fasttld.URLParams{URL: url}); err != nil {
91 | 		color.New(color.FgHiRed, color.Bold).Print("Error: ")
92 | 		color.New(color.FgHiWhite).Println(err)
93 | 	}
94 | 	fasttld.PrintRes(url, res) // Partially extracted subcomponents can still be retrieved
95 | }
96 | 


--------------------------------------------------------------------------------
/fasttld.go:
--------------------------------------------------------------------------------
  1 | // Package fasttld is a high performance effective top level domains (eTLD)
  2 | // extraction module implemented with compressed tries.
  3 | //
  4 | // This module is a port of the Python fasttld module,
  5 | // with additional modifications to support extraction
  6 | // of subcomponents from full URLs, IPv4 addresses, and IPv6 addresses.
  7 | package fasttld
  8 | 
  9 | import (
 10 | 	"errors"
 11 | 	"log"
 12 | 	"net/url"
 13 | 	"strconv"
 14 | 	"strings"
 15 | 
 16 | 	"github.com/spf13/afero"
 17 | 	"github.com/tidwall/hashmap"
 18 | 	"golang.org/x/net/idna"
 19 | )
 20 | 
 21 | const defaultPSLFolder string = "data"
 22 | const defaultPSLFileName string = "public_suffix_list.dat"
 23 | const largestPortNumber int = 65535
 24 | const pslMaxAgeHours float64 = 72
 25 | 
// FastTLD provides the Extract() function, to extract
// URLs using tldTrie generated from the
// Public Suffix List file at cacheFilePath.
type FastTLD struct {
	// cacheFilePath is the path of the Public Suffix List file in use.
	cacheFilePath string
	// tldTrie is the root of the suffix trie walked by Extract.
	tldTrie *trie
	// includePrivateSuffix records whether private suffixes were requested.
	includePrivateSuffix bool
}

// HostType indicates whether parsed URL
// contains a HostName, IPv4 address, IPv6 address
// or none of them
type HostType int

// None, HostName, IPv4 and IPv6 indicate whether parsed URL
// contains a HostName, IPv4 address, IPv6 address
// or none of them.
//
// None is the zero value, so an ExtractResult whose host was never
// classified reports None.
const (
	None HostType = iota
	HostName
	IPv4
	IPv6
)
 49 | 
// ExtractResult contains components extracted from URL.
//
// Path holds everything following the host and optional port, i.e. the path,
// query and fragment taken together. For IPv4 and IPv6 hosts, Domain and
// RegisteredDomain both hold the address and Suffix is left empty.
type ExtractResult struct {
	Scheme, UserInfo, SubDomain, Domain, Suffix, RegisteredDomain, Port, Path string
	HostType                                                                  HostType
}

// SuffixListParams contains parameters for specifying path to Public Suffix List file and
// whether to extract private suffixes (e.g. blogspot.com).
type SuffixListParams struct {
	CacheFilePath        string
	IncludePrivateSuffix bool
}

// URLParams specifies URL to extract components from.
//
// If IgnoreSubDomains = true, do not extract SubDomain.
//
// If ConvertURLToPunyCode = true, convert non-ASCII characters like 世界 to punycode.
type URLParams struct {
	URL                  string
	IgnoreSubDomains     bool
	ConvertURLToPunyCode bool
}

// trie is a node of the compressed trie
// used to store Public Suffix List eTLDs.
type trie struct {
	// matches maps a label to the child node reached through that label.
	matches hashmap.Map[string, *trie]
	// end is true when the path from the root to this node is a complete suffix.
	end bool
}
 80 | 
 81 | // nestedDict stores a slice of keys in the trie, by traversing the trie using the keys as a "path",
 82 | // creating new tries for keys that do not exist yet.
 83 | //
 84 | // If a new path overlaps an existing path, flag the previous path's trie node as end = true.
 85 | func nestedDict(dic *trie, keys []string) {
 86 | 	for _, key := range keys {
 87 | 		if _, ok := dic.matches.Get(key); !ok {
 88 | 			// key doesn't exist; add new node
 89 | 			var m hashmap.Map[string, *trie]
 90 | 			dic.matches.Set(key, &trie{matches: m})
 91 | 		}
 92 | 		dic, _ = dic.matches.Get(key)
 93 | 	}
 94 | 	// set last node to end = true
 95 | 	dic.end = true
 96 | }
 97 | 
 98 | // trieConstruct constructs a compressed trie to store Public Suffix List eTLDs split at "." in reverse-order.
 99 | //
100 | // For example: "us.gov.pl" will be stored in the order {"pl", "gov", "us"}.
101 | func trieConstruct(includePrivateSuffix bool, cacheFilePath string) (*trie, error) {
102 | 	var m hashmap.Map[string, *trie]
103 | 	tldTrie := &trie{matches: m}
104 | 
105 | 	var suffixLists suffixes
106 | 	var err error
107 | 	if cacheFilePath != "" {
108 | 		suffixLists, err = getPublicSuffixList(cacheFilePath)
109 | 	} else {
110 | 		suffixLists, err = getHardcodedPublicSuffixList()
111 | 	}
112 | 
113 | 	if err != nil {
114 | 		log.Println(err)
115 | 		return tldTrie, err
116 | 	}
117 | 
118 | 	var suffixList []string
119 | 	if includePrivateSuffix {
120 | 		suffixList = suffixLists.allSuffixes
121 | 	} else {
122 | 		suffixList = suffixLists.publicSuffixes
123 | 	}
124 | 
125 | 	for _, suffix := range suffixList {
126 | 		sp := strings.Split(suffix, ".")
127 | 		reverse(sp)
128 | 		nestedDict(tldTrie, sp)
129 | 	}
130 | 
131 | 	tldTrie.matches.Scan(func(key string, value *trie) bool {
132 | 		if _, ok := value.matches.Get("*"); ok {
133 | 			value.end = true
134 | 		}
135 | 		return true
136 | 	})
137 | 
138 | 	return tldTrie, nil
139 | }
140 | 
// Extract splits e.URL into its subcomponents and returns them as an
// ExtractResult.
//
// Processing order: surrounding whitespace is trimmed, then the scheme and
// userinfo are peeled off, the end of the host is located, and the port and
// "path" (path, query and fragment together) are read from what follows the
// host. The host itself is classified as IPv6 (enclosed in square brackets),
// IPv4, or a hostname. For hostnames, labels are matched right-to-left
// against f.tldTrie to find the suffix, from which Domain, RegisteredDomain
// and (unless e.IgnoreSubDomains is set) SubDomain are derived.
//
// If e.ConvertURLToPunyCode is set, the percent-decoded host is passed
// through formatAsPunycode before matching.
//
// On error, the returned ExtractResult still carries whichever subcomponents
// were extracted before the failure was detected.
func (f *FastTLD) Extract(e URLParams) (ExtractResult, error) {
	urlParts := ExtractResult{}

	// Extract URL scheme
	netloc := fastTrim(e.URL, whitespaceRuneSet, trimBoth)
	if schemeEndIndex := getSchemeEndIndex(netloc); schemeEndIndex != -1 {
		urlParts.Scheme = netloc[0:schemeEndIndex]
		netloc = netloc[schemeEndIndex:]
	}

	// Extract URL userinfo
	if atIdx := indexLastByteBefore(netloc, '@', invalidUserInfoCharsSet); atIdx != -1 {
		urlParts.UserInfo = netloc[0:atIdx]
		netloc = netloc[atIdx+1:]
	}

	// Find square brackets (if any) and host end index
	openingSquareBracketIdx := -1
	closingSquareBracketIdx := -1
	hostEndIdx := -1

	for i, r := range []byte(netloc) {
		if r == '[' {
			// Check for opening square bracket
			if i > 0 {
				// Reject if opening square bracket is not first character of hostname
				return urlParts, errors.New("opening square bracket is not first character of hostname")
			}
			openingSquareBracketIdx = i
		}
		if r == ']' {
			// Check for closing square bracket
			closingSquareBracketIdx = i
		}

		if openingSquareBracketIdx == -1 {
			if closingSquareBracketIdx != -1 {
				// Reject if closing square bracket present but no opening square bracket
				return urlParts, errors.New("closing square bracket present but no opening square bracket")
			}
			if endOfHostDelimitersSet.contains(r) {
				// If no square brackets
				// Check for endOfHostDelimitersSet
				hostEndIdx = i
				break
			}
		} else if closingSquareBracketIdx > openingSquareBracketIdx && endOfHostWithPortDelimitersSet.contains(r) {
			// If opening + closing square bracket are present in correct order
			// check for endOfHostWithPortDelimitersSet
			hostEndIdx = i
			break
		}

		if i == len(netloc)-1 && closingSquareBracketIdx < openingSquareBracketIdx {
			// Reject if end of netloc reached but incomplete square bracket pair
			return urlParts, errors.New("incomplete square bracket pair")
		}
	}

	// With a closing bracket present, the host ends right after it; when the
	// bracket is the final character there is nothing after the host at all.
	if closingSquareBracketIdx == len(netloc)-1 {
		hostEndIdx = -1
	} else if closingSquareBracketIdx != -1 {
		hostEndIdx = closingSquareBracketIdx + 1
	}

	// Check for IPv6 address
	if closingSquareBracketIdx > openingSquareBracketIdx {
		if !isIPv6(netloc[1:closingSquareBracketIdx]) {
			// Have square brackets but invalid IPv6 address => Domain is invalid
			return urlParts, errors.New("invalid IPv6 address")
		}
		if hostEndIdx != -1 {
			afterHost := netloc[hostEndIdx:]
			if indexAnyASCII(afterHost, endOfHostDelimitersSet) != 0 {
				// Reject IPv6 if there are invalid trailing characters after IPv6 address
				return urlParts, errors.New("invalid trailing characters after IPv6 address")
			}
		}
		// Closing square bracket in correct place and IPv6 is valid
		urlParts.HostType = IPv6
		urlParts.Domain = netloc[1:closingSquareBracketIdx]
		urlParts.RegisteredDomain = netloc[1:closingSquareBracketIdx]
	}

	var afterHost string
	// Separate URL host from subcomponents thereafter
	if hostEndIdx != -1 {
		afterHost = netloc[hostEndIdx:]
		netloc = netloc[0:hostEndIdx]
	}

	// Extract Port and "Path" if any
	if len(afterHost) != 0 {
		pathStartIndex := indexAnyASCII(afterHost, endOfHostWithPortDelimitersSet)
		if afterHost[0] == ':' {
			var maybePort string
			if pathStartIndex == -1 {
				maybePort = afterHost[1:]
			} else {
				maybePort = afterHost[1:pathStartIndex]
			}
			// NOTE(review): strconv.Atoi also accepts a leading sign, so a port
			// such as "+80" passes this check — confirm whether that is intended.
			if port, err := strconv.Atoi(maybePort); err == nil && 0 <= port && port <= largestPortNumber {
				urlParts.Port = maybePort
			} else {
				return urlParts, errors.New("invalid port")
			}
		}
		if pathStartIndex != -1 && pathStartIndex != len(afterHost) {
			// If there is any path/query/fragment after the URL authority component...
			// See https://stackoverflow.com/questions/47543432/what-do-we-call-the-combined-path-query-and-fragment-in-a-uri
			// For simplicity, we shall call this the "Path".
			urlParts.Path = afterHost[pathStartIndex:]
		}
	}

	// IPv6 hosts have no suffix to look up; everything needed is already set.
	if urlParts.HostType == IPv6 {
		return urlParts, nil
	}

	// decode all percentage encoded characters, if any
	unescapedNetloc, err := url.QueryUnescape(netloc)
	if err != nil {
		return urlParts, err
	}

	if e.ConvertURLToPunyCode {
		netloc = formatAsPunycode(unescapedNetloc)
	} else if _, err := idna.ToUnicode(unescapedNetloc); err != nil {
		// host is invalid if host cannot be converted to Unicode
		//
		// skip if host already converted to punycode
		log.Println(strings.SplitAfterN(err.Error(), "idna: invalid label", 2)[0])
		return urlParts, err
	}

	// Check for eTLD Suffix
	node := f.tldTrie

	var (
		hasSuffix      bool
		hasLabels      bool
		end            bool
		previousSepIdx int
	)
	sepIdx, suffixStartIdx, suffixEndIdx := len(netloc), len(netloc), len(netloc)

	// Walk the labels of netloc from right to left, descending one trie
	// level per label, until the labels run out or no further match exists.
	for !end {
		var label string
		previousSepIdx = sepIdx
		sepIdx = lastIndexAny(netloc[0:sepIdx], labelSeparatorsRuneSet)
		if sepIdx != -1 {
			label = netloc[sepIdx+sepSize(netloc[sepIdx]) : previousSepIdx]
			if len(label) == 0 {
				// allow consecutive label separators if suffix not found yet
				if !hasLabels {
					suffixEndIdx = sepIdx
					continue
				}
				// any occurrences of consecutive label separators on left-hand side of a label are illegal.
				return urlParts, errors.New("invalid consecutive label separators on left-hand side of a label")
			}
			hasLabels = true
		} else {
			// no separator left: the remaining prefix is the final label
			label = netloc[0:previousSepIdx]
			end = true
		}

		if _, ok := node.matches.Get("*"); ok {
			// check if label falls under any wildcard exception rule
			// e.g. !www.ck
			if _, ok := node.matches.Get("!" + label); ok {
				sepIdx = previousSepIdx
			}
			break
		}

		// check if label is part of an eTLD
		label, _ = url.QueryUnescape(label)
		if val, ok := node.matches.Get(label); ok {
			suffixStartIdx = sepIdx
			if !hasSuffix && val.end {
				// index of end of suffix without trailing label separators
				suffixEndIdx = previousSepIdx
				hasSuffix = true
			}
			node = val
			if val.matches.Len() == 0 {
				// label is at a leaf node (no children) ; break out of loop
				break
			}
		} else {
			// label is not in the trie: step back to the previous separator,
			// unless this was the rightmost label
			if previousSepIdx != len(netloc) {
				sepIdx = previousSepIdx
			}
			break
		}
	}

	// Check for IPv4 address
	// Minimum possible length: len("0.0.0.0") -> 7
	// Ensure first rune is numeric before expensive isIPv4()
	if len(netloc) >= 7 && numericSet.contains(netloc[0]) && isIPv4(netloc) {
		urlParts.HostType = IPv4
		urlParts.Domain = netloc[0:previousSepIdx]
		urlParts.RegisteredDomain = urlParts.Domain
		return urlParts, nil
	}

	if sepIdx == -1 {
		sepIdx, suffixStartIdx = len(netloc), len(netloc)
	}

	// Reject if invalidHostNameChars or consecutive label separators
	// appears before Suffix
	if hasSuffix {
		if hasInvalidChars(netloc[0:suffixStartIdx]) {
			return urlParts, errors.New("invalid characters in hostname")
		}
	} else {
		if hasInvalidChars(netloc[0:previousSepIdx]) {
			return urlParts, errors.New("invalid characters in hostname")
		}
	}

	var domainStartSepIdx int
	if hasSuffix {
		if sepIdx < len(netloc) { // If there is a Domain
			urlParts.Suffix = netloc[sepIdx+sepSize(netloc[sepIdx]) : suffixEndIdx]
			domainStartSepIdx = lastIndexAny(netloc[0:sepIdx], labelSeparatorsRuneSet)
			if domainStartSepIdx != -1 { // If there is a SubDomain
				domainStartIdx := domainStartSepIdx + sepSize(netloc[domainStartSepIdx])
				urlParts.Domain = netloc[domainStartIdx:sepIdx]
				urlParts.RegisteredDomain = netloc[domainStartIdx:suffixEndIdx]
			} else {
				urlParts.Domain = netloc[0:sepIdx]
				urlParts.RegisteredDomain = netloc[0:suffixEndIdx]
			}
		} else {
			// Only Suffix exists
			urlParts.Suffix = netloc[0:suffixEndIdx]
		}
	} else {
		// No known suffix: the rightmost label is treated as the Domain
		domainStartSepIdx = lastIndexAny(netloc[0:suffixEndIdx], labelSeparatorsRuneSet)
		var domainStartIdx int
		if domainStartSepIdx != -1 { // If there is a SubDomain
			domainStartIdx = domainStartSepIdx + sepSize(netloc[domainStartSepIdx])
		}
		urlParts.Domain = netloc[domainStartIdx:suffixEndIdx]
	}
	if !e.IgnoreSubDomains && domainStartSepIdx != -1 { // If SubDomain is to be included
		urlParts.SubDomain = netloc[0:domainStartSepIdx]
	}

	if len(urlParts.Domain) == 0 {
		return urlParts, errors.New("empty domain")
	}
	urlParts.HostType = HostName
	return urlParts, nil
}
401 | 
402 | // New creates a new *FastTLD using data from a Public Suffix List file.
403 | func New(n SuffixListParams) (*FastTLD, error) {
404 | 	extractor := &FastTLD{cacheFilePath: n.CacheFilePath, tldTrie: &trie{}, includePrivateSuffix: n.IncludePrivateSuffix}
405 | 	// If cacheFilePath is unreachable, use temporary folder
406 | 	if isValid, _ := checkCacheFile(extractor.cacheFilePath); !isValid {
407 | 		filesystem := new(afero.OsFs)
408 | 		defaultCacheFolderPath := afero.GetTempDir(filesystem, "")
409 | 		defaultCacheFilePath := defaultCacheFolderPath + defaultPSLFileName
410 | 		defaultCacheFolder, err := filesystem.Open(defaultCacheFolderPath)
411 | 		if err != nil {
412 | 			// temporary folder not accessible, fallback to hardcoded Public Suffix list
413 | 			return newHardcodedPSL(err, n)
414 | 		}
415 | 		defer defaultCacheFolder.Close()
416 | 		extractor.cacheFilePath = defaultCacheFilePath
417 | 		isValid, lastModifiedHours := checkCacheFile(extractor.cacheFilePath)
418 | 		if !isValid || lastModifiedHours > pslMaxAgeHours {
419 | 			// update Public Suffix list cache if it is outdated
420 | 			if updateErr := extractor.Update(); updateErr != nil {
421 | 				// update failed, fallback to hardcoded Public Suffix list
422 | 				return newHardcodedPSL(err, n)
423 | 			}
424 | 			return extractor, err
425 | 		}
426 | 	}
427 | 
428 | 	tldTrie, err := trieConstruct(n.IncludePrivateSuffix, extractor.cacheFilePath)
429 | 	if err != nil {
430 | 		return newHardcodedPSL(err, n)
431 | 	}
432 | 	extractor.tldTrie = tldTrie
433 | 	return extractor, err
434 | }
435 | 


--------------------------------------------------------------------------------
/fasttld_test.go:
--------------------------------------------------------------------------------
  1 | package fasttld
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"reflect"
  8 | 	"strings"
  9 | 	"testing"
 10 | 
 11 | 	"github.com/tidwall/hashmap"
 12 | )
 13 | 
// errs lists error values whose messages describe URL parsing failures.
// NOTE(review): presumably indexed by the test cases defined further down in
// this file to state the expected error — confirm before reordering entries.
var errs = [...]error{
	errors.New("opening square bracket is not first character of hostname"),
	errors.New("closing square bracket present but no opening square bracket"),
	errors.New("invalid square bracket pair"),
	errors.New("incomplete square bracket pair"),
	errors.New("invalid IPv6 address"),
	errors.New("invalid trailing characters after IPv6 address"),
	errors.New("invalid consecutive label separators on left-hand side of a label"),
	errors.New("invalid characters in hostname before suffix"),
	errors.New("invalid characters in hostname"),
	errors.New("empty domain"),
	errors.New("invalid port"),
}
 27 | 
 28 | func getTestPSLFilePath() (string, bool) {
 29 | 	var sb strings.Builder
 30 | 	currentFilePath, ok := getCurrentFilePath()
 31 | 	if !ok {
 32 | 		return "", ok
 33 | 	}
 34 | 	sb.WriteString(currentFilePath)
 35 | 	sb.WriteString(string(os.PathSeparator))
 36 | 	sb.WriteString("test")
 37 | 	sb.WriteString(string(os.PathSeparator))
 38 | 	sb.WriteString(defaultPSLFileName)
 39 | 	return sb.String(), ok
 40 | }
 41 | 
 42 | func TestNestedDict(t *testing.T) {
 43 | 	keysSequences := [][][]string{
 44 | 		{{"a"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"c"}, {"c", "b"}, {"d", "f"}},
 45 | 		{{"a"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"c"}, {"d", "f"}, {"c", "b"}},
 46 | 		{{"a"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"c", "b"}, {"c"}, {"d", "f"}},
 47 | 		{{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a"}, {"a", "b", "c"}, {"a", "b"}},
 48 | 		{{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a", "b"}, {"a"}, {"a", "b", "c"}},
 49 | 		{{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a", "b"}, {"a", "b", "c"}, {"a"}},
 50 | 		{{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a", "b", "c"}, {"a"}, {"a", "b"}},
 51 | 		{{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a", "b", "c"}, {"a", "b"}, {"a"}},
 52 | 		{{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"a"}, {"c"}, {"a", "b", "c"}},
 53 | 		{{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"a"}, {"c"}},
 54 | 		{{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"c"}, {"a"}},
 55 | 		{{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"c"}, {"a"}, {"a", "b", "c"}},
 56 | 		{{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"c"}, {"a", "b", "c"}, {"a"}},
 57 | 		{{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b", "c"}, {"a"}, {"a", "b"}, {"c"}},
 58 | 		{{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b", "c"}, {"a"}, {"c"}, {"a", "b"}},
 59 | 	}
 60 | 	for _, keysSequence := range keysSequences {
 61 | 		var m hashmap.Map[string, *trie]
 62 | 		originalDict := &trie{matches: m}
 63 | 		for _, keys := range keysSequence {
 64 | 			nestedDict(originalDict, keys)
 65 | 		}
 66 | 		// check each nested value
 67 | 		//Top level c
 68 | 		c, _ := originalDict.matches.Get("c")
 69 | 		if c.matches.Len() != 1 {
 70 | 			t.Errorf("Top level c must have matches map of length 1")
 71 | 		}
 72 | 		if _, ok := c.matches.Get("b"); !ok {
 73 | 			t.Errorf("Top level c must have b in matches map")
 74 | 		}
 75 | 		if !c.end {
 76 | 			t.Errorf("Top level c must have end = true")
 77 | 		}
 78 | 		// Top level a
 79 | 		a, _ := originalDict.matches.Get("a")
 80 | 		if a.matches.Len() != 2 {
 81 | 			t.Errorf("Top level a must have matches map of length 2")
 82 | 		}
 83 | 		// a -> d
 84 | 		aToD, ok := a.matches.Get("d")
 85 | 		if !ok {
 86 | 			t.Errorf("Top level a must have d in matches map")
 87 | 		}
 88 | 		if aToD.matches.Len() != 0 {
 89 | 			t.Errorf("a -> d must have empty matches map")
 90 | 		}
 91 | 		// a -> b
 92 | 		aToB, ok := a.matches.Get("b")
 93 | 		if !ok {
 94 | 			t.Errorf("Top level a must have b in matches map")
 95 | 		}
 96 | 		if !aToB.end {
 97 | 			t.Errorf("a -> b must have end = true")
 98 | 		}
 99 | 		if aToB.matches.Len() != 1 {
100 | 			t.Errorf("a -> b must have matches map of length 1")
101 | 		}
102 | 		// a -> b -> c
103 | 		aToBToC, ok := aToB.matches.Get("c")
104 | 		if !ok {
105 | 			t.Errorf("a -> b must have c in matches map")
106 | 		}
107 | 		if aToBToC.matches.Len() != 0 {
108 | 			t.Errorf("a -> b -> c must have empty matches map")
109 | 		}
110 | 		if !a.end {
111 | 			t.Errorf("Top level a must have end = true")
112 | 		}
113 | 		// d -> f
114 | 		d, _ := originalDict.matches.Get("d")
115 | 		if d.end {
116 | 			t.Errorf("Top level d must have end = false")
117 | 		}
118 | 		dToF, _ := d.matches.Get("f")
119 | 		if !dToF.end {
120 | 			t.Errorf("d -> f must have end = true")
121 | 		}
122 | 		if dToF.matches.Len() != 0 {
123 | 			t.Errorf("d -> f must have empty matches map")
124 | 		}
125 | 	}
126 | }
127 | 
128 | func TestTrieConstruct(t *testing.T) {
129 | 	if _, err := trieConstruct(false, fmt.Sprintf("test%sthis_file_does_not_exist.dat", string(os.PathSeparator))); err == nil {
130 | 		t.Errorf("error returned by trieConstruct should not be nil")
131 | 	}
132 | 	if _, err := trieConstruct(false, ""); err != nil {
133 | 		t.Errorf("error returned by trieConstruct should be nil")
134 | 	}
135 | }
136 | 
137 | // TestTrie builds a trie from the mini public suffix list fixture and checks
138 | // its shape: the number of top-level keys, the end flags on "ac" and "ck",
139 | // the "*" and "!www" children of "ck", and the second-level children of "ac".
140 | func TestTrie(t *testing.T) {
141 | 	trie, err := trieConstruct(false, fmt.Sprintf("test%smini_public_suffix_list.dat", string(os.PathSeparator)))
142 | 	if err != nil {
143 | 		// Nothing below is meaningful without a trie; stop instead of dereferencing it.
144 | 		t.Fatalf("trieConstruct failed | %q", err)
145 | 	}
146 | 	if lenTrieMatches := trie.matches.Len(); lenTrieMatches != 3 {
147 | 		t.Errorf("Expected top level Trie matches map length of 3. Got %d.", lenTrieMatches)
148 | 	}
149 | 	// Both nodes are dereferenced repeatedly below, so a missing key is fatal
150 | 	// rather than a soft failure followed by a use of the missing node.
151 | 	ac, ok := trie.matches.Get("ac")
152 | 	if !ok {
153 | 		t.Fatalf("Top level %q must exist", "ac")
154 | 	}
155 | 	ck, ok := trie.matches.Get("ck")
156 | 	if !ok {
157 | 		t.Fatalf("Top level %q must exist", "ck")
158 | 	}
159 | 	if !ac.end {
160 | 		t.Errorf("Top level ac must have end = true")
161 | 	}
162 | 	if !ck.end {
163 | 		t.Errorf("Top level ck must have end = true")
164 | 	}
165 | 	if ck.matches.Len() != 2 {
166 | 		t.Errorf("Top level ck must have matches map of length 2")
167 | 	}
168 | 	// Children of ck that must be present and must themselves have no children.
169 | 	for _, label := range []string{"*", "!www"} {
170 | 		child, ok := ck.matches.Get(label)
171 | 		if !ok {
172 | 			t.Errorf("Top level ck must have %s in matches map", label)
173 | 			continue
174 | 		}
175 | 		if child.matches.Len() != 0 {
176 | 			t.Errorf("ck -> %s must have empty matches map", label)
177 | 		}
178 | 	}
179 | 	for _, tld := range []string{"com", "edu", "gov", "net", "mil", "org"} {
180 | 		acToTld, ok := ac.matches.Get(tld)
181 | 		if !ok {
182 | 			t.Errorf("Top level ac must have %q in matches map", tld)
183 | 			continue
184 | 		}
185 | 		if acToTld.matches.Len() != 0 {
186 | 			t.Errorf("ac -> %q must have empty matches map", tld)
187 | 		}
188 | 	}
189 | }
186 | 
187 | type newTest struct { // newTest is one TestNew case: the inputs given to New and the expected trie size
188 | 	cacheFilePath        string // passed to New as SuffixListParams.CacheFilePath; "" makes TestNew use getTestPSLFilePath() instead
189 | 	includePrivateSuffix bool   // passed to New as SuffixListParams.IncludePrivateSuffix
190 | 	expected             int    // expected extractor.tldTrie.matches.Len(), i.e. number of top-level trie keys
191 | }
192 | 
193 | var newTests = []newTest{ // cases for TestNew; every entry sets cacheFilePath, so TestNew's empty-path fallback is not exercised by this table
194 | 	{cacheFilePath: fmt.Sprintf("test%spublic_suffix_list.dat", string(os.PathSeparator)), includePrivateSuffix: false, expected: 1656},
195 | 	{cacheFilePath: fmt.Sprintf("test%spublic_suffix_list.dat", string(os.PathSeparator)), includePrivateSuffix: true, expected: 1656},
196 | 	{cacheFilePath: fmt.Sprintf("test%smini_public_suffix_list.dat", string(os.PathSeparator)), includePrivateSuffix: true, expected: 4},
197 | }
198 | 
199 | // TestNew builds an extractor with New for every entry in newTests and checks
200 | // the number of top-level keys in the resulting trie.
201 | func TestNew(t *testing.T) {
202 | 	for _, test := range newTests {
203 | 		cacheFilePath := test.cacheFilePath
204 | 		if cacheFilePath == "" {
205 | 			// No explicit fixture given: fall back to the path reported by getTestPSLFilePath.
206 | 			testPSLFilePath, ok := getTestPSLFilePath()
207 | 			if !ok {
208 | 				// Without a path this case cannot run as intended; skip it instead of
209 | 				// silently calling New with an empty path.
210 | 				t.Errorf("Cannot get path to current module file")
211 | 				continue
212 | 			}
213 | 			cacheFilePath = testPSLFilePath
214 | 		}
215 | 		extractor, err := New(SuffixListParams{
216 | 			CacheFilePath:        cacheFilePath,
217 | 			IncludePrivateSuffix: test.includePrivateSuffix,
218 | 		})
219 | 		if err != nil {
220 | 			// Report the construction failure and move on rather than inspecting
221 | 			// an extractor that New did not vouch for.
222 | 			t.Errorf("New failed for %q | %q", cacheFilePath, err)
223 | 			continue
224 | 		}
225 | 		if numTopLevelKeys := extractor.tldTrie.matches.Len(); numTopLevelKeys != test.expected {
226 | 			t.Errorf("Expected number of top level keys to be %d. Got %d.", test.expected, numTopLevelKeys)
227 | 		}
228 | 	}
229 | }
218 | 
219 | type extractTest struct { // extractTest is the row type of the extraction test tables declared below (schemeTests, noSchemeTests, ...)
220 | 	includePrivateSuffix bool          // NOTE(review): presumably selects an extractor built with IncludePrivateSuffix; the consumer is not visible here
221 | 	urlParams            URLParams     // input parameters for the case (URL plus optional flags such as IgnoreSubDomains)
222 | 	expected             ExtractResult // expected result; error cases in the tables still populate some fields (e.g. Scheme)
223 | 	err                  error         // expected error; left nil in cases that are expected to succeed
224 | 	description          string        // human-readable label for the case
225 | }
226 | 
227 | var schemeTests = []extractTest{ // URLs with an explicit scheme; the expected Scheme value includes the trailing "://"
228 | 	{urlParams: URLParams{URL: "h://example.com"},
229 | 		expected: ExtractResult{
230 | 			Scheme: "h://", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Single character Scheme"},
231 | 	{urlParams: URLParams{URL: "hTtPs://example.com"},
232 | 		expected: ExtractResult{
233 | 			Scheme: "hTtPs://", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Capitalised Scheme"},
234 | 	{urlParams: URLParams{URL: "git-ssh://example.com"},
235 | 		expected: ExtractResult{
236 | 			Scheme: "git-ssh://", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Scheme with -"},
237 | 	{urlParams: URLParams{URL: "https://username:password@foo.example.com:999/some/path?param1=value1&param2=葡萄"},
238 | 		expected: ExtractResult{
239 | 			Scheme: "https://", UserInfo: "username:password", SubDomain: "foo",
240 | 			Domain: "example", Suffix: "com", RegisteredDomain: "example.com",
241 | 			Port: "999", Path: "/some/path?param1=value1&param2=葡萄", HostType: HostName}, description: "Full https URL with SubDomain"},
242 | 	{urlParams: URLParams{URL: "http://www.example.com"},
243 | 		expected: ExtractResult{
244 | 			Scheme: "http://", SubDomain: "www",
245 | 			Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName},
246 | 		description: "Full http URL with SubDomain no path"},
247 | 	{urlParams: URLParams{
248 | 		URL: "http://example.co.uk/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F"},
249 | 		expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "co.uk",
250 | 			RegisteredDomain: "example.co.uk",
251 | 			Path:             "/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F",
252 | 			HostType:         HostName},
253 | 		description: "Full http URL with no SubDomain"},
254 | 	{urlParams: URLParams{
255 | 		URL: "http://big.long.sub.domain.example.co.uk/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F"},
256 | 		expected: ExtractResult{Scheme: "http://", SubDomain: "big.long.sub.domain",
257 | 			Domain: "example", Suffix: "co.uk", RegisteredDomain: "example.co.uk",
258 | 			Path:     "/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F",
259 | 			HostType: HostName},
260 | 		description: "Full http URL with SubDomain"},
261 | 	{urlParams: URLParams{
262 | 		URL: "ftp://username名字:password@mail.example.co.uk:666/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F"},
263 | 		expected: ExtractResult{Scheme: "ftp://", UserInfo: "username名字:password", SubDomain: "mail",
264 | 			Domain: "example", Suffix: "co.uk", RegisteredDomain: "example.co.uk", Port: "666",
265 | 			Path:     "/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F",
266 | 			HostType: HostName},
267 | 		description: "Full ftp URL with SubDomain"},
268 | 	{urlParams: URLParams{URL: "git+ssh://www.example.com/"},
269 | 		expected: ExtractResult{Scheme: "git+ssh://", SubDomain: "www",
270 | 			Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/", HostType: HostName}, description: "Full git+ssh URL with SubDomain"},
271 | 	{urlParams: URLParams{URL: "ssh://server.example.com/"},
272 | 		expected: ExtractResult{Scheme: "ssh://", SubDomain: "server",
273 | 			Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/", HostType: HostName}, description: "Full ssh URL with SubDomain"},
274 | 	{urlParams: URLParams{URL: "http://www.www.net"},
275 | 		expected: ExtractResult{Scheme: "http://", SubDomain: "www",
276 | 			Domain: "www", Suffix: "net", RegisteredDomain: "www.net", HostType: HostName}, description: "Multiple www"},
277 | }
278 | var noSchemeTests = []extractTest{ // inputs without a scheme prefix; suffix-only inputs expect errs[9], doubled trailing dots expect errs[8]
279 | 	{urlParams: URLParams{URL: "localhost"}, expected: ExtractResult{Domain: "localhost", HostType: HostName}, description: "localhost"},
280 | 	{urlParams: URLParams{URL: "16777215"}, expected: ExtractResult{Domain: "16777215", HostType: HostName}, description: "Number >= 0xFFFFFF"},
281 | 	{urlParams: URLParams{URL: "org"}, expected: ExtractResult{Suffix: "org"}, err: errs[9], description: "Single eTLD | Suffix Only"},
282 | 	{urlParams: URLParams{URL: "org."}, expected: ExtractResult{Suffix: "org"}, err: errs[9], description: "Single eTLD | Suffix Only with single trailing dot"}, //  RFC 1034 - allow single trailing dot
283 | 	{urlParams: URLParams{URL: "org.."}, expected: ExtractResult{}, err: errs[8], description: "Single eTLD | Suffix Only with 2 trailing dots"},
284 | 	{urlParams: URLParams{URL: "co.th"}, expected: ExtractResult{Suffix: "co.th"}, err: errs[9], description: "Double eTLD | Suffix Only"},
285 | 	{urlParams: URLParams{URL: "co.th."}, expected: ExtractResult{Suffix: "co.th"}, err: errs[9], description: "Double eTLD | Suffix Only with single trailing dot"}, //  RFC 1034 - allow single trailing dot
286 | 	{urlParams: URLParams{URL: "co.th.."}, expected: ExtractResult{}, err: errs[8], description: "Double eTLD | Suffix Only with 2 trailing dots"},
287 | 	{urlParams: URLParams{URL: "users@example.com"}, expected: ExtractResult{UserInfo: "users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "UserInfo + Domain | No Scheme"},
288 | 	{urlParams: URLParams{URL: "mailto:users@example.com"}, expected: ExtractResult{UserInfo: "mailto:users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Mailto | No Scheme"},
289 | 	{urlParams: URLParams{URL: "example.com:999"}, expected: ExtractResult{Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Port: "999", HostType: HostName}, description: "Domain + Port | No Scheme"},
290 | 	{urlParams: URLParams{URL: "example.com"}, expected: ExtractResult{Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Domain | No Scheme"},
291 | 	{urlParams: URLParams{URL: "255.255.example.com"}, expected: ExtractResult{SubDomain: "255.255", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Numeric SubDomain + Domain | No Scheme"},
292 | 	{urlParams: URLParams{URL: "server.example.com/path"}, expected: ExtractResult{SubDomain: "server", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/path", HostType: HostName}, description: "SubDomain, Domain and Path | No Scheme"},
293 | }
294 | var userInfoTests = []extractTest{ // cases focused on the UserInfo component, including ':' and '@' placement
295 | 	{urlParams: URLParams{URL: "https://username@example.com"}, expected: ExtractResult{Scheme: "https://",
296 | 		UserInfo: "username", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "username"},
297 | 	{urlParams: URLParams{URL: "https://username:password@example.com"}, expected: ExtractResult{Scheme: "https://", // URL now really carries both parts, matching the description
298 | 		UserInfo: "username:password", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "username + password"},
299 | 	{urlParams: URLParams{URL: "https://:password@example.com"}, expected: ExtractResult{Scheme: "https://",
300 | 		UserInfo: ":password", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "colon but empty username"},
301 | 	{urlParams: URLParams{URL: "https://username:@example.com"}, expected: ExtractResult{Scheme: "https://",
302 | 		UserInfo: "username:", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "colon but empty password"},
303 | 	{urlParams: URLParams{URL: "https://usern@me:password@example.com"}, expected: ExtractResult{Scheme: "https://",
304 | 		UserInfo: "usern@me:password", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "@ in username"},
305 | 	{urlParams: URLParams{URL: "https://usern@me:p@ssword@example.com"}, expected: ExtractResult{Scheme: "https://",
306 | 		UserInfo: "usern@me:p@ssword", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "@ in password"},
307 | 	{urlParams: URLParams{URL: "https://usern@me:@example.com"}, expected: ExtractResult{Scheme: "https://",
308 | 		UserInfo: "usern@me:", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "colon but empty password; @ in username"},
309 | 	{urlParams: URLParams{URL: "https://:p@ssword@example.com"}, expected: ExtractResult{Scheme: "https://",
310 | 		UserInfo: ":p@ssword", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "colon but empty username; @ in password"},
311 | 	{urlParams: URLParams{URL: "https://usern@m%40e:password@example.com/p@th?q=@go"}, expected: ExtractResult{Scheme: "https://",
312 | 		UserInfo: "usern@m%40e:password", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/p@th?q=@go", HostType: HostName}, description: "@ in UserInfo and Path"},
313 | }
314 | var ipv4Tests = []extractTest{ // IPv4 hosts: the address is expected verbatim in both Domain and RegisteredDomain, with HostType IPv4
315 | 	{urlParams: URLParams{URL: "127.0.0.1"},
316 | 		expected: ExtractResult{Domain: "127.0.0.1",
317 | 			RegisteredDomain: "127.0.0.1", HostType: IPv4}, description: "Basic IPv4 Address"},
318 | 	{urlParams: URLParams{URL: "http://127.0.0.1:5000"},
319 | 		expected: ExtractResult{
320 | 			Scheme: "http://", Domain: "127.0.0.1", RegisteredDomain: "127.0.0.1", Port: "5000", HostType: IPv4},
321 | 		description: "Basic IPv4 Address with Scheme and Port"},
322 | 	{urlParams: URLParams{URL: "127\uff0e0\u30020\uff611"}, // \uff0e, \u3002 and \uff61 are the non-ASCII dots the descriptions call internationalised label separators
323 | 		expected: ExtractResult{Domain: "127\uff0e0\u30020\uff611",
324 | 			RegisteredDomain: "127\uff0e0\u30020\uff611", HostType: IPv4}, description: "Basic IPv4 Address | Internationalised label separators"},
325 | 	{urlParams: URLParams{URL: "http://127\uff0e0\u30020\uff611:5000"},
326 | 		expected: ExtractResult{Scheme: "http://", Domain: "127\uff0e0\u30020\uff611", Port: "5000",
327 | 			RegisteredDomain: "127\uff0e0\u30020\uff611", HostType: IPv4}, description: "Basic IPv4 Address with Scheme and Port | Internationalised label separators"},
328 | }
329 | var ipv6Tests = []extractTest{ // bracketed IPv6 hosts: expected Domain/RegisteredDomain hold the address without the square brackets
330 | 	{urlParams: URLParams{URL: "[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]"},
331 | 		expected: ExtractResult{Domain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789",
332 | 			RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", HostType: IPv6}, description: "Basic IPv6 Address"},
333 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]:5000"},
334 | 		expected: ExtractResult{
335 | 			Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", Port: "5000",
336 | 			HostType: IPv6},
337 | 		description: "Basic IPv6 Address with Scheme and Port"},
338 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]:5000"},
339 | 		expected: ExtractResult{
340 | 			Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1", RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1", Port: "5000",
341 | 			HostType: IPv6},
342 | 		description: "Basic IPv6 Address + trailing IPv4 address with Scheme and Port"},
343 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611]:5000"},
344 | 		expected: ExtractResult{
345 | 			Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611",
346 | 			RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611", Port: "5000",
347 | 			HostType: IPv6},
348 | 		description: "Basic IPv6 Address + trailing IPv4 address with Scheme and Port | Internationalised label separators"},
349 | 	{urlParams: URLParams{URL: "http://[::2345:6789:aBcD:ef01:2345:678]:5000"},
350 | 		expected: ExtractResult{Scheme: "http://", Domain: "::2345:6789:aBcD:ef01:2345:678",
351 | 			RegisteredDomain: "::2345:6789:aBcD:ef01:2345:678", Port: "5000", HostType: IPv6},
352 | 		description: "Basic IPv6 Address with Scheme and Port | have leading ellipsis"},
353 | 	{urlParams: URLParams{URL: "http://[::]:5000"},
354 | 		expected: ExtractResult{Scheme: "http://", Domain: "::",
355 | 			RegisteredDomain: "::", Port: "5000", HostType: IPv6},
356 | 		description: "Basic IPv6 Address with Scheme and Port | only ellipsis"},
357 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01::]:5000"}, // NOTE(review): the description says "bad IP" but this case expects a successful IPv6 parse; confirm the wording
358 | 		expected: ExtractResult{Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01::",
359 | 			RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01::", Port: "5000", HostType: IPv6},
360 | 		description: "Basic IPv6 Address with Scheme and Port bad IP with even number of trailing empty hextets"},
361 | }
362 | var ignoreSubDomainsTests = []extractTest{ // cases with URLParams.IgnoreSubDomains set; every expected result leaves SubDomain empty
363 | 	{urlParams: URLParams{URL: "maps.google.com.sg",
364 | 		IgnoreSubDomains: true},
365 | 		expected: ExtractResult{
366 | 			Domain: "google", Suffix: "com.sg",
367 | 			RegisteredDomain: "google.com.sg", HostType: HostName,
368 | 		}, description: "Ignore SubDomain",
369 | 	},
370 | 	{urlParams: URLParams{URL: "example.za/en",
371 | 		IgnoreSubDomains: true},
372 | 		expected: ExtractResult{
373 | 			Domain: "za", Path: "/en", // no Suffix is expected here, so the last label "za" is the expected Domain
374 | 			HostType: HostName},
375 | 		description: "za has no 1st-level TLD | IgnoreSubDomains",
376 | 	},
377 | 	{urlParams: URLParams{URL: "https://example.za/en",
378 | 		IgnoreSubDomains: true},
379 | 		expected: ExtractResult{
380 | 			Scheme: "https://",
381 | 			Domain: "za", Path: "/en",
382 | 			HostType: HostName},
383 | 		description: "za has no 1st-level TLD | Scheme + IgnoreSubDomains",
384 | 	},
385 | }
386 | var privateSuffixTests = []extractTest{ // cases with includePrivateSuffix set, expecting multi-label suffixes such as "blogspot.com"
387 | 	{includePrivateSuffix: true,
388 | 		urlParams: URLParams{URL: "https://brb.i.am.going.to.be.blogspot.com:5000/a/b/c/d.txt?id=42"},
389 | 		expected: ExtractResult{
390 | 			Scheme: "https://", SubDomain: "brb.i.am.going.to", Domain: "be", Suffix: "blogspot.com",
391 | 			RegisteredDomain: "be.blogspot.com", Port: "5000", Path: "/a/b/c/d.txt?id=42", HostType: HostName,
392 | 		}, description: "Include Private Suffix"},
393 | 	{includePrivateSuffix: true,
394 | 		urlParams: URLParams{URL: "global.prod.fastly.net"},
395 | 		expected: ExtractResult{
396 | 			Suffix: "global.prod.fastly.net", // the whole host is expected as Suffix, hence the errs[9] below
397 | 		}, err: errs[9], description: "Include Private Suffix | Suffix only"},
398 | }
399 | var periodsAndWhiteSpacesTests = []extractTest{ // label-separator and whitespace edge cases; all but the suffix-only case expect no error
400 | 	{urlParams: URLParams{URL: "http://127.0.0.1.."},
401 | 		expected: ExtractResult{Scheme: "http://", Domain: "127.0.0.1", RegisteredDomain: "127.0.0.1", HostType: IPv4}, description: "Consecutive label separators after IPv4 address",
402 | 	},
403 | 	{urlParams: URLParams{URL: "http://127\uff0e0\u30020\uff611..:5000"},
404 | 		expected: ExtractResult{Scheme: "http://", Domain: "127\uff0e0\u30020\uff611",
405 | 			Port: "5000", RegisteredDomain: "127\uff0e0\u30020\uff611", HostType: IPv4}, description: "Consecutive label separators between IPv4 address and Port",
406 | 	},
407 | 	{urlParams: URLParams{URL: "http://127.0.0.1  "},
408 | 		expected: ExtractResult{Scheme: "http://", Domain: "127.0.0.1", RegisteredDomain: "127.0.0.1", HostType: IPv4}, description: "Spaces after IPv4 address",
409 | 	},
410 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]  "},
411 | 		expected: ExtractResult{Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789",
412 | 			RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", HostType: IPv6}, description: "Spaces after IPv6 address",
413 | 	},
414 | 	{urlParams: URLParams{URL: "localhost.\u3002"}, expected: ExtractResult{Domain: "localhost", HostType: HostName}, description: "localhost with trailing periods"},
415 | 	{urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk\uff0e\u002e\u3002"},
416 | 		expected: ExtractResult{Scheme: "https://", SubDomain: "brb\u002ei\u3002am\uff0egoing\uff61to", Domain: "be",
417 | 			Suffix: "a\uff61fk", RegisteredDomain: "be\u3002a\uff61fk", HostType: HostName},
418 | 		description: "Consecutive label separators after Suffix",
419 | 	},
420 | 	{urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk"},
421 | 		expected: ExtractResult{
422 | 			Scheme: "https://", SubDomain: "brb\u002ei\u3002am\uff0egoing\uff61to", Domain: "be", Suffix: "a\uff61fk",
423 | 			RegisteredDomain: "be\u3002a\uff61fk", HostType: HostName, // the original separators are expected to be preserved in the output
424 | 		}, description: "Internationalised label separators",
425 | 	},
426 | 	{urlParams: URLParams{URL: "a\uff61fk"},
427 | 		expected: ExtractResult{Suffix: "a\uff61fk"}, err: errs[9], description: "Internationalised label separators | Suffix only",
428 | 	},
429 | 	{urlParams: URLParams{URL: " https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk/a/b/c. \uff61 "},
430 | 		expected: ExtractResult{
431 | 			Scheme: "https://", SubDomain: "brb\u002ei\u3002am\uff0egoing\uff61to", Domain: "be", Suffix: "a\uff61fk",
432 | 			RegisteredDomain: "be\u3002a\uff61fk", Path: "/a/b/c. \uff61", HostType: HostName,
433 | 		}, description: "Surrounded by extra whitespace"},
434 | 
435 | 	{urlParams: URLParams{URL: " https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk/a/B/c. \uff61 ",
436 | 		ConvertURLToPunyCode: true}, // with this flag the expected host parts use ASCII dots, while Path keeps its original characters
437 | 		expected: ExtractResult{
438 | 			Scheme: "https://", SubDomain: "brb.i.am.going.to", Domain: "be", Suffix: "a.fk",
439 | 			RegisteredDomain: "be.a.fk", Path: "/a/B/c. \uff61", HostType: HostName,
440 | 		}, description: "Surrounded by extra whitespace | PunyCode"},
441 | 	{urlParams: URLParams{URL: "http://1.1.1.1 &@2.2.2.2:33/4.4.4.4?1.1.1.1# @3.3.3.3/"},
442 | 		expected: ExtractResult{
443 | 			Scheme: "http://", UserInfo: "1.1.1.1 &", Domain: "2.2.2.2",
444 | 			RegisteredDomain: "2.2.2.2", Port: "33", Path: "/4.4.4.4?1.1.1.1# @3.3.3.3/", HostType: IPv4,
445 | 		}, description: "Whitespace in UserInfo"},
446 | 	{urlParams: URLParams{URL: "example.za./en"},
447 | 		expected:    ExtractResult{SubDomain: "example", Domain: "za", Path: "/en", HostType: HostName},
448 | 		description: "za has no 1st-level TLD | One trailing label separator",
449 | 	},
450 | 	{urlParams: URLParams{URL: "example.za.\u3002/en"},
451 | 		expected:    ExtractResult{SubDomain: "example", Domain: "za", Path: "/en", HostType: HostName},
452 | 		description: "za has no 1st-level TLD | 2 trailing label separators",
453 | 	},
454 | }
455 | var invalidTests = []extractTest{
456 | 	{urlParams: URLParams{URL: "localhost!"}, expected: ExtractResult{}, err: errs[8], description: "localhost + invalid character !"},
457 | 	{urlParams: URLParams{URL: "localhost+"}, expected: ExtractResult{}, err: errs[8], description: "localhost + invalid character +"},
458 | 	{urlParams: URLParams{URL: "localhost-"}, expected: ExtractResult{}, err: errs[8], description: "localhost + invalid character -"},
459 | 	{urlParams: URLParams{}, expected: ExtractResult{}, err: errs[9], description: "empty string"},
460 | 	{urlParams: URLParams{URL: "https://"}, expected: ExtractResult{Scheme: "https://"}, err: errs[9], description: "Scheme only"},
461 | 	{urlParams: URLParams{URL: "1b://example.com"}, expected: ExtractResult{}, err: errs[10], description: "Scheme beginning with non-alphabet (parser unsuccessfully tries to interpret runes after colon as port"},
462 | 	{urlParams: URLParams{URL: "maps.google.com.sg:8589934592/this/path/will/not/be/parsed"}, expected: ExtractResult{}, err: errs[10], description: "Invalid Port number"},
463 | 	{urlParams: URLParams{URL: "http://.\u3002127.0.0.1"},
464 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Consecutive label separators before IPv4 address",
465 | 	},
466 | 	{urlParams: URLParams{URL: "http://.\u3002[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]"},
467 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[0], description: "Consecutive label separators before IPv6 address",
468 | 	},
469 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789].."},
470 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[5], description: "Consecutive label separators after IPv6 address",
471 | 	},
472 | 	{urlParams: URLParams{URL: "http://example.com :50"},
473 | 		expected: ExtractResult{Scheme: "http://", Port: "50"}, err: errs[8], description: "Spaces between domain and Port/Path",
474 | 	},
475 | 	{urlParams: URLParams{URL: "http://  127.0.0.1"},
476 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Spaces before IPv4 address",
477 | 	},
478 | 	{urlParams: URLParams{URL: "http://127.0.0.1  :50"},
479 | 		expected: ExtractResult{Scheme: "http://", Port: "50"}, err: errs[8], description: "Spaces between IPv4 address and Port/Path",
480 | 	},
481 | 	{urlParams: URLParams{URL: "http://  [aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]"},
482 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[0], description: "Spaces before IPv6 address",
483 | 	},
484 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]  :50"},
485 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[5], description: "Spaces between IPv6 address and Port/Path",
486 | 	},
487 | 	{urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61\u3002fk"},
488 | 		expected: ExtractResult{Scheme: "https://"}, err: errs[6], description: "Consecutive label separators within Suffix",
489 | 	},
490 | 	{urlParams: URLParams{URL: "example.\u3002za/en"},
491 | 		expected: ExtractResult{Path: "/en"}, err: errs[6],
492 | 		description: "za has no 1st-level TLD | Consecutive label separators between labels",
493 | 	},
494 | 	{urlParams: URLParams{URL: ".\u3002a\uff61fk"}, expected: ExtractResult{}, err: errs[8], description: "eTLD only, multiple leading label separators"},
495 | 	{urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe.\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators between Domain and Suffix"},
496 | 	{urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to.\uff0ebe\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators between SubDomain and Domain"},
497 | 	{urlParams: URLParams{URL: "https://brb\u002ei\u3002.am.\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators within SubDomain"},
498 | 	{urlParams: URLParams{URL: "https://\uff0eexample.com"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Hostname starting with label separator"},
499 | 	{urlParams: URLParams{URL: "//server.example.com/path"}, expected: ExtractResult{Scheme: "//", SubDomain: "server", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/path", HostType: HostName}, description: "Double-slash only Scheme with subdomain"},
500 | 	{urlParams: URLParams{URL: "http://temasek"}, expected: ExtractResult{Scheme: "http://", Suffix: "temasek"}, err: errs[9], description: "Basic URL with eTLD only"},
501 | 	{urlParams: URLParams{URL: "http://temasek.this-tld-cannot-be-real"}, expected: ExtractResult{Scheme: "http://", SubDomain: "temasek", Domain: "this-tld-cannot-be-real", HostType: HostName}, description: "Basic URL with bad eTLD"},
502 | 	{urlParams: URLParams{URL: "http://temasek.temasek.this-tld-cannot-be-real"}, expected: ExtractResult{Scheme: "http://", SubDomain: "temasek.temasek", Domain: "this-tld-cannot-be-real", HostType: HostName}, description: "Basic URL with subdomain and bad eTLD"},
503 | 	{urlParams: URLParams{URL: "http://127.0.0.256"}, expected: ExtractResult{Scheme: "http://", SubDomain: "127.0.0", Domain: "256", HostType: HostName}, description: "Basic IPv4 Address URL with bad IP"},
504 | 	{urlParams: URLParams{URL: "http://127\uff0e0\u30020\uff61256:5000"},
505 | 		expected: ExtractResult{Scheme: "http://", SubDomain: "127\uff0e0\u30020", Port: "5000",
506 | 			Domain: "256", HostType: HostName}, description: "Basic IPv4 Address with Scheme and Port and bad IP | Internationalised label separators"},
507 | 	{urlParams: URLParams{URL: "http://192.168.01.1:5000"},
508 | 		expected:    ExtractResult{Scheme: "http://", SubDomain: "192.168.01", Domain: "1", Port: "5000", HostType: HostName},
509 | 		description: "Basic IPv4 Address with Scheme and Port and bad IP | octet with leading zero"},
510 | 	{urlParams: URLParams{URL: "http://a:b@xn--tub-1m9d15sfkkhsifsbqygyujjrw60.com"},
511 | 		expected: ExtractResult{Scheme: "http://", UserInfo: "a:b"}, err: errors.New("idna: invalid label \"tub-1m9d15sfkkhsifsbqygyujjrw60\""), description: "Invalid punycode Domain"},
512 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789:5000"},
513 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[3],
514 | 		description: "Basic IPv6 Address with Scheme and Port with no closing bracket"},
515 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:::]:5000"},
516 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[4],
517 | 		description: "Basic IPv6 Address with Scheme and Port and bad IP | odd number of empty hextets"},
518 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:fffffffffffffffff]:5000"},
519 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[4],
520 | 		description: "Basic IPv6 Address with Scheme and Port and bad IP | hextet too big"},
521 | 	{urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e256\u30020\uff611]:5000"},
522 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[4],
523 | 		description: "Basic IPv6 Address + trailing bad IPv4 address with Scheme and Port | Internationalised label separators"},
524 | 	{urlParams: URLParams{URL: "http://[::aBcD:ef01:2345:6789:aBcD:ef01:2345:127.255.0.1]:5000"},
525 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[4],
526 | 		description: "Malformed IPv6 Address with leading ellipsis and extra 16-bit chunk + trailing IPv4 address with Scheme and Port"},
527 | 	{urlParams: URLParams{URL: "[1::1::1:1:1:1:1:1]"},
528 | 		expected: ExtractResult{}, err: errs[4],
529 | 		description: "Malformed IPv6 Address with 2 consecutive double-colon"},
530 | 	{urlParams: URLParams{URL: "[1:1:1:1:1:1:1:1:::]"},
531 | 		expected: ExtractResult{}, err: errs[4],
532 | 		description: "Malformed IPv6 Address with trailing triple colon"},
533 | 	{urlParams: URLParams{URL: "http://["},
534 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[3],
535 | 		description: "Single opening square bracket"},
536 | 	{urlParams: URLParams{URL: "http://a["},
537 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[0],
538 | 		description: "Single opening square bracket after alphabet"},
539 | 	{urlParams: URLParams{URL: "http://]"},
540 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[1],
541 | 		description: "Single closing square bracket"},
542 | 	{urlParams: URLParams{URL: "http://a]"},
543 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[1],
544 | 		description: "Single closing square bracket after alphabet"},
545 | 	{urlParams: URLParams{URL: "http://]["},
546 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[1],
547 | 		description: "closing square bracket before opening square bracket"},
548 | 	{urlParams: URLParams{URL: "http://a]["},
549 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[1],
550 | 		description: "closing square bracket before opening square bracket after alphabet"},
551 | 	{urlParams: URLParams{URL: "http://[]"},
552 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[4],
553 | 		description: "Empty pair of square brackets"},
554 | 	{urlParams: URLParams{URL: "http://a[]"},
555 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[0],
556 | 		description: "Empty pair of square brackets after alphabet"},
557 | 	{urlParams: URLParams{URL: "http://a[127.0.0.1]"},
558 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[0],
559 | 		description: "IPv4 in square brackets after alphabet"},
560 | 	{urlParams: URLParams{URL: "http://a[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e255\u30020\uff611]"},
561 | 		expected: ExtractResult{Scheme: "http://"}, err: errs[0],
562 | 		description: "IPv6 in square brackets after alphabet"},
563 | 	{urlParams: URLParams{URL: "http://[127.0.0.1]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "IPv4 in square brackets"},
564 | 	{urlParams: URLParams{URL: "http://%78n--0.example.com"}, expected: ExtractResult{Scheme: "http://"}, err: errors.New(`idna: invalid label "0"`), description: "Bad percentage encoding"},
565 | 	{urlParams: URLParams{URL: "http://%78n--0.example.com", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://"}, err: errs[9], description: "Bad percentage encoding"},
566 | 
567 | 	// Test cases from net/ip-test.go
568 | 	{urlParams: URLParams{URL: "http://[-0.0.0.0]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
569 | 	{urlParams: URLParams{URL: "http://[0.-1.0.0]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
570 | 	{urlParams: URLParams{URL: "http://[0.0.-2.0]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
571 | 	{urlParams: URLParams{URL: "http://[0.0.0.-3]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
572 | 	{urlParams: URLParams{URL: "http://[127.0.0.256]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
573 | 	{urlParams: URLParams{URL: "http://[abc]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
574 | 	{urlParams: URLParams{URL: "http://[123:]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
575 | 	{urlParams: URLParams{URL: "http://[fe80::1%lo0]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
576 | 	{urlParams: URLParams{URL: "http://[fe80::1%911]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
577 | 	{urlParams: URLParams{URL: "http://[a1:a2:a3:a4::b1:b2:b3:b4]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
578 | 	{urlParams: URLParams{URL: "http://[127.001.002.003]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
579 | 	{urlParams: URLParams{URL: "http://[::ffff:127.001.002.003]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
580 | 	{urlParams: URLParams{URL: "http://[123.000.000.000]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
581 | 	{urlParams: URLParams{URL: "http://[1.2..4]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
582 | 	{urlParams: URLParams{URL: "http://[0123.0.0.1]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"},
583 | 	{urlParams: URLParams{URL: "git+ssh://www.!example.com/"}, expected: ExtractResult{Scheme: "git+ssh://", Path: "/"}, err: errs[8], description: "Full git+ssh URL with bad Domain"},
584 | }
585 | var internationalTLDTests = []extractTest{ // internationalised hosts: Unicode and punycode eTLDs, each with and without ConvertURLToPunyCode
586 | 	{urlParams: URLParams{URL: "https://𝖊𝖝𝖆𝖒𝖕𝖑𝖊.𝖈𝖔𝖒.𝖘𝖌", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "com.sg", RegisteredDomain: "example.com.sg", HostType: HostName}, description: "Mathematical alphanumeric Unicode letters in host (normalised to ASCII when converting to punycode)"},
587 | 	{urlParams: URLParams{URL: "http://example.敎育.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--lcvr32d.hk", RegisteredDomain: "example.xn--lcvr32d.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed international eTLD (result in punycode)"},
588 | 	{urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full international eTLD (result in punycode)"},
589 | 	{urlParams: URLParams{URL: "http://example.敎育.hk/地图/A/b/C?编号=42"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "敎育.hk", RegisteredDomain: "example.敎育.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed international eTLD (result in unicode)"},
590 | 	{urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "обр.срб", RegisteredDomain: "example.обр.срб", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full international eTLD (result in unicode)"},
591 | 	{urlParams: URLParams{URL: "http://example.xn--ciqpn.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed punycode international eTLD (result in punycode)"},
592 | 	{urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full punycode international eTLD (result in punycode)"},
593 | 	{urlParams: URLParams{URL: "http://example.xn--ciqpn.hk"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", HostType: HostName}, description: "Basic URL with mixed punycode international eTLD (no further conversion to punycode)"},
594 | 	{urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Basic URL with full punycode international eTLD (no further conversion to punycode)"},
595 | 	{urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac"}, expected: ExtractResult{Scheme: "http://", Domain: "xN--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xN--h1alffa9f.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Mixed case Punycode Domain with full punycode international eTLD (no further conversion to punycode) See: https://github.com/golang/go/issues/48778"},
596 | 	{urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "xn--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xn--h1alffa9f.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Mixed case Punycode Domain with full punycode international eTLD (with further conversion to punycode)"},
597 | }
598 | var domainOnlySingleTLDTests = []extractTest{ // hosts of the form <domain>.<single-label TLD> followed by a short path
599 | 	{urlParams: URLParams{URL: "https://example.ai/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "ai", RegisteredDomain: "example.ai", Path: "/en", HostType: HostName}, description: "Domain only + ai"},
600 | 	{urlParams: URLParams{URL: "https://example.co/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "co", RegisteredDomain: "example.co", Path: "/en", HostType: HostName}, description: "Domain only + co"},
601 | 	{urlParams: URLParams{URL: "https://example.sg/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "sg", RegisteredDomain: "example.sg", Path: "/en", HostType: HostName}, description: "Domain only + sg"},
602 | 	{urlParams: URLParams{URL: "https://example.tv/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "tv", RegisteredDomain: "example.tv", Path: "/en", HostType: HostName}, description: "Domain only + tv"},
603 | 	{urlParams: URLParams{URL: "https://example.%63om/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "%63om", RegisteredDomain: "example.%63om", Path: "/en", HostType: HostName}, description: "Domain only + %63om"}, // the percent-encoded label is expected back verbatim as the Suffix
604 | 	{urlParams: URLParams{URL: "https://example.za/en"}, expected: ExtractResult{Scheme: "https://", SubDomain: "example", Domain: "za", Path: "/en", HostType: HostName}, description: "Domain only + za | za has no 1st-level TLD"}, // no Suffix or RegisteredDomain expected: "za" lands in Domain and "example" in SubDomain
605 | }
606 | var pathTests = []extractTest{ // ':' and '[' ']' appearing only after the host must be returned untouched as part of Path
607 | 	{urlParams: URLParams{URL: "http://www.example.com/this:that"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/this:that", HostType: HostName}, description: "Colon in Path"},
608 | 	{urlParams: URLParams{URL: "http://example.com/oid/[order_id]"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/oid/[order_id]", HostType: HostName}, description: "Square brackets in Path"},
609 | }
610 | var wildcardTests = []extractTest{ // suffix-list wildcard rules ("*.ck", "*.fk") and the exception rule "!www.ck"
611 | 	{urlParams: URLParams{URL: "https://asdf.wwe.ck"},
612 | 		expected: ExtractResult{
613 | 			Scheme: "https://", Domain: "asdf", Suffix: "wwe.ck", // the label directly under "ck" is expected to be part of the Suffix
614 | 			RegisteredDomain: "asdf.wwe.ck", HostType: HostName},
615 | 		description: "Wildcard rule | *.ck"},
616 | 	{urlParams: URLParams{URL: "https://asdf.www.ck"},
617 | 		expected: ExtractResult{
618 | 			Scheme: "https://", SubDomain: "asdf", Domain: "www", Suffix: "ck", // "www" is excepted from the wildcard, so it is expected as the Domain
619 | 			RegisteredDomain: "www.ck", HostType: HostName},
620 | 		description: "Wildcard exception rule | !www.ck"},
621 | 	{urlParams: URLParams{URL: "https://brb.i.am.going.to.be.a.fk"},
622 | 		expected: ExtractResult{
623 | 			Scheme: "https://", SubDomain: "brb.i.am.going.to", Domain: "be", Suffix: "a.fk", // only one label under "fk" joins the Suffix; the rest split into Domain and SubDomain
624 | 			RegisteredDomain: "be.a.fk", HostType: HostName,
625 | 		}, description: "Wildcard rule | *.fk",
626 | 	},
627 | }
628 | var lookoutTests = []extractTest{ // some tests from lookout.net
629 | 	{urlParams: URLParams{URL: "http://GOO\u200b\u2060\ufeffgoo.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"},
630 | 	{urlParams: URLParams{URL: "http://\u0646\u0627\u0645\u0647\u200c\u0627\u06cc.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"},
631 | 	{urlParams: URLParams{URL: "http://\u0000\u0dc1\u0dca\u200d\u0dbb\u0dd3.com.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"},
632 | 	{urlParams: URLParams{URL: "http://\u0dc1\u0dca\u200d\u0dbb\u0dd3.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"},
633 | 	{urlParams: URLParams{URL: "http://look\ufeffout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"},
634 | 	{urlParams: URLParams{URL: "http://www\u00A0.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"},
635 | 	{urlParams: URLParams{URL: "http://\u1680.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"},
636 | 	{urlParams: URLParams{URL: "%68%74%74%70%3a%2f%2f%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d%2f.urltest.lookout.net"}, expected: ExtractResult{
637 | 		SubDomain: "%68%74%74%70%3a%2f%2f%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d%2f.urltest", Domain: "lookout",
638 | 		Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
639 | 	{urlParams: URLParams{URL: "http%3a%2f%2f%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d%2f.urltest.lookout.net"}, expected: ExtractResult{
640 | 		SubDomain: "http%3a%2f%2f%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d%2f.urltest", Domain: "lookout",
641 | 		Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
642 | 	{urlParams: URLParams{URL: "http://%25.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://",
643 | 		SubDomain: "%25.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
644 | 	{urlParams: URLParams{URL: "http://%25DOMAIN:foobar@urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", UserInfo: "%25DOMAIN:foobar",
645 | 		SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded UserInfo"},
646 | 	{urlParams: URLParams{URL: "http://%30%78%63%30%2e%30%32%35%30.01%2e.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%30%78%63%30%2e%30%32%35%30.01%2e.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
647 | 	{urlParams: URLParams{URL: "http://%30%78%63%30%2e%30%32%35%30.01.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%30%78%63%30%2e%30%32%35%30.01.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
648 | 	{urlParams: URLParams{URL: "http://%3g%78%63%30%2e%30%32%35%30%2E.01.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errors.New(`invalid URL escape "%3g"`), description: "Invalid Percentage encoded SubDomain"},
649 | 	{urlParams: URLParams{URL: "http://%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d.urltest.lookout.net%3a%38%30"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d.urltest.lookout", Domain: "net%3a%38%30", HostType: HostName}, description: "Percentage encoded SubDomain and Domain"},
650 | 	{urlParams: URLParams{URL: "http://%A1%C1.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%A1%C1.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
651 | 	{urlParams: URLParams{URL: "http://%E4%BD%A0%E5%A5%BD\u4f60\u597d.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%E4%BD%A0%E5%A5%BD\u4f60\u597d.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded and Unicode SubDomain"},
652 | 	{urlParams: URLParams{URL: "http://%ef%b7%90zyx.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%ef%b7%90zyx.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
653 | 	{urlParams: URLParams{URL: "http://%ef%bc%85%ef%bc%90%ef%bc%90.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%ef%bc%85%ef%bc%90%ef%bc%90.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
654 | 	{urlParams: URLParams{URL: "http://%ef%bc%85%ef%bc%94%ef%bc%91.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%ef%bc%85%ef%bc%94%ef%bc%91.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
655 | 	{urlParams: URLParams{URL: "http://%zz%66%a.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errors.New(`invalid URL escape "%zz"`), description: "Bad Percentage encoded SubDomain"},
656 | 	{urlParams: URLParams{URL: "http://-foo.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Start with dash"},
657 | 	{urlParams: URLParams{URL: "http:////////user:@urltest.lookout.net?foo"}, expected: ExtractResult{Scheme: "http:////////", UserInfo: "user:", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "?foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Multiple slashes in Scheme"},
658 | 	{urlParams: URLParams{URL: "http://192.168.0.1 hello.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Space in SubDomain"},
659 | 	{urlParams: URLParams{URL: "http://192.168.0.257.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "192.168.0.257.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "IPv4 Address in SubDomain"},
660 | 	{urlParams: URLParams{URL: "http://B\u00fccher.de.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "B\u00fccher.de.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
661 | 	{urlParams: URLParams{URL: "http://GOO \u3000goo.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Space in SubDomain"},
662 | 	{urlParams: URLParams{URL: "http://Goo%20 goo%7C|.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Space in SubDomain"},
663 | 	{urlParams: URLParams{URL: "http://[google.com.].urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "Square Brackets in SubDomain"},
664 | 	{urlParams: URLParams{URL: "http://[urltest.lookout.net]/"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "Square brackets but not IPv6"},
665 | 	{urlParams: URLParams{URL: "http://\u001f.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Control Character in SubDomain"},
666 | 	{urlParams: URLParams{URL: "http://\u0378.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u0378.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode U+0378"},
667 | 	{urlParams: URLParams{URL: "http://\u03b2\u03cc\u03bb\u03bf\u03c2.com.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u03b2\u03cc\u03bb\u03bf\u03c2.com.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
668 | 	{urlParams: URLParams{URL: "http://\u03b2\u03cc\u03bb\u03bf\u03c2.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u03b2\u03cc\u03bb\u03bf\u03c2.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
669 | 	{urlParams: URLParams{URL: "http://\u0442(.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Parenthesis in SubDomain"},
670 | 	{urlParams: URLParams{URL: "http://\u04c0.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
671 | 	{urlParams: URLParams{URL: "http://\u06dd.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
672 | 	{urlParams: URLParams{URL: "http://\u09dc.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u09dc.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
673 | 	{urlParams: URLParams{URL: "http://\u15ef\u15ef\u15ef.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u15ef\u15ef\u15ef.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
674 | 	{urlParams: URLParams{URL: "http://\u180e.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
675 | 	{urlParams: URLParams{URL: "http://\u1e9e.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u1e9e.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
676 | 	{urlParams: URLParams{URL: "http://\u2183.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
677 | 	{urlParams: URLParams{URL: "http://\u2665.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u2665.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
678 | 	{urlParams: URLParams{URL: "http://\u4f60\u597d\u4f60\u597d.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u4f60\u597d\u4f60\u597d.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
679 | 	{urlParams: URLParams{URL: "http://\ufdd0zyx.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
680 | 	{urlParams: URLParams{URL: "http://\uff05\uff10\uff10.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
681 | 	{urlParams: URLParams{URL: "http://\uff05\uff14\uff11.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
682 | 	{urlParams: URLParams{URL: "http://\uff10\uff38\uff43\uff10\uff0e\uff10\uff12\uff15\uff10\uff0e\uff10\uff11.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\uff10\uff38\uff43\uff10\uff0e\uff10\uff12\uff15\uff10\uff0e\uff10\uff11.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
683 | 	{urlParams: URLParams{URL: "http://\uff27\uff4f.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\uff27\uff4f.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
684 | 	{urlParams: URLParams{URL: "http://ab--cd.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "ab--cd.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Bad double-hyphen in SubDomain (still accepted)"},
685 | 	{urlParams: URLParams{URL: "http://fa\u00df.de.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "fa\u00df.de.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
686 | 	{urlParams: URLParams{URL: "http://foo-.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "foo-.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Bad SubDomain label end with dash (still accepted)"},
687 | 	{urlParams: URLParams{URL: "http://foo\u0300.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "foo\u0300.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
688 | 	{urlParams: URLParams{URL: "http://gOoGle.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "gOoGle.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Mixed case letters"},
689 | 	{urlParams: URLParams{URL: "http://hello%00.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "hello%00.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"},
690 | 	{urlParams: URLParams{URL: "http://look\u0341out.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u0341out.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
691 | 	{urlParams: URLParams{URL: "http://look\u034fout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u034fout.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
692 | 	{urlParams: URLParams{URL: "http://look\u05beout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u05beout.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
693 | 	{urlParams: URLParams{URL: "http://look\u202eout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
694 | 	{urlParams: URLParams{URL: "http://look\u2060.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u2060.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
695 | 	{urlParams: URLParams{URL: "http://look\u206bout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u206bout.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
696 | 	{urlParams: URLParams{URL: "http://look\u2ff0out.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
697 | 	{urlParams: URLParams{URL: "http://look\ufffaout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"},
698 | 	{urlParams: URLParams{URL: "http://uRLTest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "uRLTest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Mixed case letters"},
699 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Simple SubDomain+Domain"},
700 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/%20foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%20foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
701 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/%3A%3a%3C%3c"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%3A%3a%3C%3c", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
702 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/%7Ffp3%3Eju%3Dduvgw%3Dd"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%7Ffp3%3Eju%3Dduvgw%3Dd", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
703 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/%A1%C1/?foo=%EF%BD%81"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%A1%C1/?foo=%EF%BD%81", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
704 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/%A1%C1/?foo=???"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%A1%C1/?foo=???", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
705 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/%EF%BD%81/?foo=%A1%C1"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%EF%BD%81/?foo=%A1%C1", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
706 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/(%28:%3A%29)"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/(%28:%3A%29)", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Parentheses in Path"},
707 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/././foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/././foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
708 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/./.foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/./.foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
709 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net////../.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "////../..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
710 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?%02hello%7f bye"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?%02hello%7f bye", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Space in Path"},
711 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?%40%41123"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?%40%41123", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
712 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/???/?foo=%A1%C1"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/???/?foo=%A1%C1", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Consecutive question marks"},
713 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?D%C3%BCrst"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?D%C3%BCrst", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
714 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?D%FCrst"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?D%FCrst", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
715 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?as?df"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?as?df", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Multiple question marks"},
716 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?foo=bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?foo=bar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Path with Query Parameters"},
717 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?q=&lt;asdf&gt;"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?q=&lt;asdf&gt;", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Path with Query Parameters"},
718 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?q=\"asdf\""}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?q=\"asdf\"", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Path with inverted commas"},
719 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/?q=\u4f60\u597d"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?q=\u4f60\u597d", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"},
720 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/@asdf%40"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/@asdf%40", RegisteredDomain: "lookout.net", HostType: HostName}, description: "@ in Path"},
721 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/D%C3%BCrst"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/D%C3%BCrst", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
722 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/D%FCrst"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/D%FCrst", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
723 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/\u2025/foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\u2025/foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"},
724 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/\u202e/foo/\u202d/bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\u202e/foo/\u202d/bar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"},
725 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/\u4f60\u597d\u4f60\u597d"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\u4f60\u597d\u4f60\u597d", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"},
726 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/\ufdd0zyx"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\ufdd0zyx", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"},
727 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/\ufeff/foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\ufeff/foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"},
728 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Simple SubDomain+Domain+Path"},
729 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo    bar/?   foo   =   bar     #    foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo    bar/?   foo   =   bar     #    foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Space in Path"},
730 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Trailing percentage sign in Path"},
731 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%00%51"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%00%51", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
732 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
733 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2Ehtml"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2Ehtml", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
734 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2\u00c2\u00a9zbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2\u00c2\u00a9zbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"},
735 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2fbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2fbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
736 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2zbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2zbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
737 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%3fbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%3fbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
738 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo%41%7a"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%41%7a", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
739 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/%2e"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/%2e", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
740 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/%2e%2"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/%2e%2", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
741 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/%2e./%2e%2e/.%2e/%2e.bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/%2e./%2e%2e/.%2e/%2e.bar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"},
742 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/.", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
743 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/../../.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/../../..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
744 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/../../../ton"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/../../../ton", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
745 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/..bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/..bar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
746 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/./"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/./", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
747 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar/.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar/..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
748 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar/../"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar/../", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
749 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar/../ton"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar/../ton", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
750 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar/../ton/../../a"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar/../ton/../../a", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
751 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar//.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar//..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path, Multiple slashes"},
752 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar//../.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar//../..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"},
753 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo?bar=baz#"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo?bar=baz#", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Query Parameters in Path"},
754 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo\\tbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo\\tbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Backslash in Path"},
755 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net/foo\t\ufffd%91"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo\t\ufffd%91", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Tab in Path"},
756 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net:80/"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Port: "80", RegisteredDomain: "lookout.net", Path: "/", HostType: HostName}, description: "Port"},
757 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net::80::443/"}, expected: ExtractResult{Scheme: "http://"}, err: errs[10], description: "Bad Port"},
758 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net::==80::==443::/"}, expected: ExtractResult{Scheme: "http://"}, err: errs[10], description: "Bad Port"},
759 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net\\\\foo\\\\bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", Path: "\\\\foo\\\\bar", HostType: HostName}, description: "Multiple backslashes in Path"},
760 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net\u2a7480/"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest.lookout", Domain: "net\u2a7480", Path: "/", HostType: HostName}, description: "Unicode in Domain"},
761 | 	{urlParams: URLParams{URL: "http://urltest.lookout.net\uff0ffoo/"}, expected: ExtractResult{Scheme: "http://", Path: "/"}, err: errs[8], description: "Unicode in Domain"},
762 | 	{urlParams: URLParams{URL: "http://www.foo\u3002bar.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www.foo\u3002bar.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
763 | 	{urlParams: URLParams{URL: "http://www.loo\u0138out.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www.loo\u0138out.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
764 | 	{urlParams: URLParams{URL: "http://www.lookout.\u0441\u043e\u043c.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www.lookout.\u0441\u043e\u043c.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
765 | 	{urlParams: URLParams{URL: "http://www.lookout.net\uff1a80.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Reject full-width colon"},
766 | 	{urlParams: URLParams{URL: "http://www.lookout\u2027net.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www.lookout\u2027net.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"},
767 | 	{urlParams: URLParams{URL: "http://www\u2025urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid Character"},
768 | 	{urlParams: URLParams{URL: "http://xn--0.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errors.New("idna: invalid label \"0\""), description: "Invalid Punycode"},
769 | 	{urlParams: URLParams{URL: "http:\\\\\\\\urltest.lookout.net\\\\foo"}, expected: ExtractResult{Scheme: "http:\\\\\\\\", SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", Path: "\\\\foo", HostType: HostName}, description: "Multiple forward slashes in Scheme"},
770 | 	{urlParams: URLParams{URL: "http:///\\/\\/\\/\\/urltest.lookout.net"}, expected: ExtractResult{Scheme: "http:///\\/\\/\\/\\/", SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Multiple mixed slashes in Scheme"},
771 | }
772 | 
773 | func TestExtract(t *testing.T) {
774 | 	testPSLFilePath, ok := getTestPSLFilePath()
775 | 	if !ok {
776 | 		t.Errorf("Cannot get path to current module file")
777 | 	}
778 | 	extractorWithPrivateSuffix, _ := New(SuffixListParams{
779 | 		CacheFilePath:        testPSLFilePath,
780 | 		IncludePrivateSuffix: true,
781 | 	})
782 | 	extractorWithoutPrivateSuffix, _ := New(SuffixListParams{
783 | 		CacheFilePath:        testPSLFilePath,
784 | 		IncludePrivateSuffix: false,
785 | 	})
786 | 	for _, testCollection := range []([]extractTest){
787 | 		schemeTests,
788 | 		noSchemeTests,
789 | 		userInfoTests,
790 | 		ipv4Tests,
791 | 		ipv6Tests,
792 | 		ignoreSubDomainsTests,
793 | 		privateSuffixTests,
794 | 		periodsAndWhiteSpacesTests,
795 | 		invalidTests,
796 | 		internationalTLDTests,
797 | 		domainOnlySingleTLDTests,
798 | 		pathTests,
799 | 		wildcardTests,
800 | 		lookoutTests,
801 | 	} {
802 | 		for _, test := range testCollection {
803 | 			var extractor *FastTLD
804 | 			if test.includePrivateSuffix {
805 | 				extractor = extractorWithPrivateSuffix
806 | 			} else {
807 | 				extractor = extractorWithoutPrivateSuffix
808 | 			}
809 | 			res, err := extractor.Extract(test.urlParams)
810 | 
811 | 			if output := reflect.DeepEqual(res,
812 | 				test.expected); !output {
813 | 				t.Errorf("%+q | Output %q not equal to expected output %q | %q",
814 | 					test.urlParams.URL, res, test.expected, test.description)
815 | 			}
816 | 
817 | 			if !(err == nil && test.err == nil) &&
818 | 				((err == nil && test.err != nil) ||
819 | 					(err != nil && test.err == nil) ||
820 | 					!reflect.DeepEqual(err.Error(),
821 | 						test.err.Error())) {
822 | 				t.Errorf("%+q | Error %v not equal to expected error %v | %q",
823 | 					test.urlParams.URL, err, test.err, test.description)
824 | 			}
825 | 		}
826 | 	}
827 | }
828 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/elliotwutingfeng/go-fasttld
 2 | 
 3 | go 1.23.0
 4 | 
 5 | toolchain go1.24.3
 6 | 
 7 | require (
 8 | 	github.com/fatih/color v1.18.0
 9 | 	github.com/joeguo/tldextract v0.0.0-20220507100122-d83daa6adef8
10 | 	github.com/jpillora/go-tld v1.2.1
11 | 	github.com/karlseguin/intset v1.0.3-0.20221130142345-37ee0d7df651
12 | 	github.com/mjd2021usa/tldextract v0.9.2
13 | 	github.com/spf13/afero v1.14.0
14 | 	github.com/spf13/cobra v1.9.1
15 | 	github.com/tidwall/hashmap v1.8.1
16 | 	golang.org/x/net v0.40.0
17 | )
18 | 
19 | require (
20 | 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
21 | 	github.com/klauspost/cpuid/v2 v2.2.9 // indirect
22 | 	github.com/mattn/go-colorable v0.1.14 // indirect
23 | 	github.com/mattn/go-isatty v0.0.20 // indirect
24 | 	github.com/spf13/pflag v1.0.6 // indirect
25 | 	github.com/zeebo/xxh3 v1.0.2 // indirect
26 | 	golang.org/x/sys v0.33.0 // indirect
27 | 	golang.org/x/text v0.25.0 // indirect
28 | )
29 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
 2 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 3 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 4 | github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM=
 5 | github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU=
 6 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 7 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
 8 | github.com/joeguo/tldextract v0.0.0-20220507100122-d83daa6adef8 h1:Ig0ESdy6JtHI17vsb7L+UlUFpoZctKfvBZplcILeL6g=
 9 | github.com/joeguo/tldextract v0.0.0-20220507100122-d83daa6adef8/go.mod h1:oGfutRjaB95239mjFVwofaOPTwuS3vb71ZLIGCEb36g=
10 | github.com/jpillora/go-tld v1.2.1 h1:kDKOkmXLlskqjcvNs7w5XHLep7c8WM7Xd4HQjxllVMk=
11 | github.com/jpillora/go-tld v1.2.1/go.mod h1:plzIl7xr5UWKGy7R+giuv+L/nOjrPjsoWxy/ST9OBUk=
12 | github.com/karlseguin/intset v1.0.3-0.20221130142345-37ee0d7df651 h1:bTfsnv9ZwdVc7mPWBEhd+F5pBeJ4P4WYVxaPuoZwmPE=
13 | github.com/karlseguin/intset v1.0.3-0.20221130142345-37ee0d7df651/go.mod h1:hJ3siwEnJbQ92zdVj7Q2OyyMrMZ7LZAIRYDZr0IAAqc=
14 | github.com/klauspost/cpuid/v2 v2.2.9 h1:66ze0taIn2H33fBvCkXuv9BmCwDfafmiIVpKV9kKGuY=
15 | github.com/klauspost/cpuid/v2 v2.2.9/go.mod h1:rqkxqrZ1EhYM9G+hXH7YdowN5R5RGN6NK4QwQ3WMXF8=
16 | github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
17 | github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8=
18 | github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
19 | github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
20 | github.com/mjd2021usa/tldextract v0.9.2 h1:Tkz+q0q4t4NvScACm3+bXZJY9lRlFeClopw0AkhAbA4=
21 | github.com/mjd2021usa/tldextract v0.9.2/go.mod h1:GB3fhxYasOChxf3Oo5Or6H4uzl8dhEx3wA7CQf8i4aI=
22 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
23 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
24 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
25 | github.com/spf13/afero v1.14.0 h1:9tH6MapGnn/j0eb0yIXiLjERO8RB6xIVZRDCX7PtqWA=
26 | github.com/spf13/afero v1.14.0/go.mod h1:acJQ8t0ohCGuMN3O+Pv0V0hgMxNYDlvdk+VTfyZmbYo=
27 | github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo=
28 | github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0=
29 | github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o=
30 | github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
31 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
32 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
33 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
34 | github.com/tidwall/hashmap v1.8.1 h1:hXNzBfSJ2Jwvt0lbkWD59O/r3OfatSIcbuWT0VKEVns=
35 | github.com/tidwall/hashmap v1.8.1/go.mod h1:v+0qJrJn7l+l2dB8+fAFpC62p2G0SMP2Teu8ejkebg8=
36 | github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
37 | github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
38 | github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
39 | github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA=
40 | golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
41 | golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY=
42 | golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds=
43 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
44 | golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
45 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
46 | golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
47 | golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
48 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
49 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
50 | golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4=
51 | golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA=
52 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
53 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
54 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
55 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
56 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
57 | 


--------------------------------------------------------------------------------
/net.go:
--------------------------------------------------------------------------------
  1 | package fasttld
  2 | 
  3 | import "unicode/utf8"
  4 | 
// IP address lengths (bytes).
//
// iPv4len and iPv6len are the byte lengths of an IPv4 and an IPv6 address.
// lenDiff is their difference (12); isIPv6 compares its running byte count
// against lenDiff to decide whether a trailing dotted IPv4 part is in the
// right place and still fits.
const (
	iPv4len int = 4
	iPv6len int = 16
	lenDiff     = iPv6len - iPv4len
)

// Bigger than we need, not too big to worry about overflow
//
// dtoi and xtoi stop parsing and report failure as soon as the value they
// are accumulating reaches big.
const big int = 0xFFFFFF
 14 | 
 15 | // Decimal to integer.
 16 | // Returns number, characters consumed, success.
 17 | func dtoi(s string) (n int, i int, ok bool) {
 18 | 	n = 0
 19 | 	for i = 0; i < len(s) && '0' <= s[i] && s[i] <= '9'; i++ {
 20 | 		n = n*10 + int(s[i]-'0')
 21 | 		if n >= big {
 22 | 			return big, i, false
 23 | 		}
 24 | 	}
 25 | 	if i == 0 {
 26 | 		return 0, 0, false
 27 | 	}
 28 | 	return n, i, true
 29 | }
 30 | 
 31 | // Hexadecimal to integer.
 32 | // Returns number, characters consumed, success.
 33 | func xtoi(s string) (n int, i int, ok bool) {
 34 | 	n = 0
 35 | 	for i = 0; i < len(s); i++ {
 36 | 		if '0' <= s[i] && s[i] <= '9' {
 37 | 			n *= 16
 38 | 			n += int(s[i] - '0')
 39 | 		} else if 'a' <= s[i] && s[i] <= 'f' {
 40 | 			n *= 16
 41 | 			n += int(s[i]-'a') + 10
 42 | 		} else if 'A' <= s[i] && s[i] <= 'F' {
 43 | 			n *= 16
 44 | 			n += int(s[i]-'A') + 10
 45 | 		} else {
 46 | 			break
 47 | 		}
 48 | 		if n >= big {
 49 | 			return 0, i, false
 50 | 		}
 51 | 	}
 52 | 	if i == 0 {
 53 | 		return 0, i, false
 54 | 	}
 55 | 	return n, i, true
 56 | }
 57 | 
 58 | // isIPv4 returns true if s is a literal IPv4 address
 59 | //
 60 | // trailing label separators are accepted
 61 | func isIPv4(s string) bool {
 62 | 	s = fastTrim(s, labelSeparatorsRuneSet, trimRight)
 63 | 	for i := 0; i < iPv4len; i++ {
 64 | 		if len(s) == 0 {
 65 | 			// Missing octets.
 66 | 			return false
 67 | 		}
 68 | 		if i > 0 {
 69 | 			r, size := utf8.DecodeRuneInString(s)
 70 | 			if !labelSeparatorsRuneSet.Exists(r) {
 71 | 				return false
 72 | 			}
 73 | 			s = s[size:]
 74 | 		}
 75 | 		n, c, ok := dtoi(s)
 76 | 		if !ok || n > 0xFF {
 77 | 			return false
 78 | 		}
 79 | 		if c > 1 && s[0] == '0' {
 80 | 			// Reject non-zero components with leading zeroes.
 81 | 			return false
 82 | 		}
 83 | 		s = s[c:]
 84 | 	}
 85 | 	return len(s) == 0
 86 | }
 87 | 
 88 | // isIPv6 returns true if s is a literal IPv6 address as described in RFC 4291
 89 | // and RFC 5952.
 90 | func isIPv6(s string) bool {
 91 | 	ellipsis := -1 // position of ellipsis in ip
 92 | 
 93 | 	// Might have leading ellipsis
 94 | 	if len(s) >= 2 && s[0] == ':' && s[1] == ':' {
 95 | 		ellipsis = 0
 96 | 		s = s[2:]
 97 | 		// Might be only ellipsis
 98 | 		if len(s) == 0 {
 99 | 			return true
100 | 		}
101 | 	}
102 | 
103 | 	// Loop, parsing hex numbers followed by colon.
104 | 	i := 0
105 | 	for i < iPv6len {
106 | 		// Hex number.
107 | 		n, c, ok := xtoi(s)
108 | 		if !ok || n > 0xFFFF {
109 | 			return false
110 | 		}
111 | 
112 | 		// If followed by any separator in labelSeparators, might be in trailing IPv4.
113 | 		if c < len(s) && labelSeparatorsRuneSet.Exists([]rune(s[c:])[0]) {
114 | 			if ellipsis < 0 && i != lenDiff {
115 | 				// Not the right place.
116 | 				return false
117 | 			}
118 | 			if i > lenDiff {
119 | 				// Not enough room.
120 | 				return false
121 | 			}
122 | 			if !isIPv4(s) {
123 | 				return false
124 | 			}
125 | 			s = ""
126 | 			i += iPv4len
127 | 			break
128 | 		}
129 | 
130 | 		// Save this 16-bit chunk.
131 | 		i += 2
132 | 
133 | 		// Stop at end of string.
134 | 		s = s[c:]
135 | 		if len(s) == 0 {
136 | 			break
137 | 		}
138 | 
139 | 		// Otherwise must be followed by colon and more.
140 | 		if s[0] != ':' || len(s) == 1 {
141 | 			return false
142 | 		}
143 | 		s = s[1:]
144 | 
145 | 		// Look for ellipsis.
146 | 		if s[0] == ':' {
147 | 			if ellipsis >= 0 { // already have one
148 | 				return false
149 | 			}
150 | 			ellipsis = i
151 | 			s = s[1:]
152 | 			if len(s) == 0 { // can be at end
153 | 				break
154 | 			}
155 | 		}
156 | 	}
157 | 
158 | 	// Must have used entire string.
159 | 	if len(s) != 0 {
160 | 		return false
161 | 	}
162 | 
163 | 	// If didn't parse enough, expand ellipsis.
164 | 	if i < iPv6len {
165 | 		if ellipsis < 0 {
166 | 			return false
167 | 		}
168 | 	} else if ellipsis >= 0 {
169 | 		// Ellipsis must represent at least one 0 group.
170 | 		return false
171 | 	}
172 | 	return true
173 | }
174 | 


--------------------------------------------------------------------------------
/net_test.go:
--------------------------------------------------------------------------------
 1 | package fasttld
 2 | 
 3 | import "testing"
 4 | 
// looksLikeIPAddressTest is one table entry for TestIsIPv4 and TestIsIPv6:
// maybeIPAddress is the candidate string passed to isIPv4 / isIPv6, and
// isIPAddress is the result the function is expected to return for it.
type looksLikeIPAddressTest struct {
	maybeIPAddress string
	isIPAddress    bool
}
 9 | 
10 | var looksLikeIPv4AddressTests = []looksLikeIPAddressTest{
11 | 	{maybeIPAddress: "",
12 | 		isIPAddress: false,
13 | 	},
14 | 	{maybeIPAddress: " ",
15 | 		isIPAddress: false,
16 | 	},
17 | 	{maybeIPAddress: "google.com",
18 | 		isIPAddress: false,
19 | 	},
20 | 	{maybeIPAddress: "1google.com",
21 | 		isIPAddress: false,
22 | 	},
23 | 	{maybeIPAddress: "127.0.0.1",
24 | 		isIPAddress: true,
25 | 	},
26 | 	{maybeIPAddress: "127.0.0.256",
27 | 		isIPAddress: false,
28 | 	},
29 | }
30 | 
31 | var looksLikeIPv6AddressTests = []looksLikeIPAddressTest{
32 | 	{maybeIPAddress: "",
33 | 		isIPAddress: false,
34 | 	},
35 | 	{maybeIPAddress: " ",
36 | 		isIPAddress: false,
37 | 	},
38 | 	{maybeIPAddress: "google.com",
39 | 		isIPAddress: false,
40 | 	},
41 | 	{maybeIPAddress: "1google.com",
42 | 		isIPAddress: false,
43 | 	},
44 | 	{maybeIPAddress: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789",
45 | 		isIPAddress: true,
46 | 	},
47 | 	{maybeIPAddress: "gGgG:ef01:2345:6789:aBcD:ef01:2345:6789",
48 | 		isIPAddress: false,
49 | 	},
50 | 	{maybeIPAddress: "aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1",
51 | 		isIPAddress: true,
52 | 	},
53 | 	{maybeIPAddress: "aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.256",
54 | 		isIPAddress: false,
55 | 	},
56 | }
57 | 
58 | func TestIsIPv4(t *testing.T) {
59 | 	for _, test := range looksLikeIPv4AddressTests {
60 | 		isIPv4Address := isIPv4(test.maybeIPAddress)
61 | 		if isIPv4Address != test.isIPAddress {
62 | 			t.Errorf("Output %t not equal to expected %t",
63 | 				isIPv4Address, test.isIPAddress)
64 | 		}
65 | 	}
66 | }
67 | 
68 | func TestIsIPv6(t *testing.T) {
69 | 	for _, test := range looksLikeIPv6AddressTests {
70 | 		isIPv6Address := isIPv6(test.maybeIPAddress)
71 | 		if isIPv6Address != test.isIPAddress {
72 | 			t.Errorf("Output %t not equal to expected %t",
73 | 				isIPv6Address, test.isIPAddress)
74 | 		}
75 | 	}
76 | }
77 | 


--------------------------------------------------------------------------------
/print.go:
--------------------------------------------------------------------------------
 1 | package fasttld
 2 | 
 3 | import (
 4 | 	"github.com/fatih/color"
 5 | )
 6 | 
 7 | // PrintRes pretty-prints URL components from ExtractResult
 8 | func PrintRes(url string, res ExtractResult) {
 9 | 	var leftAttrsFilled = []color.Attribute{color.FgHiYellow, color.Bold}
10 | 	var leftAttrsBlank = []color.Attribute{color.FgHiBlack}
11 | 	var rightAttrs = []color.Attribute{color.FgHiWhite}
12 | 
13 | 	if len(url) != 0 {
14 | 		color.New(leftAttrsFilled...).Print("              url: ")
15 | 	} else {
16 | 		color.New(leftAttrsBlank...).Print("              url: ")
17 | 	}
18 | 	color.New(rightAttrs...).Println(url)
19 | 
20 | 	if len(res.Scheme) != 0 {
21 | 		color.New(leftAttrsFilled...).Print("           scheme: ")
22 | 	} else {
23 | 		color.New(leftAttrsBlank...).Print("           scheme: ")
24 | 	}
25 | 	color.New(rightAttrs...).Println(res.Scheme)
26 | 
27 | 	if len(res.UserInfo) != 0 {
28 | 		color.New(leftAttrsFilled...).Print("         userinfo: ")
29 | 	} else {
30 | 		color.New(leftAttrsBlank...).Print("         userinfo: ")
31 | 	}
32 | 	color.New(rightAttrs...).Println(res.UserInfo)
33 | 
34 | 	if len(res.SubDomain) != 0 {
35 | 		color.New(leftAttrsFilled...).Print("        subdomain: ")
36 | 	} else {
37 | 		color.New(leftAttrsBlank...).Print("        subdomain: ")
38 | 	}
39 | 	color.New(rightAttrs...).Println(res.SubDomain)
40 | 
41 | 	if len(res.Domain) != 0 {
42 | 		color.New(leftAttrsFilled...).Print("           domain: ")
43 | 	} else {
44 | 		color.New(leftAttrsBlank...).Print("           domain: ")
45 | 	}
46 | 	color.New(rightAttrs...).Println(res.Domain)
47 | 
48 | 	if len(res.Suffix) != 0 {
49 | 		color.New(leftAttrsFilled...).Print("           suffix: ")
50 | 	} else {
51 | 		color.New(leftAttrsBlank...).Print("           suffix: ")
52 | 	}
53 | 	color.New(rightAttrs...).Println(res.Suffix)
54 | 
55 | 	if len(res.RegisteredDomain) != 0 {
56 | 		color.New(leftAttrsFilled...).Print("registered domain: ")
57 | 	} else {
58 | 		color.New(leftAttrsBlank...).Print("registered domain: ")
59 | 	}
60 | 	color.New(rightAttrs...).Println(res.RegisteredDomain)
61 | 
62 | 	if len(res.Port) != 0 {
63 | 		color.New(leftAttrsFilled...).Print("             port: ")
64 | 	} else {
65 | 		color.New(leftAttrsBlank...).Print("             port: ")
66 | 	}
67 | 	color.New(rightAttrs...).Println(res.Port)
68 | 
69 | 	if len(res.Path) != 0 {
70 | 		color.New(leftAttrsFilled...).Print("             path: ")
71 | 	} else {
72 | 		color.New(leftAttrsBlank...).Print("             path: ")
73 | 	}
74 | 	color.New(rightAttrs...).Println(res.Path)
75 | 
76 | 	if res.HostType != 0 {
77 | 		color.New(color.FgHiBlue, color.Bold).Print("        host type: ")
78 | 	} else {
79 | 		color.New(leftAttrsBlank...).Print("        host type: ")
80 | 	}
81 | 	switch res.HostType {
82 | 	case HostName:
83 | 		color.New(rightAttrs...).Println("hostname")
84 | 	case IPv4:
85 | 		color.New(rightAttrs...).Println("ipv4 address")
86 | 	case IPv6:
87 | 		color.New(rightAttrs...).Println("ipv6 address")
88 | 	default:
89 | 		color.New(rightAttrs...).Println()
90 | 	}
91 | 
92 | 	color.New().Println()
93 | }
94 | 


--------------------------------------------------------------------------------
/print_test.go:
--------------------------------------------------------------------------------
 1 | package fasttld
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
// TestPrintRes is a smoke test: it calls PrintRes with an empty result, a
// fully populated hostname result and results tagged as IPv4 and IPv6.
// It makes no assertions about the printed output.
func TestPrintRes(t *testing.T) {
	// Zero-value result with an empty URL.
	PrintRes("", ExtractResult{})
	res := ExtractResult{}
	res.Scheme = "https://"
	res.UserInfo = "user"
	res.SubDomain = "a.subdomain"
	res.Domain = "example"
	res.Suffix = "a%63.uk"
	res.RegisteredDomain = "example.a%63.uk"
	res.Port = "5000"
	res.Path = "/a/b?id=42"
	res.HostType = HostName
	PrintRes("https://user@a.subdomain.example.a%63.uk:5000/a/b?id=42", res)
	// Only the host type is set for the IP address cases.
	res = ExtractResult{}
	res.HostType = IPv4
	PrintRes("1.1.1.1", res)
	res.HostType = IPv6
	PrintRes("[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]", res)
}
26 | 


--------------------------------------------------------------------------------
/psl.go:
--------------------------------------------------------------------------------
  1 | package fasttld
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"errors"
  6 | 	"fmt"
  7 | 	"log"
  8 | 	"net/http"
  9 | 	"os"
 10 | 	"path/filepath"
 11 | 	"runtime"
 12 | 	"strings"
 13 | 	"time"
 14 | 
 15 | 	"github.com/spf13/afero"
 16 | 	"golang.org/x/net/idna"
 17 | )
 18 | 
// publicSuffixListSources lists the URLs from which the Public Suffix List
// is downloaded. (*FastTLD).Update passes it to update, which tries the
// entries in order and stops at the first one that succeeds.
var publicSuffixListSources = []string{
	"https://publicsuffix.org/list/public_suffix_list.dat",
	"https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat",
}
 23 | 
// suffixes holds the suffix lists parsed from a Public Suffix List.
type suffixes struct {
	// publicSuffixes are the entries read before the
	// "// ===BEGIN PRIVATE DOMAINS===" marker.
	publicSuffixes []string
	// privateSuffixes are the entries read after that marker.
	privateSuffixes []string
	// allSuffixes contains every entry, public and private.
	allSuffixes []string
}
 29 | 
 30 | func processLine(rawLine string, psl suffixes, isPrivateSuffix bool) (suffixes, bool) {
 31 | 	line := strings.TrimSpace(rawLine)
 32 | 	if "// ===BEGIN PRIVATE DOMAINS===" == line {
 33 | 		isPrivateSuffix = true
 34 | 	}
 35 | 	if len(line) == 0 || strings.HasPrefix(line, "//") {
 36 | 		return psl, isPrivateSuffix
 37 | 	}
 38 | 	suffix, err := idna.ToASCII(line)
 39 | 	if err != nil {
 40 | 		// skip line if unable to convert to ascii
 41 | 		log.Println(line, '|', err)
 42 | 		return psl, isPrivateSuffix
 43 | 	}
 44 | 	if isPrivateSuffix {
 45 | 		psl.privateSuffixes = append(psl.privateSuffixes, suffix)
 46 | 		if suffix != line {
 47 | 			// add non-punycode version if it is different from punycode version
 48 | 			psl.privateSuffixes = append(psl.privateSuffixes, line)
 49 | 		}
 50 | 	} else {
 51 | 		psl.publicSuffixes = append(psl.publicSuffixes, suffix)
 52 | 		if suffix != line {
 53 | 			// add non-punycode version if it is different from punycode version
 54 | 			psl.publicSuffixes = append(psl.publicSuffixes, line)
 55 | 		}
 56 | 	}
 57 | 	psl.allSuffixes = append(psl.allSuffixes, suffix)
 58 | 	if suffix != line {
 59 | 		// add non-punycode version if it is different from punycode version
 60 | 		psl.allSuffixes = append(psl.allSuffixes, line)
 61 | 	}
 62 | 	return psl, isPrivateSuffix
 63 | }
 64 | 
 65 | // getPublicSuffixList retrieves Public Suffixes and Private Suffixes from Public Suffix list located at cacheFilePath.
 66 | //
 67 | // publicSuffixes: ICANN domains. Example: com, net, org etc.
 68 | //
 69 | // privateSuffixes: PRIVATE domains. Example: blogspot.co.uk, appspot.com etc.
 70 | //
 71 | // allSuffixes: Both ICANN and PRIVATE domains.
 72 | func getPublicSuffixList(cacheFilePath string) (suffixes, error) {
 73 | 	var psl suffixes
 74 | 	b, err := os.ReadFile(cacheFilePath)
 75 | 	if err != nil {
 76 | 		log.Println(err)
 77 | 		return psl, err
 78 | 	}
 79 | 	var isPrivateSuffix bool
 80 | 	for _, line := range strings.Split(string(b), "\n") {
 81 | 		psl, isPrivateSuffix = processLine(line, psl, isPrivateSuffix)
 82 | 	}
 83 | 	return psl, nil
 84 | }
 85 | 
 86 | // getHardcodedPublicSuffixList retrieves Public Suffixes and Private Suffixes from hardcoded Public Suffix list.
 87 | //
 88 | // publicSuffixes: ICANN domains. Example: com, net, org etc.
 89 | //
 90 | // privateSuffixes: PRIVATE domains. Example: blogspot.co.uk, appspot.com etc.
 91 | //
 92 | // allSuffixes: Both ICANN and PRIVATE domains.
 93 | func getHardcodedPublicSuffixList() (suffixes, error) {
 94 | 	var psl suffixes
 95 | 	var isPrivateSuffix bool
 96 | 	for _, line := range strings.Split(hardcodedPSL, "\n") {
 97 | 		psl, isPrivateSuffix = processLine(line, psl, isPrivateSuffix)
 98 | 	}
 99 | 	return psl, nil
100 | }
101 | 
102 | // newHardcodedPSL creates a new *FastTLD using data from a hardcoded Public Suffix List file.
103 | func newHardcodedPSL(err error, n SuffixListParams) (*FastTLD, error) {
104 | 	log.Println(err, "Fallback to hardcoded Public Suffix List")
105 | 	tldTrie, err := trieConstruct(n.IncludePrivateSuffix, "")
106 | 	return &FastTLD{cacheFilePath: "", tldTrie: tldTrie, includePrivateSuffix: n.IncludePrivateSuffix}, err
107 | }
108 | 
109 | // downloadFile downloads file from url as byte slice
110 | func downloadFile(url string) ([]byte, error) {
111 | 	// Make HTTP GET request
112 | 	var bodyBytes []byte
113 | 	resp, err := http.Get(url)
114 | 	if err != nil {
115 | 		return bodyBytes, err
116 | 	}
117 | 	defer resp.Body.Close()
118 | 
119 | 	if resp.StatusCode == http.StatusOK {
120 | 		bodyBytes, err = afero.ReadAll(resp.Body)
121 | 	} else {
122 | 		err = errors.New("Download failed, HTTP status code : " + fmt.Sprint(resp.StatusCode))
123 | 	}
124 | 	return bodyBytes, err
125 | }
126 | 
// getCurrentFilePath returns the directory of the source file that
// contains this function, as reported by runtime.Caller. The boolean is
// false when the caller information could not be recovered.
//
// Similar to os.path.dirname(os.path.realpath(__file__)) in Python
//
// Credits: https://andrewbrookins.com/tech/golang-get-directory-of-the-current-file
func getCurrentFilePath() (string, bool) {
	_, sourceFile, _, found := runtime.Caller(0)
	dir := filepath.Dir(sourceFile)
	return dir, found
}
136 | 
// fileLastModifiedHours returns the number of hours elapsed since the
// modification time reported by fileinfo.
func fileLastModifiedHours(fileinfo os.FileInfo) float64 {
	age := time.Since(fileinfo.ModTime())
	return age.Hours()
}
141 | 
142 | // update updates the local cache of Public Suffix List
143 | func update(file afero.File,
144 | 	publicSuffixListSources []string) error {
145 | 	for _, publicSuffixListSource := range publicSuffixListSources {
146 | 		// Write GET request body to local file
147 | 		if bodyBytes, err := downloadFile(publicSuffixListSource); err != nil {
148 | 			log.Println(err)
149 | 		} else {
150 | 			if !validPSLDelimiters(bodyBytes) {
151 | 				continue
152 | 			}
153 | 			if _, err := file.Seek(0, 0); err != nil {
154 | 				log.Println(err)
155 | 				continue
156 | 			}
157 | 			if _, err := file.Write(bodyBytes); err != nil {
158 | 				log.Println(err)
159 | 				continue
160 | 			}
161 | 			log.Println("Public Suffix List updated.")
162 | 			return nil
163 | 		}
164 | 	}
165 | 	return errors.New("failed to fetch any Public Suffix List from all mirrors")
166 | }
167 | 
// validPSLDelimiters reports whether contents includes all four section
// markers of a Public Suffix List: the BEGIN and END comments of both the
// ICANN and the PRIVATE sections.
func validPSLDelimiters(contents []byte) bool {
	required := [...]string{
		"// ===BEGIN ICANN DOMAINS===",
		"// ===END ICANN DOMAINS===",
		"// ===BEGIN PRIVATE DOMAINS===",
		"// ===END PRIVATE DOMAINS===",
	}
	for _, marker := range required {
		if !bytes.Contains(contents, []byte(marker)) {
			return false
		}
	}
	return true
}
174 | 
175 | func checkCacheFile(cacheFilePath string) (bool, float64) {
176 | 	cacheFilePath, pathValidErr := filepath.Abs(strings.TrimSpace(cacheFilePath))
177 | 	stat, fileinfoErr := os.Stat(cacheFilePath)
178 | 	var lastModifiedHours float64
179 | 	if fileinfoErr == nil {
180 | 		lastModifiedHours = fileLastModifiedHours(stat)
181 | 	}
182 | 
183 | 	var validDelimiters bool
184 | 	if contents, err := os.ReadFile(cacheFilePath); err == nil {
185 | 		validDelimiters = validPSLDelimiters(contents)
186 | 	}
187 | 	return pathValidErr == nil && fileinfoErr == nil && !stat.IsDir() && validDelimiters, lastModifiedHours
188 | }
189 | 
190 | // Update updates the default Public Suffix list file and updates its suffix trie using the updated file.
191 | // If cache file path is not the same as the default cache file path, this will be a no-op.
192 | func (f *FastTLD) Update() error {
193 | 	filesystem := new(afero.OsFs)
194 | 	defaultCacheFilePath := afero.GetTempDir(filesystem, "") + defaultPSLFileName
195 | 
196 | 	if f.cacheFilePath != defaultCacheFilePath {
197 | 		return errors.New("No-op. Only default Public Suffix list file can be updated")
198 | 	}
199 | 	file, err := os.OpenFile(defaultCacheFilePath, os.O_CREATE|os.O_WRONLY, 0644)
200 | 	if err != nil {
201 | 		return err
202 | 	}
203 | 	defer file.Close()
204 | 	if updateErr := update(file, publicSuffixListSources); updateErr != nil {
205 | 		return updateErr
206 | 	}
207 | 	tldTrie, err := trieConstruct(f.includePrivateSuffix, defaultCacheFilePath)
208 | 	if err == nil {
209 | 		f.tldTrie = tldTrie
210 | 		f.cacheFilePath = defaultCacheFilePath
211 | 	}
212 | 	return err
213 | }
214 | 


--------------------------------------------------------------------------------
/psl_test.go:
--------------------------------------------------------------------------------
  1 | package fasttld
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"net/http"
  6 | 	"net/http/httptest"
  7 | 	"os"
  8 | 	"reflect"
  9 | 	"testing"
 10 | 
 11 | 	"github.com/spf13/afero"
 12 | )
 13 | 
// getPublicSuffixListTest is one table entry for TestGetPublicSuffixList.
type getPublicSuffixListTest struct {
	cacheFilePath string   // path of the Public Suffix List file to parse
	expectedLists suffixes // lists expected from getPublicSuffixList
	hasError      bool     // whether getPublicSuffixList should return an error
}

// getPublicSuffixListTests covers the full list, a miniature list and a
// path that does not exist.
var getPublicSuffixListTests = []getPublicSuffixListTest{
	{cacheFilePath: fmt.Sprintf("test%spublic_suffix_list.dat", string(os.PathSeparator)),
		expectedLists: pslTestLists,
		hasError:      false,
	},
	{cacheFilePath: fmt.Sprintf("test%smini_public_suffix_list.dat", string(os.PathSeparator)),
		expectedLists: suffixes{[]string{"ac", "com.ac", "edu.ac", "gov.ac", "net.ac",
			"mil.ac", "org.ac", "*.ck", "!www.ck", "org.sg"}, []string{"blogspot.com"},
			[]string{"ac", "com.ac", "edu.ac", "gov.ac", "net.ac", "mil.ac",
				"org.ac", "*.ck", "!www.ck", "org.sg", "blogspot.com"}},
		hasError: false,
	},
	// Missing file: an error is expected and all lists stay empty.
	{cacheFilePath: fmt.Sprintf("test%spublic_suffix_list.dat.noexist", string(os.PathSeparator)),
		expectedLists: suffixes{[]string{}, []string{}, []string{}},
		hasError:      true,
	},
}
 37 | 
// TestGetPublicSuffixList runs getPublicSuffixList against every entry of
// getPublicSuffixListTests and checks both the error expectation and the
// returned lists.
func TestGetPublicSuffixList(t *testing.T) {
	for _, test := range getPublicSuffixListTests {
		suffixLists, err := getPublicSuffixList(test.cacheFilePath)
		if test.hasError && err == nil {
			t.Errorf("Expected an error. Got no error.")
		}
		if !test.hasError && err != nil {
			t.Errorf("Expected no error. Got an error.")
		}
		// A DeepEqual mismatch is tolerated when all six slices have length
		// zero: reflect.DeepEqual distinguishes nil slices from empty ones,
		// and the error case expects empty (non-nil) slices.
		if output := reflect.DeepEqual(suffixLists,
			test.expectedLists); !output && (len(suffixLists.publicSuffixes)+
			len(suffixLists.privateSuffixes)+
			len(suffixLists.allSuffixes)+
			len(test.expectedLists.publicSuffixes)+
			len(test.expectedLists.privateSuffixes)+
			len(test.expectedLists.allSuffixes)) != 0 {
			t.Errorf("Output %q not equal to expected %q",
				suffixLists, test.expectedLists)
		}
	}
}
 59 | 
 60 | func TestGetHardcodedPublicSuffixList(t *testing.T) {
 61 | 	suffixLists, err := getHardcodedPublicSuffixList()
 62 | 	if err != nil {
 63 | 		t.Errorf("Expected no error. Got an error.")
 64 | 	}
 65 | 	if len(suffixLists.publicSuffixes) == 0 {
 66 | 		t.Errorf("len(suffixLists.publicSuffixes) should be more than 0.")
 67 | 	}
 68 | 	if len(suffixLists.privateSuffixes) == 0 {
 69 | 		t.Errorf("len(suffixLists.privateSuffixes) should be more than 0.")
 70 | 	}
 71 | 	if len(suffixLists.allSuffixes) == 0 {
 72 | 		t.Errorf("len(suffixLists.allSuffixes) should be more than 0.")
 73 | 	}
 74 | }
 75 | 
 76 | func TestNewHardcodedPSL(t *testing.T) {
 77 | 	f, err := newHardcodedPSL(nil, SuffixListParams{})
 78 | 	if err != nil {
 79 | 		t.Errorf("newHardcodedPSL error: %q", err)
 80 | 	}
 81 | 	if f.tldTrie.matches.Len() == 0 {
 82 | 		t.Errorf("tldTrie should not be empty")
 83 | 	}
 84 | }
 85 | 
 86 | func TestDownloadFile(t *testing.T) {
 87 | 	expectedResponse := []byte(`{"isItSunday": true}`)
 88 | 	goodServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 89 | 		w.Write(expectedResponse)
 90 | 		r.Header.Get("") // removes unused parameter warning
 91 | 	}))
 92 | 	defer goodServer.Close()
 93 | 	badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 94 | 		w.WriteHeader(404)
 95 | 		r.Header.Get("") // removes unused parameter warning
 96 | 	}))
 97 | 	defer badServer.Close()
 98 | 
 99 | 	// HTTP Status Code 200
100 | 	res, _ := downloadFile(goodServer.URL)
101 | 	if output := reflect.DeepEqual(expectedResponse,
102 | 		res); !output {
103 | 		t.Errorf("Output %q not equal to expected %q",
104 | 			res, expectedResponse)
105 | 	}
106 | 
107 | 	// HTTP Status Code 404
108 | 	res, _ = downloadFile(badServer.URL)
109 | 	if len(res) != 0 {
110 | 		t.Errorf("Response should be empty.")
111 | 	}
112 | 
113 | 	// Malformed URL
114 | 	res, _ = downloadFile("!example.com")
115 | 	if len(res) != 0 {
116 | 		t.Errorf("Response should be empty.")
117 | 	}
118 | }
119 | 
// updateTest is one table entry for TestUpdate: which of the two sources
// serves a valid list, and whether update is expected to fail.
type updateTest struct {
	mainServerAvailable, fallbackServerAvailable, expectError bool
}

// updateTests enumerates every availability combination of the primary and
// fallback sources; an error is expected only when both are unavailable.
var updateTests = []updateTest{
	{true, true, false},
	{true, false, false},
	{false, true, false},
	{false, false, true},
}
130 | 
// TestUpdate exercises update with combinations of a server that returns
// the required delimiter comments, a server that returns an empty body and
// a server that answers 404, writing into an in-memory file.
func TestUpdate(t *testing.T) {
	// Minimal body accepted by validPSLDelimiters.
	requiredComments := "// ===BEGIN ICANN DOMAINS===\n// ===END ICANN DOMAINS===\n// ===BEGIN PRIVATE DOMAINS===\n// ===END PRIVATE DOMAINS==="
	goodServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(requiredComments))
		r.Header.Get("") // removes unused parameter warning
	}))
	defer goodServer.Close()
	// Answers 200 with a body that lacks the delimiter comments.
	emptyServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(""))
		r.Header.Get("") // removes unused parameter warning
	}))
	defer emptyServer.Close()
	badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(404)
		r.Header.Get("") // removes unused parameter warning
	}))
	defer badServer.Close()

	// The same in-memory file is reused as the write target for every case.
	filesystem := new(afero.MemMapFs)
	file, _ := afero.TempFile(filesystem, "", "ioutil-test")
	defer file.Close()

	for _, test := range updateTests {
		var primarySource, fallbackSource string
		if test.mainServerAvailable {
			primarySource = goodServer.URL
		} else {
			primarySource = badServer.URL
		}
		if test.fallbackServerAvailable {
			fallbackSource = goodServer.URL
		} else {
			fallbackSource = badServer.URL
		}

		// error should only be returned if Public Suffix List with requiredComments cannot
		// be downloaded from any of the sources.
		err := update(file, []string{primarySource, fallbackSource})
		if test.expectError && err == nil {
			t.Errorf("Expected update() error, got no error.")
		}
		if !test.expectError && err != nil {
			t.Errorf("Expected no update() error, got an error.")
		}
	}

	// None of the servers return content with requiredComments
	if err := update(file, []string{emptyServer.URL, emptyServer.URL}); err == nil {
		t.Errorf("Expected update() error, got no error.")
	}
}
182 | 
183 | func TestFileLastModifiedHours(t *testing.T) {
184 | 	filesystem := new(afero.MemMapFs)
185 | 	file, _ := afero.TempFile(filesystem, "", "ioutil-test")
186 | 	fileinfo, _ := filesystem.Stat(file.Name())
187 | 	if hours := fileLastModifiedHours(fileinfo); int(hours) != 0 {
188 | 		t.Errorf("Expected hours elapsed since last modification to be 0 immediately after file creation. %f", hours)
189 | 	}
190 | 	defer file.Close()
191 | }
192 | 


--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "extends": ["config:recommended"],
 3 |     "ignoreTests": false,
 4 |     "packageRules": [
 5 |         {
 6 |             "matchUpdateTypes": ["minor", "patch", "pin", "digest"],
 7 |             "automerge": true
 8 |         },
 9 |         {
10 |             "description": "Opt-out of minimum Go version updates",
11 |             "matchManagers": ["gomod"],
12 |             "matchDepTypes": ["golang"],
13 |             "enabled": false
14 |         }
15 |     ],
16 |     "gomod": {
17 |         "postUpdateOptions": ["gomodUpdateImportPaths", "gomodTidy"]
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/strings.go:
--------------------------------------------------------------------------------
  1 | package fasttld
  2 | 
  3 | import (
  4 | 	"log"
  5 | 	"strings"
  6 | 	"unicode/utf8"
  7 | 
  8 | 	"github.com/karlseguin/intset"
  9 | 	"golang.org/x/net/idna"
 10 | )
 11 | 
// const string -----------------------------------------------------------

// alphabets contains the ASCII letters, upper and lower case.
const alphabets string = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

// numbers contains the ASCII decimal digits.
const numbers string = "0123456789"

// IETF RFC 3490
const labelSeparators string = "\u002e\u3002\uff0e\uff61"

// controlChars contains the code points U+0000 through U+001F.
const controlChars string = "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\t\n\v\f\r\u000e\u000f" +
	"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f"

// whitespace is controlChars plus the space character and further
// non-ASCII code points treated as whitespace by this package.
const whitespace string = controlChars + " \u0085\u0086\u00a0\u1680\u200b\u200c\u200d\uFEFF"

// invalidHostNameChars lists the characters rejected by hasInvalidChars.
const invalidHostNameChars string = whitespace + "!\"#$&'()*+,/:;<=>?@[\\]^_`{|}~\u0378\u04c0\u06dd\u180e\u2025\u202e\u206b\u2183\u2a74\u2ff0\ufdd0\uff05\uff0f\uff1a\ufffa"

// Delimiter character classes; the sets built from them are declared below.
const endOfHostWithPortDelimiters string = `/\?#`
const endOfHostDelimiters string = endOfHostWithPortDelimiters + ":"
const invalidUserInfoChars string = endOfHostWithPortDelimiters + "[]"
 28 | 
// asciiSet ---------------------------------------------------------------

// Precomputed ASCII membership sets built from the string constants above.
var numericSet asciiSet = makeASCIISet(numbers)
var alphaNumericSet asciiSet = makeASCIISet(alphabets + numbers)
var endOfHostWithPortDelimitersSet asciiSet = makeASCIISet(endOfHostWithPortDelimiters)
var endOfHostDelimitersSet asciiSet = makeASCIISet(endOfHostDelimiters)
var invalidUserInfoCharsSet asciiSet = makeASCIISet(invalidUserInfoChars)

// Sets consulted by getSchemeEndIndex: the first scheme character must be a
// letter; later ones may also be digits, '+', '-' or '.'.
var schemeFirstCharSet asciiSet = makeASCIISet(alphabets)
var schemeRemainingCharSet asciiSet = makeASCIISet(alphabets + numbers + "+-.")
var slashes asciiSet = makeASCIISet(`/\`)
 40 | 
 41 | // asciiSet is a 32-byte value, where each bit represents the presence of a
 42 | // given ASCII character in the set. The 128-bits of the lower 16 bytes,
 43 | // starting with the least-significant bit of the lowest word to the
 44 | // most-significant bit of the highest word, map to the full range of all
 45 | // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
 46 | // ensuring that any non-ASCII character will be reported as not in the set.
 47 | // This allocates a total of 32 bytes even though the upper half
 48 | // is unused to avoid bounds checks in asciiSet.contains.
 49 | type asciiSet [8]uint32
 50 | 
 51 | // makeASCIISet creates a set of ASCII characters from runes in chars.
 52 | // Non-ASCII runes are skipped. Similar to strings.makeASCIISet.
 53 | func makeASCIISet(chars string) (as asciiSet) {
 54 | 	for _, c := range chars {
 55 | 		if c < utf8.RuneSelf {
 56 | 			as[c/32] |= 1 << (c % 32)
 57 | 		}
 58 | 	}
 59 | 	return as
 60 | }
 61 | 
 62 | // contains reports whether c is inside the set.
 63 | //
 64 | // same as strings.contains.
 65 | func (as *asciiSet) contains(c byte) bool {
 66 | 	return (as[c/32] & (1 << (c % 32))) != 0
 67 | }
 68 | 
// *intset.Rune -----------------------------------------------------------

// Precomputed rune sets for character classes that include non-ASCII code
// points and therefore cannot be held in an asciiSet.
var labelSeparatorsRuneSet *intset.Rune = makeRuneSet(labelSeparators)
var whitespaceRuneSet *intset.Rune = makeRuneSet(whitespace)
var invalidHostNameCharsRuneSet *intset.Rune = makeRuneSet(invalidHostNameChars)
 74 | 
 75 | // makeRuneSet converts a string to a set of unique runes
 76 | func makeRuneSet(s string) (iset *intset.Rune) {
 77 | 	var biggestRune rune
 78 | 	for idx, r := range s {
 79 | 		if idx == 0 || r > biggestRune {
 80 | 			biggestRune = r
 81 | 		}
 82 | 	}
 83 | 	// optimal target capacity
 84 | 	iset = intset.NewRune(biggestRune)
 85 | 	for _, r := range s {
 86 | 		iset.Set(r)
 87 | 	}
 88 | 	return
 89 | }
 90 | 
 91 | // ------------------------------------------------------------------------
 92 | 
// getSchemeEndIndex checks if string s begins with a URL Scheme and
// returns the index of the first byte after the scheme and the slashes
// that follow it. Returns -1 if no Scheme exists.
//
// As implemented, the accepted prefix is an optional scheme name (a letter
// followed by letters, digits, '+', '-' or '.'), an optional ':' and then
// at least two slashes ('/' or '\'). A prefix consisting only of two or
// more slashes is accepted as well. If s consists entirely of such a
// prefix, len(s) is returned.
func getSchemeEndIndex(s string) int {
	var colon bool     // a ':' has been seen after the scheme name
	var slashCount int // number of slashes seen so far

	for i := 0; i < len(s); i++ {
		// first character
		if i == 0 {
			// expecting schemeFirstCharSet or slash
			if schemeFirstCharSet.contains(s[i]) {
				continue
			}
			if slashes.contains(s[i]) {
				slashCount++
				continue
			}
			return -1
		}
		// second character onwards
		// if no slashes yet, look for schemeRemainingCharSet or colon
		// otherwise look for slashes
		if slashCount == 0 {
			if !colon {
				if schemeRemainingCharSet.contains(s[i]) {
					continue
				}
				if s[i] == ':' {
					colon = true
					continue
				}
			}
			// once a colon has been seen, only a slash may follow
			if slashes.contains(s[i]) {
				slashCount++
				continue
			}
			return -1
		}
		// expecting only slashes
		if slashes.contains(s[i]) {
			slashCount++
			continue
		}
		// first non-slash byte after the slashes: a single slash is not enough
		if slashCount < 2 {
			return -1
		}
		return i
	}
	// s ended while still inside the prefix
	if slashCount >= 2 {
		return len(s)
	}
	return -1
}
146 | 
147 | // indexAnyASCII returns the index of the first instance of any Unicode code point
148 | // from asciiSet in s, or -1 if no Unicode code point from asciiSet is present in s.
149 | //
150 | // Similar to strings.IndexAny but takes in an asciiSet instead of a string
151 | // and skips input validation.
152 | func indexAnyASCII(s string, as asciiSet) int {
153 | 	for i, b := range []byte(s) {
154 | 		if as.contains(b) {
155 | 			return i
156 | 		}
157 | 	}
158 | 	return -1
159 | }
160 | 
161 | // hasInvalidChars checks s for invalid runes
162 | //
163 | // or leading/consecutive label separators
164 | //
165 | // or leading/trailing dash
166 | func hasInvalidChars(s string) bool {
167 | 	var isLabelSeparator bool
168 | 	lastByteIdx := len(s) - 1
169 | 	for idx, c := range s {
170 | 		if alphaNumericSet.contains(byte(c)) {
171 | 			// check for alphanumeric characters early to avoid expensive intset search
172 | 			isLabelSeparator = false
173 | 			continue
174 | 		}
175 | 		if idx == 0 && (c == '-' || labelSeparatorsRuneSet.Exists(c)) {
176 | 			// starts with a dash or label separator
177 | 			return true
178 | 		}
179 | 		if idx == lastByteIdx && c == '-' {
180 | 			// ends with a dash
181 | 			return true
182 | 		}
183 | 		if labelSeparatorsRuneSet.Exists(c) {
184 | 			if isLabelSeparator {
185 | 				// reject consecutive label separators
186 | 				return true
187 | 			}
188 | 			isLabelSeparator = true
189 | 		} else {
190 | 			isLabelSeparator = false
191 | 		}
192 | 		if invalidHostNameCharsRuneSet.Exists(c) {
193 | 			return true
194 | 		}
195 | 	}
196 | 	return false
197 | }
198 | 
199 | // lastIndexAny returns the index of the last instance of any Unicode code
200 | // point from chars in s, or -1 if no Unicode code point from chars is
201 | // present in s.
202 | //
203 | // Similar to strings.LastIndexAny but skips input validation and uses *intset.Rune.
204 | func lastIndexAny(s string, chars *intset.Rune) int {
205 | 	for i := len(s); i > 0; {
206 | 		r, size := utf8.DecodeLastRuneInString(s[0:i])
207 | 		i -= size
208 | 		if chars.Exists(r) {
209 | 			return i
210 | 		}
211 | 	}
212 | 	return -1
213 | }
214 | 
// reverse reverses the order of the elements of input in place.
func reverse(input []string) {
	n := len(input)
	for left := 0; left < n/2; left++ {
		right := n - 1 - left
		input[left], input[right] = input[right], input[left]
	}
}
221 | 
// sepSize returns the byte length of a label separator rune, given the
// rune's first byte r.
//
// r is expected to be the first byte of one of the runes in
// labelSeparators: '.' is a single byte, every other separator is encoded
// in three bytes.
func sepSize(r byte) int {
	switch r {
	case '.':
		return 1
	default:
		return 3
	}
}
234 | 
// idnaToPuny is the IDNA profile used by formatAsPunycode. It is configured
// with MapForLookup, transitional processing, the Bidi rule and hyphen
// checks enabled.
var idnaToPuny *idna.Profile = idna.New(idna.MapForLookup(), idna.Transitional(true), idna.BidiRule(), idna.CheckHyphens(true))
236 | 
237 | // formatAsPunycode formats s as punycode.
238 | func formatAsPunycode(s string) string {
239 | 	asPunyCode, err := idnaToPuny.ToASCII(s)
240 | 	if err != nil {
241 | 		log.Println(strings.SplitAfterN(err.Error(), "idna: invalid label", 2)[0])
242 | 		return ""
243 | 	}
244 | 	return asPunyCode
245 | }
246 | 
247 | // indexLastByteBefore returns the index of the last instance of byte b
248 | // before any byte in notAfterCharsSet, otherwise -1
249 | func indexLastByteBefore(s string, b byte, notAfterCharsSet asciiSet) int {
250 | 	if firstNotAfterCharIdx := indexAnyASCII(s, notAfterCharsSet); firstNotAfterCharIdx != -1 {
251 | 		return strings.LastIndexByte(s[0:firstNotAfterCharIdx], b)
252 | 	}
253 | 	return strings.LastIndexByte(s, b)
254 | }
255 | 
// trimMode specifies which parts of string to trim for fastTrim()
type trimMode int

const (
	trimBoth  trimMode = iota // trim both ends of the string
	trimLeft                  // trim only the start of the string
	trimRight                 // trim only the end of the string
)
264 | 
265 | // fastTrim works like strings.Trim but uses *intset.Rune
266 | func fastTrim(s string, charsToTrim *intset.Rune, mode trimMode) string {
267 | 	var startIdx, endIdx int
268 | 	if mode != trimRight {
269 | 		// Trim left-hand side
270 | 		var trimCharsExist bool
271 | 		var broken bool
272 | 		for idx, c := range s {
273 | 			startIdx = idx
274 | 			if !charsToTrim.Exists(c) {
275 | 				broken = true
276 | 				break
277 | 			}
278 | 			trimCharsExist = true
279 | 		}
280 | 		if trimCharsExist && !broken {
281 | 			// Return empty string if every character in s exists in charsToTrim
282 | 			return ""
283 | 		}
284 | 	}
285 | 	if mode != trimLeft {
286 | 		// Trim right-hand side
287 | 		var trimCharsExist bool
288 | 		var broken bool
289 | 		for i := len(s); i > 0; {
290 | 			endIdx = i
291 | 			r, size := utf8.DecodeLastRuneInString(s[0:i])
292 | 			i -= size
293 | 			if !charsToTrim.Exists(r) {
294 | 				broken = true
295 | 				break
296 | 			}
297 | 			trimCharsExist = true
298 | 		}
299 | 		if trimCharsExist && !broken {
300 | 			// Return empty string if every character in s exists in charsToTrim
301 | 			return ""
302 | 		}
303 | 	} else {
304 | 		endIdx = len(s)
305 | 	}
306 | 	return s[startIdx:endIdx]
307 | }
308 | 


--------------------------------------------------------------------------------
/strings_test.go:
--------------------------------------------------------------------------------
 1 | package fasttld
 2 | 
 3 | import (
 4 | 	"reflect"
 5 | 	"strings"
 6 | 	"testing"
 7 | 
 8 | 	"github.com/karlseguin/intset"
 9 | )
10 | 
// punyCodeTest is one table entry for TestPunyCode.
type punyCodeTest struct {
	url      string // input passed to formatAsPunycode
	expected string // expected result; empty when the conversion fails
}

var punyCodeTests = []punyCodeTest{
	{"google.com", "google.com"},
	{"hello.世界.com", "hello.xn--rhqv96g.com"},
	{strings.Repeat("x", 65536) + "\uff00", ""}, // int32 overflow.
}
21 | 
22 | func TestPunyCode(t *testing.T) {
23 | 	for _, test := range punyCodeTests {
24 | 		converted := formatAsPunycode(test.url)
25 | 		if output := reflect.DeepEqual(converted, test.expected); !output {
26 | 			t.Errorf("Output %q not equal to expected %q", converted, test.expected)
27 | 		}
28 | 	}
29 | }
30 | 
// reverseTest is one table entry for TestReverse.
type reverseTest struct {
	original []string // slice handed to reverse, which modifies it in place
	expected []string // expected contents after reversal
}

// reverseTests covers an empty slice, a single element and slices of even
// and odd length.
var reverseTests = []reverseTest{
	{[]string{}, []string{}},
	{[]string{"ab"}, []string{"ab"}},
	{[]string{"ab", "cd", "gh", "ij"}, []string{"ij", "gh", "cd", "ab"}},
	{[]string{"ab", "cd", "ef", "gh", "ij"}, []string{"ij", "gh", "ef", "cd", "ab"}},
}
42 | 
43 | func TestReverse(t *testing.T) {
44 | 	for _, test := range reverseTests {
45 | 		reverse(test.original)
46 | 		if output := reflect.DeepEqual(test.original, test.expected); !output {
47 | 			t.Errorf("Output %q not equal to expected %q", test.original, test.expected)
48 | 		}
49 | 	}
50 | }
51 | 
52 | func TestFastTrim(t *testing.T) {
53 | 	const charsToTrim string = ".@新"
54 | 	var charsToTrimRuneSet *intset.Rune = makeRuneSet(charsToTrim)
55 | 
56 | 	ss := []string{".abc.", ".abc", "abc.", "..abc.", ".abc..", "..abc..",
57 | 		"@abc@", "@abc", "abc@", "@@abc@", "@abc@@", "@@abc@@",
58 | 		"新abc新", "新abc", "abc新", "新新abc新", "新abc新新", "新新abc新新",
59 | 		"新@abc新.", "新.abc", "abc@新", "新新.abc新", "新abc新@新", "新新.abc.新新",
60 | 		".", "..",
61 | 		".@", "@.",
62 | 		".@新", "新@.",
63 | 		" ", " .@ ", ". .@ ", " .@ 新",
64 | 		"abc"}
65 | 
66 | 	for _, s := range ss {
67 | 		expectedTrimBoth := strings.Trim(s, charsToTrim)
68 | 		if output := fastTrim(s, charsToTrimRuneSet, trimBoth); output != expectedTrimBoth {
69 | 			t.Errorf("Output %q not equal to expected %q", output, expectedTrimBoth)
70 | 		}
71 | 		expectedTrimLeft := strings.TrimLeft(s, charsToTrim)
72 | 		if output := fastTrim(s, charsToTrimRuneSet, trimLeft); output != expectedTrimLeft {
73 | 			t.Errorf("Output %q not equal to expected %q", output, expectedTrimLeft)
74 | 		}
75 | 		expectedTrimRight := strings.TrimRight(s, charsToTrim)
76 | 		if output := fastTrim(s, charsToTrimRuneSet, trimRight); output != expectedTrimRight {
77 | 			t.Errorf("Output %q not equal to expected %q", output, expectedTrimRight)
78 | 		}
79 | 	}
80 | }
81 | 


--------------------------------------------------------------------------------
/test/mini_public_suffix_list.dat:
--------------------------------------------------------------------------------
 1 | // ===BEGIN ICANN DOMAINS===
 2 | ac
 3 | com.ac
 4 | edu.ac
 5 | gov.ac
 6 | net.ac
 7 | mil.ac
 8 | org.ac
 9 | *.ck
10 | !www.ck
11 | org.sg
12 | // ===END ICANN DOMAINS===
13 | // ===BEGIN PRIVATE DOMAINS===
14 | blogspot.com
15 | // the following line is invalid punycode
16 | xn--0.com
17 | // ===END PRIVATE DOMAINS===
18 | 


--------------------------------------------------------------------------------