├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .licrc ├── CODE_OF_CONDUCT.md ├── CREDITS.md ├── LICENSE ├── Makefile ├── README.md ├── Trie_example.svg ├── benchmark_test.go ├── cmd ├── fasttld │ ├── extract.go │ └── root.go └── main.go ├── data └── gen.go ├── data_test.go ├── demo.gif ├── examples └── demo.go ├── fallback.go ├── fasttld.go ├── fasttld_test.go ├── go.mod ├── go.sum ├── net.go ├── net_test.go ├── print.go ├── print_test.go ├── psl.go ├── psl_test.go ├── renovate.json ├── strings.go ├── strings_test.go └── test ├── mini_public_suffix_list.dat └── public_suffix_list.dat /.gitattributes: -------------------------------------------------------------------------------- 1 | # To prevent CRLF breakages on Windows for fragile files, like testdata. 2 | * -text -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | permissions: 3 | contents: read 4 | pull-requests: write 5 | on: [push, pull_request, workflow_dispatch] 6 | jobs: 7 | format-markdown: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v4 12 | 13 | - name: Format Markdown with markdownlint 14 | run: | 15 | npm install -g markdownlint-cli 16 | markdownlint --disable MD013 MD033 --fix . 
--ignore CODE_OF_CONDUCT.md 17 | git add -A 18 | git diff --cached --exit-code 19 | test-and-coverage: 20 | strategy: 21 | matrix: 22 | os: [macos-latest, windows-latest, ubuntu-latest] 23 | runs-on: ${{ matrix.os }} 24 | steps: 25 | - name: Check out repository 26 | uses: actions/checkout@v4 27 | with: 28 | fetch-depth: 0 # to support `git describe` 29 | - name: Setup go 30 | uses: actions/setup-go@v5 31 | with: 32 | go-version-file: './go.mod' 33 | - name: Test 34 | run: make tests 35 | - name: Build CLI app 36 | run: make build_cli 37 | - name: | 38 | If HEAD is not tagged, CLI app version tag should be newer than latest git version tag 39 | If HEAD is tagged, CLI app version tag should be equal to latest git version tag 40 | shell: bash 41 | run: | 42 | LATEST_TAG=$(git describe --tags `git rev-list --tags --max-count=1`) 43 | CLI_TAG=$(dist/fasttld -v | awk '{print $NF}') 44 | if [[ $( printf $LATEST_TAG"\n"$CLI_TAG ) != $( printf $LATEST_TAG"\n"$CLI_TAG | sort -V ) ]] 45 | then 46 | echo "Expected CLI app version number $CLI_TAG to be newer than or equal to latest git version number $LATEST_TAG. Check Makefile." 47 | exit 1 48 | fi 49 | if [[ $(git describe --exact-match --tags HEAD 2>&1) =~ .*"no tag exactly matches".* ]]; then 50 | if [[ $LATEST_TAG == $CLI_TAG ]] 51 | then 52 | echo "HEAD is not tagged. Expected CLI app version number $CLI_TAG to be newer than latest git version number $LATEST_TAG. Check Makefile." 53 | exit 1 54 | fi 55 | else 56 | if [[ $LATEST_TAG != $CLI_TAG ]] 57 | then 58 | echo "HEAD is tagged. Expected CLI app version number $CLI_TAG to equal to latest git version number $LATEST_TAG. Check Makefile." 
59 | exit 1 60 | fi 61 | fi 62 | - name: Convert Go coverage to lcov 63 | if: matrix.os == 'ubuntu-latest' 64 | run: | 65 | go install github.com/jandelgado/gcov2lcov@latest 66 | gcov2lcov -infile=coverage.out -outfile=coverage.lcov 67 | - name: Upload coverage to Coveralls 68 | if: matrix.os == 'ubuntu-latest' 69 | uses: coverallsapp/github-action@v2 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.prof 2 | 3 | # Binaries for programs and plugins 4 | *.exe 5 | *.exe~ 6 | *.dll 7 | *.so 8 | *.dylib 9 | 10 | # Test binary, built with `go test -c` 11 | *.test 12 | 13 | # Output of the go coverage tool, specifically when used with LiteIDE 14 | *.out 15 | *.html 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | public_suffix_list.dat 21 | dist/ -------------------------------------------------------------------------------- /.licrc: -------------------------------------------------------------------------------- 1 | # IMPORTANT!: ALL SECTIONS ARE MANDATORY 2 | [licenses] 3 | unaccepted = ["CC0", "EPL", "MPL" , "OSL", "RPL", "LGPL", "GPL", "AGPL"] 4 | 5 | [dependencies] 6 | # This will allow users to flag some dependencies so that Licensebat will not check for their license. 7 | ignored=[] 8 | 9 | [behavior] 10 | # False by default, if true, it will only run the checks when one of the dependency files or the .licrc file has been modified. 11 | run_only_on_dependency_modification = true 12 | # False by default, if true, it will never block the build. 
13 | do_not_block_pr = false -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for 
clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | wutingfeng@outlook.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. 
Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /CREDITS.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | This application uses code from other open-source projects. The copyright statements of these open-source projects are listed below. 4 | 5 | ## Go 6 | 7 | Source: 8 | 9 | ```markdown 10 | Copyright (c) 2009 The Go Authors. All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are 14 | met: 15 | 16 | * Redistributions of source code must retain the above copyright 17 | notice, this list of conditions and the following disclaimer. 18 | * Redistributions in binary form must reproduce the above 19 | copyright notice, this list of conditions and the following disclaimer 20 | in the documentation and/or other materials provided with the 21 | distribution. 22 | * Neither the name of Google Inc. nor the names of its 23 | contributors may be used to endorse or promote products derived from 24 | this software without specific prior written permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 30 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 32 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 36 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | ``` 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Wu Tingfeng 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | tests: 2 | go test -v -race -covermode atomic -coverprofile coverage.out && go tool cover -html coverage.out -o coverage.html 3 | 4 | tests_without_race: 5 | go test -v -covermode atomic -coverprofile coverage.out && go tool cover -html coverage.out -o coverage.html 6 | 7 | format: 8 | go fmt . ./cmd/... ./cmd/fasttld/... ./examples/... 9 | 10 | bench: 11 | go test -bench . -benchmem -cpu 1 12 | 13 | report_bench: 14 | go test -cpuprofile cpu.prof -memprofile mem.prof -bench . 
-cpu 1 15 | 16 | cpu_report: 17 | go tool pprof cpu.prof 18 | 19 | mem_report: 20 | go tool pprof mem.prof 21 | 22 | build_cli: 23 | go build -o ./dist/fasttld -ldflags "-X 'github.com/elliotwutingfeng/go-fasttld/cmd/fasttld.version=v0.4.5'" ./cmd/main.go 24 | 25 | demo: 26 | go run ./examples/demo.go 27 | 28 | update_psl: 29 | go generate data/gen.go 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-fasttld 2 | 3 | [![Go Reference](https://img.shields.io/badge/go-reference-blue?logo=go&logoColor=white&style=for-the-badge)](https://pkg.go.dev/github.com/elliotwutingfeng/go-fasttld) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/elliotwutingfeng/go-fasttld?style=for-the-badge)](https://goreportcard.com/report/github.com/elliotwutingfeng/go-fasttld) 5 | [![Coveralls](https://img.shields.io/coverallsCoverage/github/elliotwutingfeng/go-fasttld?logo=coveralls&style=for-the-badge)](https://coveralls.io/github/elliotwutingfeng/go-fasttld?branch=main) 6 | [![Mentioned in Awesome Go](https://img.shields.io/static/v1?logo=awesomelists&label=&labelColor=CCA6C4&logoColor=261120&message=Mentioned%20in%20awesome&color=494368&style=for-the-badge)](https://github.com/avelino/awesome-go) 7 | 8 | [![GitHub license](https://img.shields.io/badge/LICENSE-BSD--3--CLAUSE-GREEN?style=for-the-badge)](LICENSE) 9 | 10 | ## Summary 11 | 12 | **go-fasttld** is a high performance [effective top level domains (eTLD)](https://wiki.mozilla.org/Public_Suffix_List) extraction module that extracts subcomponents from [URLs](https://en.wikipedia.org/wiki/URL). 13 | 14 | URLs can either contain hostnames, IPv4 addresses, or IPv6 addresses. eTLD extraction is based on the [Mozilla Public Suffix List](http://www.publicsuffix.org). 
Private domains listed in the [Mozilla Public Suffix List](http://www.publicsuffix.org) like 'blogspot.co.uk' and 'sinaapp.com' are also supported. 15 | 16 | ![Demo](demo.gif) 17 | 18 | Spot any bugs? Report them [here](https://github.com/elliotwutingfeng/go-fasttld/issues) 19 | 20 | ## Installation 21 | 22 | ```sh 23 | go get github.com/elliotwutingfeng/go-fasttld 24 | ``` 25 | 26 | ## Try the CLI 27 | 28 | First, build the CLI application. 29 | 30 | ```sh 31 | # `git clone` and `cd` to the go-fasttld repository folder first 32 | make build_cli 33 | ``` 34 | 35 | Afterwards, try extracting subcomponents from a URL. 36 | 37 | ```sh 38 | # `git clone` and `cd` to the go-fasttld repository folder first 39 | ./dist/fasttld extract https://user@a.subdomain.example.a%63.uk:5000/a/b\?id\=42 40 | ``` 41 | 42 | ## Try the example code 43 | 44 | All of the following examples can be found at `examples/demo.go`. To play the demo, run the following command: 45 | 46 | ```sh 47 | # `git clone` and `cd` to the go-fasttld repository folder first 48 | make demo 49 | ``` 50 | 51 | ### Hostname 52 | 53 | ```go 54 | // Initialise fasttld extractor 55 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 56 | 57 | // Extract URL subcomponents 58 | url := "https://user@a.subdomain.example.a%63.uk:5000/a/b?id=42" 59 | res, _ := extractor.Extract(fasttld.URLParams{URL: url}) 60 | 61 | // Display results 62 | fasttld.PrintRes(url, res) // Pretty-prints res.Scheme, res.UserInfo, res.SubDomain etc. 
63 | ``` 64 | 65 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 66 | |----------|----------|-------------|---------|--------|------------------|------|------------|--------------| 67 | | https:// | user | a.subdomain | example | a%63.uk | example.a%63.uk | 5000 | /a/b?id=42 | hostname | 68 | 69 | ### IPv4 Address 70 | 71 | ```go 72 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 73 | url := "https://127.0.0.1:5000" 74 | res, _ := extractor.Extract(fasttld.URLParams{URL: url}) 75 | ``` 76 | 77 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 78 | |----------|----------|-----------|-----------|--------|------------------|------|------|--------------| 79 | | https:// | | | 127.0.0.1 | | 127.0.0.1 | 5000 | | ipv4 address | 80 | 81 | ### IPv6 Address 82 | 83 | ```go 84 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 85 | url := "https://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]:5000" 86 | res, _ := extractor.Extract(fasttld.URLParams{URL: url}) 87 | ``` 88 | 89 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 90 | |----------|----------|-----------|-----------------------------------------|--------|-----------------------------------------|------|------|--------------| 91 | | https:// | | | aBcD:ef01:2345:6789:aBcD:ef01:2345:6789 | | aBcD:ef01:2345:6789:aBcD:ef01:2345:6789 | 5000 | | ipv6 address | 92 | 93 | ### Internationalised label separators 94 | 95 | **go-fasttld** supports the following internationalised label separators (IETF RFC 3490) 96 | 97 | | Full Stop | Ideographic Full Stop | Fullwidth Full Stop | Halfwidth Ideographic Full Stop | 98 | |------------|-----------------------|---------------------|---------------------------------| 99 | | U+002E `.` | U+3002 `。` | U+FF0E `.` | U+FF61 `。` | 100 | 101 | ```go 102 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 103 | url := 
"https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk" 104 | res, _ := extractor.Extract(fasttld.URLParams{URL: url}) 105 | ``` 106 | 107 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 108 | |----------|----------|---------------------------------------|--------|-----------|-------------------|------|------|--------------| 109 | | https:// | | brb\u002ei\u3002am\uff0egoing\uff61to | be | a\uff61fk | be\u3002a\uff61fk | | | hostname | 110 | 111 | ## Public Suffix List options 112 | 113 | ### Specify custom public suffix list file 114 | 115 | You can use a custom public suffix list file by setting `CacheFilePath` in `fasttld.SuffixListParams{}` to its absolute path. 116 | 117 | ```go 118 | cacheFilePath := "/absolute/path/to/file.dat" 119 | extractor, err := fasttld.New(fasttld.SuffixListParams{CacheFilePath: cacheFilePath}) 120 | ``` 121 | 122 | ### Updating the default Public Suffix List cache 123 | 124 | Whenever `fasttld.New` is called without specifying `CacheFilePath` in `fasttld.SuffixListParams{}`, the local cache of the default Public Suffix List is updated automatically if it is more than 3 days old. You can also manually update the cache by using `Update()`. 125 | 126 | ```go 127 | // Automatic update performed if `CacheFilePath` is not specified 128 | // and local cache is more than 3 days old 129 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 130 | 131 | // Manually update local cache 132 | if err := extractor.Update(); err != nil { 133 | log.Println(err) 134 | } 135 | ``` 136 | 137 | ### Private domains 138 | 139 | According to the [Mozilla.org wiki](https://wiki.mozilla.org/Public_Suffix_List/Uses), the Mozilla Public Suffix List contains private domains like `blogspot.com` and `sinaapp.com`. 140 | 141 | By default, these private domains are excluded (i.e. 
`IncludePrivateSuffix = false`) 142 | 143 | ```go 144 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 145 | url := "https://google.blogspot.com" 146 | res, _ := extractor.Extract(fasttld.URLParams{URL: url}) 147 | ``` 148 | 149 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 150 | |----------|----------|-----------|----------|--------|------------------|------|------|--------------| 151 | | https:// | | google | blogspot | com | blogspot.com | | | hostname | 152 | 153 | You can _include_ private domains by setting `IncludePrivateSuffix = true` 154 | 155 | ```go 156 | extractor, _ := fasttld.New(fasttld.SuffixListParams{IncludePrivateSuffix: true}) 157 | url := "https://google.blogspot.com" 158 | res, _ := extractor.Extract(fasttld.URLParams{URL: url}) 159 | ``` 160 | 161 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 162 | |----------|----------|-----------|--------|--------------|---------------------|------|------|--------------| 163 | | https:// | | | google | blogspot.com | google.blogspot.com | | | hostname | 164 | 165 | ## Extraction options 166 | 167 | ### Ignore Subdomains 168 | 169 | You can ignore subdomains by setting `IgnoreSubDomains = true`. By default, subdomains are extracted. 170 | 171 | ```go 172 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 173 | url := "https://maps.google.com" 174 | res, _ := extractor.Extract(fasttld.URLParams{URL: url, IgnoreSubDomains: true}) 175 | ``` 176 | 177 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 178 | |----------|----------|-----------|--------|--------|------------------|------|------|--------------| 179 | | https:// | | | google | com | google.com | | | hostname | 180 | 181 | ### Punycode 182 | 183 | By default, internationalised URLs are not converted to punycode before extraction. 
184 | 185 | ```go 186 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 187 | url := "https://hello.世界.com" 188 | res, _ := extractor.Extract(fasttld.URLParams{URL: url}) 189 | ``` 190 | 191 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 192 | |----------|----------|-----------|--------|--------|------------------|------|------|--------------| 193 | | https:// | | hello | 世界 | com | 世界.com | | | hostname | 194 | 195 | You can convert internationalised URLs to [punycode](https://en.wikipedia.org/wiki/Punycode) before extraction by setting `ConvertURLToPunyCode = true`. 196 | 197 | ```go 198 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 199 | url := "https://hello.世界.com" 200 | res, _ := extractor.Extract(fasttld.URLParams{URL: url, ConvertURLToPunyCode: true}) 201 | ``` 202 | 203 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 204 | |----------|----------|-----------|-------------|--------|------------------|------|------|--------------| 205 | | https:// | | hello | xn--rhqv96g | com | xn--rhqv96g.com | | | hostname | 206 | 207 | ## Parsing errors 208 | 209 | If the URL is invalid, the second value returned by `Extract()`, **error**, will be non-nil. Partially extracted subcomponents can still be retrieved from the first value returned, **ExtractResult**. 
210 | 211 | ```go 212 | extractor, _ := fasttld.New(fasttld.SuffixListParams{}) 213 | url := "https://example!.com" // invalid characters in hostname 214 | color.New().Println("The following line should be an error message") 215 | if res, err := extractor.Extract(fasttld.URLParams{URL: url}); err != nil { 216 | color.New(color.FgHiRed, color.Bold).Print("Error: ") 217 | color.New(color.FgHiWhite).Println(err) 218 | } 219 | fasttld.PrintRes(url, res) // Partially extracted subcomponents can still be retrieved 220 | ``` 221 | 222 | | Scheme | UserInfo | SubDomain | Domain | Suffix | RegisteredDomain | Port | Path | HostType | 223 | |----------|----------|-----------|--------|--------|------------------|------|------|----------| 224 | | https:// | | | | | | | | | 225 | 226 | ## Testing 227 | 228 | ```sh 229 | # `git clone` and `cd` to the go-fasttld repository folder first 230 | make tests 231 | 232 | # Alternatively, run tests without race detection 233 | # Useful for systems that do not support the -race flag like windows/386 234 | # See https://tip.golang.org/src/cmd/dist/test.go 235 | make tests_without_race 236 | ``` 237 | 238 | ## Benchmarks 239 | 240 | ```sh 241 | # `git clone` and `cd` to the go-fasttld repository folder first 242 | make bench 243 | ``` 244 | 245 | ### Modules used 246 | 247 | | Benchmark Name | Source | 248 | |----------------------|----------------------------------| 249 | | GoFastTld | go-fasttld (this module) | 250 | | JPilloraGoTld | github.com/jpillora/go-tld | 251 | | JoeGuoTldExtract | github.com/joeguo/tldextract | 252 | | Mjd2021USATldExtract | github.com/mjd2021usa/tldextract | 253 | 254 | ### Results 255 | 256 | Benchmarks performed on AMD Ryzen 7 5800X, Manjaro Linux. 257 | 258 | **go-fasttld** performs especially well on longer URLs. 
259 | 260 | --- 261 | 262 | #### #1 263 | 264 | https://iupac.org/iupac-announces-the-2021-top-ten-emerging-technologies-in-chemistry/ 265 | 266 | | Benchmark Name | Iterations | ns/op | B/op | allocs/op | Fastest | 267 | |----------------------|------------|-------------|----------|-------------|--------------------| 268 | | GoFastTld | 8037906 | 150.8 ns/op | 0 B/op | 0 allocs/op | :heavy_check_mark: | 269 | | JPilloraGoTld | 1675113 | 716.1 ns/op | 224 B/op | 2 allocs/op | | 270 | | JoeGuoTldExtract | 2204854 | 515.1 ns/op | 272 B/op | 5 allocs/op | | 271 | | Mjd2021USATldExtract | 1676722 | 712.0 ns/op | 288 B/op | 6 allocs/op | | 272 | 273 | --- 274 | 275 | #### #2 276 | 277 | https://www.google.com/maps/dir/Parliament+Place,+Parliament+House+Of+Singapore,+Singapore/Parliament+St,+London,+UK/@25.2440033,33.6721455,4z/data=!3m1!4b1!4m14!4m13!1m5!1m1!1s0x31da19a0abd4d71d:0xeda26636dc4ea1dc!2m2!1d103.8504863!2d1.2891543!1m5!1m1!1s0x487604c5aaa7da5b:0xf13a2197d7e7dd26!2m2!1d-0.1260826!2d51.5017061!3e4 278 | 279 | | Benchmark Name | Iterations | ns/op | B/op | allocs/op | Fastest | 280 | |----------------------|------------|-------------|-----------|-------------|--------------------| 281 | | GoFastTld | 6381516 | 181.9 ns/op | 0 B/op | 0 allocs/op | :heavy_check_mark: | 282 | | JPilloraGoTld | 431671 | 2603 ns/op | 928 B/op | 4 allocs/op | | 283 | | JoeGuoTldExtract | 893347 | 1176 ns/op | 1120 B/op | 6 allocs/op | | 284 | | Mjd2021USATldExtract | 1030250 | 1165 ns/op | 1120 B/op | 6 allocs/op | | 285 | 286 | --- 287 | 288 | #### #3 289 | 290 | https://a.b.c.d.e.f.g.h.i.j.k.l.m.n.oo.pp.qqq.rrrr.ssssss.tttttttt.uuuuuuuuuuu.vvvvvvvvvvvvvvv.wwwwwwwwwwwwwwwwwwwwww.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy.zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.cc 291 | 292 | | Benchmark Name | Iterations | ns/op | B/op | allocs/op | Fastest | 293 | 
|----------------------|------------|------------|-----------|-------------|--------------------| 294 | | GoFastTld | 833682 | 1424 ns/op | 0 B/op | 0 allocs/op | :heavy_check_mark: | 295 | | JPilloraGoTld | 734790 | 1640 ns/op | 304 B/op | 3 allocs/op | | 296 | | JoeGuoTldExtract | 695475 | 1452 ns/op | 1040 B/op | 5 allocs/op | | 297 | | Mjd2021USATldExtract | 330717 | 3628 ns/op | 1904 B/op | 8 allocs/op | | 298 | 299 | --- 300 | 301 | ## Implementation details 302 | 303 | ### Why not split on "." and take the last element instead? 304 | 305 | Splitting on "." and taking the last element only works for simple eTLDs like `com`, but not more complex ones like `oseto.nagasaki.jp`. 306 | 307 | ### eTLD tries 308 | 309 | ![Trie](Trie_example.svg) 310 | 311 | **go-fasttld** stores eTLDs in [compressed tries](https://en.wikipedia.org/wiki/Trie). 312 | 313 | Valid eTLDs from the [Mozilla Public Suffix List](http://www.publicsuffix.org) are appended to the compressed trie in reverse-order. 314 | 315 | ```sh 316 | Given the following eTLDs 317 | au 318 | nsw.edu.au 319 | com.ac 320 | edu.ac 321 | gov.ac 322 | 323 | and the example URL host `example.nsw.edu.au` 324 | 325 | The compressed trie will be structured as follows: 326 | 327 | START 328 | ╠═ au 🚩 ✅ 329 | ║ ╚═ edu ✅ 330 | ║ ╚═ nsw 🚩 ✅ 331 | ╚═ ac 332 | ╠═ com 🚩 333 | ╠═ edu 🚩 334 | ╚═ gov 🚩 335 | 336 | === Symbol meanings === 337 | 🚩 : path to this node is a valid eTLD 338 | ✅ : path to this node found in example URL host `example.nsw.edu.au` 339 | ``` 340 | 341 | The URL host subcomponents are parsed from right-to-left until no more matching nodes can be found. In this example, the path of matching nodes are `au -> edu -> nsw`. Reversing the nodes gives the extracted eTLD `nsw.edu.au`. 
342 | 343 | ## Acknowledgements 344 | 345 | This module is a port of the Python [fasttld](https://github.com/jophy/fasttld) module, with additional modifications to support extraction of subcomponents from full URLs, IPv4 addresses, and IPv6 addresses. 346 | 347 | - [fasttld (Python)](https://github.com/jophy/fasttld) 348 | - [tldextract (Python)](https://github.com/john-kurkowski/tldextract) 349 | - [ICANN IDN Character Validation Guidance](https://www.icann.org/resources/pages/idna-protocol-2012-02-25-en) 350 | - [IETF RFC 2396](https://www.ietf.org/rfc/rfc2396.txt) 351 | - [IETF RFC 3490](https://www.ietf.org/rfc/rfc3490.txt) 352 | - [IETF RFC 3986](https://www.ietf.org/rfc/rfc3986.txt) 353 | - [IETF RFC 6874](https://www.ietf.org/rfc/rfc6874.txt) 354 | -------------------------------------------------------------------------------- /Trie_example.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 16 | 34 | 36 | 42 | 47 | 48 | 54 | 59 | 60 | 61 | 66 | 74 | 82 | 90 | 98 | 106 | 114 | 122 | 130 | i 139 | t 148 | e 157 | o 166 | n 175 | n 184 | n 193 | a 202 | t 211 | i 220 | in 229 | inn 238 | te 247 | tea 256 | ten 265 | to 274 | 3 283 | 12 292 | 9 301 | 7 310 | 5 319 | 11 328 | 332 | 336 | 339 | 343 | 347 | 348 | 351 | 355 | 359 | 360 | 363 | 367 | 371 | 372 | 375 | 379 | 383 | 384 | 387 | 391 | 395 | 396 | 399 | 403 | 407 | 408 | 416 | 419 | 423 | 427 | 428 | 431 | 435 | 439 | 440 | ted 449 | d 458 | A 467 | 470 | 474 | 478 | 479 | 487 | A 496 | 15 505 | 4 514 | 515 | -------------------------------------------------------------------------------- /benchmark_test.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/fatih/color" 8 | joeguotldextract "github.com/joeguo/tldextract" 9 | tld "github.com/jpillora/go-tld" 10 | mjd2021usatldextract "github.com/mjd2021usa/tldextract" 11 | ) 12 | 13 | func 
BenchmarkComparison(b *testing.B) { 14 | var benchmarkURLs = []string{ 15 | "https://iupac.org/iupac-announces-the-2021-top-ten-emerging-technologies-in-chemistry/", 16 | "https://www.google.com/maps/dir/Parliament+Place,+Parliament+House+Of+Singapore,+" + 17 | "Singapore/Parliament+St,+London,+UK/@25.2440033,33.6721455,4z/data=!3m1!4b1!4m14!4m13!1m5!1m1!1s0x31d" + 18 | "a19a0abd4d71d:0xeda26636dc4ea1dc!2m2!1d103.8504863!2d1.2891543!1m5!1m1!1s0x487604c5aaa7da5b:0xf13a2" + 19 | "197d7e7dd26!2m2!1d-0.1260826!2d51.5017061!3e4", 20 | "https://a.b.c.d.e.f.g.h.i.j.k.l.m.n.oo.pp.qqq.rrrr.ssssss.tttttttt.uuuuuuuuuuu.vvvvvvvvvvvvvvv.wwwwwwwwwwwwwwwwwwwwww.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy.zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.cc", 21 | } 22 | 23 | benchmarks := []struct { 24 | name string 25 | }{ 26 | {"GoFastTld"}, // this module 27 | {"JPilloraGoTld"}, // github.com/jpillora/go-tld 28 | {"JoeGuoTldExtract"}, // github.com/joeguo/tldextract 29 | {"Mjd2021USATldExtract"}, // github.com/mjd2021usa/tldextract 30 | } 31 | 32 | cache := "/tmp/tld.cache" 33 | 34 | for _, benchmarkURL := range benchmarkURLs { 35 | for _, bm := range benchmarks { 36 | if bm.name == "GoFastTld" { 37 | testPSLFilePath, _ := getTestPSLFilePath() 38 | GoFastTld, _ := New(SuffixListParams{ 39 | CacheFilePath: testPSLFilePath, 40 | IncludePrivateSuffix: false, 41 | }) 42 | b.Run(fmt.Sprint(bm.name), func(b *testing.B) { 43 | for i := 0; i < b.N; i++ { 44 | GoFastTld.Extract(URLParams{URL: benchmarkURL}) 45 | } 46 | }) 47 | } else if bm.name == "JPilloraGoTld" { 48 | // Provides the Port and Path subcomponents 49 | // Cannot handle "+://google.com" and IP addresses 50 | // Cannot handle urls without Scheme subcomponent 51 | // Cannot handle trailing whitespace 52 | b.Run(fmt.Sprint(bm.name), func(b *testing.B) { 53 | for i := 0; i < b.N; i++ { 54 | tld.Parse(benchmarkURL) 55 | } 56 | }) 57 | } else if bm.name == "JoeGuoTldExtract" { 58 
| JoeGuoTldExtract, _ := joeguotldextract.New(cache, false) 59 | b.Run(fmt.Sprint(bm.name), func(b *testing.B) { 60 | for i := 0; i < b.N; i++ { 61 | JoeGuoTldExtract.Extract(benchmarkURL) 62 | } 63 | }) 64 | 65 | } else if bm.name == "Mjd2021USATldExtract" { 66 | Mjd2021USATldExtract, _ := mjd2021usatldextract.New(cache, false) 67 | b.Run(fmt.Sprint(bm.name), func(b *testing.B) { 68 | for i := 0; i < b.N; i++ { 69 | Mjd2021USATldExtract.Extract(benchmarkURL) 70 | } 71 | }) 72 | } 73 | } 74 | color.New().Println() 75 | color.New(color.FgHiGreen, color.Bold).Print("Benchmarks completed for URL : ") 76 | color.New(color.FgHiBlue).Println(benchmarkURL) 77 | color.New(color.FgHiWhite).Println("=======") 78 | } 79 | } 80 | 81 | /* 82 | 83 | Omitted modules 84 | 85 | github.com/M507/tlde | Almost exactly the same as github.com/joeguo/tldextract 86 | 87 | github.com/ImVexed/fasturl | Fast, but cannot extract eTLDs 88 | 89 | github.com/weppos/publicsuffix-go | Cannot handle full URLs with scheme (i.e. https:// ftp:// etc.) 90 | 91 | github.com/forease/gotld | Does not extract subdomain properly and cannot handle ip addresses 92 | 93 | */ 94 | -------------------------------------------------------------------------------- /cmd/fasttld/extract.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/elliotwutingfeng/go-fasttld" 7 | "github.com/fatih/color" 8 | "github.com/spf13/cobra" 9 | ) 10 | 11 | var includePrivateSuffix, ignoreSubDomains, toPunyCode bool 12 | 13 | var extractCmd = &cobra.Command{ 14 | Use: "extract", 15 | Aliases: []string{"ext"}, 16 | Short: "Extracts subcomponents from a URL.", 17 | Long: `Extracts subcomponents from a URL. 
18 | 19 | For Example 20 | --- 21 | fasttld extract abc.example.com:5000/a/path 22 | --- 23 | `, 24 | Args: cobra.ExactArgs(1), 25 | Run: func(cmd *cobra.Command, args []string) { 26 | extractor, err := fasttld.New(fasttld.SuffixListParams{IncludePrivateSuffix: includePrivateSuffix}) 27 | if err != nil { 28 | log.Fatal(err) 29 | } 30 | res, err := extractor.Extract(fasttld.URLParams{URL: args[0], IgnoreSubDomains: ignoreSubDomains, ConvertURLToPunyCode: toPunyCode}) 31 | if err != nil { 32 | color.New(color.FgHiRed, color.Bold).Print("Error: ") 33 | color.New(color.FgHiWhite).Println(err) 34 | } 35 | fasttld.PrintRes(args[0], res) 36 | }, 37 | } 38 | 39 | func init() { 40 | extractCmd.Flags().BoolVarP(&includePrivateSuffix, "private-suffix", "p", false, "Include private suffix") 41 | extractCmd.Flags().BoolVarP(&ignoreSubDomains, "ignore-subdomains", "i", false, "Ignore subdomains") 42 | extractCmd.Flags().BoolVarP(&toPunyCode, "to-punycode", "t", false, "Convert to punycode") 43 | rootCmd.AddCommand(extractCmd) 44 | } 45 | -------------------------------------------------------------------------------- /cmd/fasttld/root.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/spf13/cobra" 8 | ) 9 | 10 | var version string = "" 11 | 12 | var rootCmd = &cobra.Command{ 13 | Use: "fasttld", 14 | Version: version, 15 | Short: `fasttld is a high performance effective top level domains (eTLD) extraction module.`, 16 | Long: `fasttld is a high performance effective top level domains (eTLD) extraction module.`, 17 | Run: func(cmd *cobra.Command, args []string) {}, 18 | } 19 | 20 | // Execute runs the cobra.Command CLI 21 | func Execute() { 22 | if err := rootCmd.Execute(); err != nil { 23 | fmt.Fprintf(os.Stderr, "Whoops. 
There was an error while executing your CLI '%s'", err) 24 | os.Exit(1) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "github.com/elliotwutingfeng/go-fasttld/cmd/fasttld" 4 | 5 | // main executes the fasttld CLI 6 | func main() { 7 | fasttld.Execute() 8 | } 9 | -------------------------------------------------------------------------------- /data/gen.go: -------------------------------------------------------------------------------- 1 | // The following directive is necessary to make the package coherent: 2 | 3 | //go:build ignore 4 | // +build ignore 5 | 6 | // This program generates fallback.go. It can be invoked by running 7 | // go generate 8 | 9 | //go:generate go run gen.go 10 | 11 | package main 12 | 13 | import ( 14 | "log" 15 | "net/http" 16 | "os" 17 | "text/template" 18 | "time" 19 | 20 | "github.com/spf13/afero" 21 | ) 22 | 23 | func main() { 24 | const url = "https://publicsuffix.org/list/public_suffix_list.dat" 25 | 26 | rsp, err := http.Get(url) 27 | fail(err) 28 | defer rsp.Body.Close() 29 | 30 | b, err := afero.ReadAll(rsp.Body) 31 | fail(err) 32 | content := string(b) 33 | 34 | f, err := os.Create("../fallback.go") 35 | f.Seek(0, 0) 36 | fail(err) 37 | defer f.Close() 38 | 39 | pslTemplate.Execute(f, struct { 40 | Timestamp time.Time 41 | URL string 42 | Content string 43 | }{ 44 | Timestamp: time.Now(), 45 | URL: url, 46 | Content: content, 47 | }) 48 | } 49 | 50 | func fail(err error) { 51 | if err != nil { 52 | log.Fatal(err) 53 | } 54 | } 55 | 56 | var pslTemplate = template.Must(template.New("").Parse(`package fasttld 57 | 58 | // Code generated by go generate; DO NOT EDIT. 
59 | // This file was generated by robots at 60 | // {{ .Timestamp }} 61 | // using data from 62 | // {{ .URL }} 63 | 64 | const hardcodedPSL string = ` + "`{{ .Content }}`\n")) 65 | -------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elliotwutingfeng/go-fasttld/bcce76cf9926fe015be8ef1f5cb354bbbd9f5165/demo.gif -------------------------------------------------------------------------------- /examples/demo.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/elliotwutingfeng/go-fasttld" 7 | "github.com/fatih/color" 8 | ) 9 | 10 | func main() { 11 | var fontStyle = []color.Attribute{color.FgHiWhite, color.Bold} 12 | 13 | // Hostname 14 | url := "https://user@a.subdomain.example.a%63.uk:5000/a/b?id=42" 15 | 16 | extractor, err := fasttld.New(fasttld.SuffixListParams{}) 17 | // or instead, specify a custom public suffix list file via SuffixListParams 18 | 19 | if err != nil { 20 | log.Fatal(err) 21 | } 22 | 23 | res, _ := extractor.Extract(fasttld.URLParams{URL: url}) 24 | color.New(fontStyle...).Println("Hostname") 25 | fasttld.PrintRes(url, res) 26 | 27 | // IPv4 Address 28 | url = "https://127.0.0.1:5000" 29 | 30 | res, _ = extractor.Extract(fasttld.URLParams{URL: url}) 31 | color.New(fontStyle...).Println("IPv4 Address") 32 | fasttld.PrintRes(url, res) 33 | 34 | // IPv6 Address 35 | url = "https://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]:5000" 36 | 37 | res, _ = extractor.Extract(fasttld.URLParams{URL: url}) 38 | color.New(fontStyle...).Println("IPv6 Address") 39 | fasttld.PrintRes(url, res) 40 | 41 | // Internationalised label separators 42 | url = "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk" 43 | 44 | res, _ = extractor.Extract(fasttld.URLParams{URL: url}) 45 | 
color.New(fontStyle...).Println("Internationalised label separators") 46 | fasttld.PrintRes(url, res) 47 | 48 | // Manually update local cache 49 | if err := extractor.Update(); err != nil { 50 | log.Println(err) 51 | } 52 | 53 | // Private domains 54 | url = "https://google.blogspot.com" 55 | 56 | extractor, _ = fasttld.New(fasttld.SuffixListParams{}) 57 | res, _ = extractor.Extract(fasttld.URLParams{URL: url}) 58 | color.New(fontStyle...).Println("Exclude Private Domains") 59 | fasttld.PrintRes(url, res) 60 | 61 | extractor, _ = fasttld.New(fasttld.SuffixListParams{IncludePrivateSuffix: true}) 62 | res, _ = extractor.Extract(fasttld.URLParams{URL: url}) 63 | color.New(fontStyle...).Println("Include Private Domains") 64 | fasttld.PrintRes(url, res) 65 | 66 | // Ignore Subdomains 67 | url = "https://maps.google.com" 68 | 69 | extractor, _ = fasttld.New(fasttld.SuffixListParams{}) 70 | res, _ = extractor.Extract(fasttld.URLParams{URL: url, IgnoreSubDomains: true}) 71 | color.New(fontStyle...).Println("Ignore Subdomains") 72 | fasttld.PrintRes(url, res) 73 | 74 | // Punycode 75 | url = "https://hello.世界.com" 76 | 77 | res, _ = extractor.Extract(fasttld.URLParams{URL: url}) 78 | color.New(fontStyle...).Println("No Punycode") 79 | fasttld.PrintRes(url, res) 80 | 81 | res, _ = extractor.Extract(fasttld.URLParams{URL: url, ConvertURLToPunyCode: true}) 82 | color.New(fontStyle...).Println("Punycode") 83 | fasttld.PrintRes(url, res) 84 | 85 | // Parsing errors 86 | url = "https://example!.com" // invalid characters in hostname 87 | 88 | color.New(fontStyle...).Println("Parsing errors") 89 | color.New().Println("The following line should be an error message") 90 | if res, err = extractor.Extract(fasttld.URLParams{URL: url}); err != nil { 91 | color.New(color.FgHiRed, color.Bold).Print("Error: ") 92 | color.New(color.FgHiWhite).Println(err) 93 | } 94 | fasttld.PrintRes(url, res) // Partially extracted subcomponents can still be retrieved 95 | } 96 | 
-------------------------------------------------------------------------------- /fasttld.go: -------------------------------------------------------------------------------- 1 | // Package fasttld is a high performance effective top level domains (eTLD) 2 | // extraction module implemented with compressed tries. 3 | // 4 | // This module is a port of the Python fasttld module, 5 | // with additional modifications to support extraction 6 | // of subcomponents from full URLs, IPv4 addresses, and IPv6 addresses. 7 | package fasttld 8 | 9 | import ( 10 | "errors" 11 | "log" 12 | "net/url" 13 | "strconv" 14 | "strings" 15 | 16 | "github.com/spf13/afero" 17 | "github.com/tidwall/hashmap" 18 | "golang.org/x/net/idna" 19 | ) 20 | 21 | const defaultPSLFolder string = "data" 22 | const defaultPSLFileName string = "public_suffix_list.dat" 23 | const largestPortNumber int = 65535 24 | const pslMaxAgeHours float64 = 72 25 | 26 | // FastTLD provides the Extract() function, to extract 27 | // URLs using tldTrie generated from the 28 | // Public Suffix List file at cacheFilePath. 29 | type FastTLD struct { 30 | cacheFilePath string 31 | tldTrie *trie 32 | includePrivateSuffix bool 33 | } 34 | 35 | // HostType indicates whether parsed URL 36 | // contains a HostName, IPv4 address, IPv6 address 37 | // or none of them 38 | type HostType int 39 | 40 | // None, HostName, IPv4 and IPv6 indicate whether parsed URL 41 | // contains a HostName, IPv4 address, IPv6 address 42 | // or none of them 43 | const ( 44 | None HostType = iota 45 | HostName 46 | IPv4 47 | IPv6 48 | ) 49 | 50 | // ExtractResult contains components extracted from URL. 51 | type ExtractResult struct { 52 | Scheme, UserInfo, SubDomain, Domain, Suffix, RegisteredDomain, Port, Path string 53 | HostType HostType 54 | } 55 | 56 | // SuffixListParams contains parameters for specifying path to Public Suffix List file and 57 | // whether to extract private suffixes (e.g. blogspot.com). 
58 | type SuffixListParams struct { 59 | CacheFilePath string 60 | IncludePrivateSuffix bool 61 | } 62 | 63 | // URLParams specifies URL to extract components from. 64 | // 65 | // If IgnoreSubDomains = true, do not extract SubDomain. 66 | // 67 | // If ConvertURLToPunyCode = true, convert non-ASCII characters like 世界 to punycode. 68 | type URLParams struct { 69 | URL string 70 | IgnoreSubDomains bool 71 | ConvertURLToPunyCode bool 72 | } 73 | 74 | // trie is a node of the compressed trie 75 | // used to store Public Suffix List eTLDs. 76 | type trie struct { 77 | matches hashmap.Map[string, *trie] 78 | end bool 79 | } 80 | 81 | // nestedDict stores a slice of keys in the trie, by traversing the trie using the keys as a "path", 82 | // creating new tries for keys that do not exist yet. 83 | // 84 | // If a new path overlaps an existing path, flag the previous path's trie node as end = true. 85 | func nestedDict(dic *trie, keys []string) { 86 | for _, key := range keys { 87 | if _, ok := dic.matches.Get(key); !ok { 88 | // key doesn't exist; add new node 89 | var m hashmap.Map[string, *trie] 90 | dic.matches.Set(key, &trie{matches: m}) 91 | } 92 | dic, _ = dic.matches.Get(key) 93 | } 94 | // set last node to end = true 95 | dic.end = true 96 | } 97 | 98 | // trieConstruct constructs a compressed trie to store Public Suffix List eTLDs split at "." in reverse-order. 99 | // 100 | // For example: "us.gov.pl" will be stored in the order {"pl", "gov", "us"}. 
101 | func trieConstruct(includePrivateSuffix bool, cacheFilePath string) (*trie, error) { 102 | var m hashmap.Map[string, *trie] 103 | tldTrie := &trie{matches: m} 104 | 105 | var suffixLists suffixes 106 | var err error 107 | if cacheFilePath != "" { 108 | suffixLists, err = getPublicSuffixList(cacheFilePath) 109 | } else { 110 | suffixLists, err = getHardcodedPublicSuffixList() 111 | } 112 | 113 | if err != nil { 114 | log.Println(err) 115 | return tldTrie, err 116 | } 117 | 118 | var suffixList []string 119 | if includePrivateSuffix { 120 | suffixList = suffixLists.allSuffixes 121 | } else { 122 | suffixList = suffixLists.publicSuffixes 123 | } 124 | 125 | for _, suffix := range suffixList { 126 | sp := strings.Split(suffix, ".") 127 | reverse(sp) 128 | nestedDict(tldTrie, sp) 129 | } 130 | 131 | tldTrie.matches.Scan(func(key string, value *trie) bool { 132 | if _, ok := value.matches.Get("*"); ok { 133 | value.end = true 134 | } 135 | return true 136 | }) 137 | 138 | return tldTrie, nil 139 | } 140 | 141 | // Extract components from a given `url`. 
func (f *FastTLD) Extract(e URLParams) (ExtractResult, error) {
	// Overview (netloc always holds the part of the input still to be parsed):
	//   1. trim surrounding whitespace, then peel off Scheme and UserInfo;
	//   2. locate the end of the host, validating any square brackets;
	//   3. a bracketed host must be a valid IPv6 address;
	//   4. split off Port and "Path" (everything after the host and port);
	//   5. percent-decode the host and either convert it to punycode or
	//      check that it can be converted to Unicode;
	//   6. walk f.tldTrie over the host labels from right to left to find the Suffix;
	//   7. a host that is an IPv4 address is returned without a Suffix;
	//   8. otherwise split the host into SubDomain, Domain and Suffix.
	//
	// On every error path the partially filled urlParts is returned together
	// with the error, so whatever was extracted up to that point stays readable.
	urlParts := ExtractResult{}

	// Extract URL scheme
	netloc := fastTrim(e.URL, whitespaceRuneSet, trimBoth)
	if schemeEndIndex := getSchemeEndIndex(netloc); schemeEndIndex != -1 {
		urlParts.Scheme = netloc[0:schemeEndIndex]
		netloc = netloc[schemeEndIndex:]
	}

	// Extract URL userinfo
	if atIdx := indexLastByteBefore(netloc, '@', invalidUserInfoCharsSet); atIdx != -1 {
		urlParts.UserInfo = netloc[0:atIdx]
		netloc = netloc[atIdx+1:]
	}

	// Find square brackets (if any) and host end index
	// All three indices are byte offsets into netloc; -1 means "not found".
	openingSquareBracketIdx := -1
	closingSquareBracketIdx := -1
	hostEndIdx := -1

	for i, r := range []byte(netloc) {
		if r == '[' {
			// Check for opening square bracket
			if i > 0 {
				// Reject if opening square bracket is not first character of hostname
				return urlParts, errors.New("opening square bracket is not first character of hostname")
			}
			openingSquareBracketIdx = i
		}
		if r == ']' {
			// Check for closing square bracket
			closingSquareBracketIdx = i
		}

		if openingSquareBracketIdx == -1 {
			if closingSquareBracketIdx != -1 {
				// Reject if closing square bracket present but no opening square bracket
				return urlParts, errors.New("closing square bracket present but no opening square bracket")
			}
			if endOfHostDelimitersSet.contains(r) {
				// If no square brackets
				// Check for endOfHostDelimitersSet
				hostEndIdx = i
				break
			}
		} else if closingSquareBracketIdx > openingSquareBracketIdx && endOfHostWithPortDelimitersSet.contains(r) {
			// If opening + closing square bracket are present in correct order
			// check for endOfHostWithPortDelimitersSet
			hostEndIdx = i
			break
		}

		if i == len(netloc)-1 && closingSquareBracketIdx < openingSquareBracketIdx {
			// Reject if end of netloc reached but incomplete square bracket pair
			return urlParts, errors.New("incomplete square bracket pair")
		}
	}

	// A closing bracket overrides whatever the scan above found: the host ends
	// right after ']', or nothing follows the host when ']' is the last byte.
	if closingSquareBracketIdx == len(netloc)-1 {
		hostEndIdx = -1
	} else if closingSquareBracketIdx != -1 {
		hostEndIdx = closingSquareBracketIdx + 1
	}

	// Check for IPv6 address
	if closingSquareBracketIdx > openingSquareBracketIdx {
		if !isIPv6(netloc[1:closingSquareBracketIdx]) {
			// Have square brackets but invalid IPv6 address => Domain is invalid
			return urlParts, errors.New("invalid IPv6 address")
		}
		if hostEndIdx != -1 {
			afterHost := netloc[hostEndIdx:]
			if indexAnyASCII(afterHost, endOfHostDelimitersSet) != 0 {
				// Reject IPv6 if there are invalid trailing characters after IPv6 address
				return urlParts, errors.New("invalid trailing characters after IPv6 address")
			}
		}
		// Closing square bracket in correct place and IPv6 is valid
		// Domain and RegisteredDomain hold the address without the brackets.
		urlParts.HostType = IPv6
		urlParts.Domain = netloc[1:closingSquareBracketIdx]
		urlParts.RegisteredDomain = netloc[1:closingSquareBracketIdx]
	}

	var afterHost string
	// Separate URL host from subcomponents thereafter
	if hostEndIdx != -1 {
		afterHost = netloc[hostEndIdx:]
		netloc = netloc[0:hostEndIdx]
	}

	// Extract Port and "Path" if any
	if len(afterHost) != 0 {
		pathStartIndex := indexAnyASCII(afterHost, endOfHostWithPortDelimitersSet)
		if afterHost[0] == ':' {
			var maybePort string
			if pathStartIndex == -1 {
				maybePort = afterHost[1:]
			} else {
				maybePort = afterHost[1:pathStartIndex]
			}
			// The port must parse as an integer within [0, largestPortNumber].
			// NOTE(review): strconv.Atoi also accepts a leading sign, so "+80"
			// passes this check — confirm whether that is intended.
			if port, err := strconv.Atoi(maybePort); err == nil && 0 <= port && port <= largestPortNumber {
				urlParts.Port = maybePort
			} else {
				return urlParts, errors.New("invalid port")
			}
		}
		// NOTE(review): presumably indexAnyASCII never returns len(afterHost),
		// which would make the second condition always true — confirm.
		if pathStartIndex != -1 && pathStartIndex != len(afterHost) {
			// If there is any path/query/fragment after the URL authority component...
			// See https://stackoverflow.com/questions/47543432/what-do-we-call-the-combined-path-query-and-fragment-in-a-uri
			// For simplicity, we shall call this the "Path".
			urlParts.Path = afterHost[pathStartIndex:]
		}
	}

	// An IPv6 host has no labels to split, so extraction ends here.
	if urlParts.HostType == IPv6 {
		return urlParts, nil
	}

	// decode all percentage encoded characters, if any
	unescapedNetloc, err := url.QueryUnescape(netloc)
	if err != nil {
		return urlParts, err
	}

	// netloc is replaced only in the punycode branch; otherwise the unescaped
	// form is used for validation alone and netloc keeps its original bytes.
	if e.ConvertURLToPunyCode {
		netloc = formatAsPunycode(unescapedNetloc)
	} else if _, err := idna.ToUnicode(unescapedNetloc); err != nil {
		// host is invalid if host cannot be converted to Unicode
		//
		// skip if host already converted to punycode
		log.Println(strings.SplitAfterN(err.Error(), "idna: invalid label", 2)[0])
		return urlParts, err
	}

	// Check for eTLD Suffix
	node := f.tldTrie

	var (
		hasSuffix      bool // a trie node flagged end = true has been matched
		hasLabels      bool // a non-empty label has been seen
		end            bool // the left-most label has been reached
		previousSepIdx int  // right boundary of the label currently examined
	)
	// sepIdx: index of the separator left of the current label.
	// suffixStartIdx: sepIdx at the most recent trie match.
	// suffixEndIdx: end of the suffix, excluding trailing separators.
	sepIdx, suffixStartIdx, suffixEndIdx := len(netloc), len(netloc), len(netloc)

	for !end {
		var label string
		previousSepIdx = sepIdx
		sepIdx = lastIndexAny(netloc[0:sepIdx], labelSeparatorsRuneSet)
		if sepIdx != -1 {
			label = netloc[sepIdx+sepSize(netloc[sepIdx]) : previousSepIdx]
			if len(label) == 0 {
				// allow consecutive label separators if suffix not found yet
				if !hasLabels {
					suffixEndIdx = sepIdx
					continue
				}
				// any occurrences of consecutive label separators on left-hand side of a label are illegal.
				return urlParts, errors.New("invalid consecutive label separators on left-hand side of a label")
			}
			hasLabels = true
		} else {
			label = netloc[0:previousSepIdx]
			end = true
		}

		if _, ok := node.matches.Get("*"); ok {
			// check if label falls under any wildcard exception rule
			// e.g. !www.ck
			if _, ok := node.matches.Get("!" + label); ok {
				// exception rule: restore sepIdx to the right boundary of this label
				sepIdx = previousSepIdx
			}
			break
		}

		// check if label is part of an eTLD
		label, _ = url.QueryUnescape(label)
		if val, ok := node.matches.Get(label); ok {
			suffixStartIdx = sepIdx
			if !hasSuffix && val.end {
				// index of end of suffix without trailing label separators
				suffixEndIdx = previousSepIdx
				hasSuffix = true
			}
			node = val
			if val.matches.Len() == 0 {
				// label is at a leaf node (no children) ; break out of loop
				break
			}
		} else {
			// No match: unless this was the right-most label, restore sepIdx
			// to the right boundary of the unmatched label.
			if previousSepIdx != len(netloc) {
				sepIdx = previousSepIdx
			}
			break
		}
	}

	// Check for IPv4 address
	// Minimum possible length: len("0.0.0.0") -> 7
	// Ensure first rune is numeric before expensive isIPv4()
	if len(netloc) >= 7 && numericSet.contains(netloc[0]) && isIPv4(netloc) {
		urlParts.HostType = IPv4
		urlParts.Domain = netloc[0:previousSepIdx]
		urlParts.RegisteredDomain = urlParts.Domain
		return urlParts, nil
	}

	// The walk ran off the left edge of the host: reset both indices to
	// len(netloc), which the branches below treat as "only Suffix exists".
	if sepIdx == -1 {
		sepIdx, suffixStartIdx = len(netloc), len(netloc)
	}

	// Reject if invalidHostNameChars or consecutive label separators
	// appears before Suffix
	if hasSuffix {
		if hasInvalidChars(netloc[0:suffixStartIdx]) {
			return urlParts, errors.New("invalid characters in hostname")
		}
	} else {
		if hasInvalidChars(netloc[0:previousSepIdx]) {
			return urlParts, errors.New("invalid characters in hostname")
		}
	}

	// Index of the separator between SubDomain and Domain; -1 if there is no SubDomain.
	var domainStartSepIdx int
	if hasSuffix {
		if sepIdx < len(netloc) { // If there is a Domain
			urlParts.Suffix = netloc[sepIdx+sepSize(netloc[sepIdx]) : suffixEndIdx]
			domainStartSepIdx = lastIndexAny(netloc[0:sepIdx], labelSeparatorsRuneSet)
			if domainStartSepIdx != -1 { // If there is a SubDomain
				domainStartIdx := domainStartSepIdx + sepSize(netloc[domainStartSepIdx])
				urlParts.Domain = netloc[domainStartIdx:sepIdx]
				urlParts.RegisteredDomain = netloc[domainStartIdx:suffixEndIdx]
			} else {
				urlParts.Domain = netloc[0:sepIdx]
				urlParts.RegisteredDomain = netloc[0:suffixEndIdx]
			}
		} else {
			// Only Suffix exists
			// (Domain stays empty, so the "empty domain" error below is returned.)
			urlParts.Suffix = netloc[0:suffixEndIdx]
		}
	} else {
		// No known suffix: the right-most label becomes the Domain.
		domainStartSepIdx = lastIndexAny(netloc[0:suffixEndIdx], labelSeparatorsRuneSet)
		var domainStartIdx int
		if domainStartSepIdx != -1 { // If there is a SubDomain
			domainStartIdx = domainStartSepIdx + sepSize(netloc[domainStartSepIdx])
		}
		urlParts.Domain = netloc[domainStartIdx:suffixEndIdx]
	}
	if !e.IgnoreSubDomains && domainStartSepIdx != -1 { // If SubDomain is to be included
		urlParts.SubDomain = netloc[0:domainStartSepIdx]
	}

	if len(urlParts.Domain) == 0 {
		return urlParts, errors.New("empty domain")
	}
	urlParts.HostType = HostName
	return urlParts, nil
}
403 | func New(n SuffixListParams) (*FastTLD, error) { 404 | extractor := &FastTLD{cacheFilePath: n.CacheFilePath, tldTrie: &trie{}, includePrivateSuffix: n.IncludePrivateSuffix} 405 | // If cacheFilePath is unreachable, use temporary folder 406 | if isValid, _ := checkCacheFile(extractor.cacheFilePath); !isValid { 407 | filesystem := new(afero.OsFs) 408 | defaultCacheFolderPath := afero.GetTempDir(filesystem, "") 409 | defaultCacheFilePath := defaultCacheFolderPath + defaultPSLFileName 410 | defaultCacheFolder, err := filesystem.Open(defaultCacheFolderPath) 411 | if err != nil { 412 | // temporary folder not accessible, fallback to hardcoded Public Suffix list 413 | return newHardcodedPSL(err, n) 414 | } 415 | defer defaultCacheFolder.Close() 416 | extractor.cacheFilePath = defaultCacheFilePath 417 | isValid, lastModifiedHours := checkCacheFile(extractor.cacheFilePath) 418 | if !isValid || lastModifiedHours > pslMaxAgeHours { 419 | // update Public Suffix list cache if it is outdated 420 | if updateErr := extractor.Update(); updateErr != nil { 421 | // update failed, fallback to hardcoded Public Suffix list 422 | return newHardcodedPSL(err, n) 423 | } 424 | return extractor, err 425 | } 426 | } 427 | 428 | tldTrie, err := trieConstruct(n.IncludePrivateSuffix, extractor.cacheFilePath) 429 | if err != nil { 430 | return newHardcodedPSL(err, n) 431 | } 432 | extractor.tldTrie = tldTrie 433 | return extractor, err 434 | } 435 | -------------------------------------------------------------------------------- /fasttld_test.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | "reflect" 8 | "strings" 9 | "testing" 10 | 11 | "github.com/tidwall/hashmap" 12 | ) 13 | 14 | var errs = [...]error{ 15 | errors.New("opening square bracket is not first character of hostname"), 16 | errors.New("closing square bracket present but no opening square bracket"), 17 | 
errors.New("invalid square bracket pair"), 18 | errors.New("incomplete square bracket pair"), 19 | errors.New("invalid IPv6 address"), 20 | errors.New("invalid trailing characters after IPv6 address"), 21 | errors.New("invalid consecutive label separators on left-hand side of a label"), 22 | errors.New("invalid characters in hostname before suffix"), 23 | errors.New("invalid characters in hostname"), 24 | errors.New("empty domain"), 25 | errors.New("invalid port"), 26 | } 27 | 28 | func getTestPSLFilePath() (string, bool) { 29 | var sb strings.Builder 30 | currentFilePath, ok := getCurrentFilePath() 31 | if !ok { 32 | return "", ok 33 | } 34 | sb.WriteString(currentFilePath) 35 | sb.WriteString(string(os.PathSeparator)) 36 | sb.WriteString("test") 37 | sb.WriteString(string(os.PathSeparator)) 38 | sb.WriteString(defaultPSLFileName) 39 | return sb.String(), ok 40 | } 41 | 42 | func TestNestedDict(t *testing.T) { 43 | keysSequences := [][][]string{ 44 | {{"a"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"c"}, {"c", "b"}, {"d", "f"}}, 45 | {{"a"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"c"}, {"d", "f"}, {"c", "b"}}, 46 | {{"a"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"c", "b"}, {"c"}, {"d", "f"}}, 47 | {{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a"}, {"a", "b", "c"}, {"a", "b"}}, 48 | {{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a", "b"}, {"a"}, {"a", "b", "c"}}, 49 | {{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a", "b"}, {"a", "b", "c"}, {"a"}}, 50 | {{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a", "b", "c"}, {"a"}, {"a", "b"}}, 51 | {{"c"}, {"a", "d"}, {"c", "b"}, {"d", "f"}, {"a", "b", "c"}, {"a", "b"}, {"a"}}, 52 | {{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"a"}, {"c"}, {"a", "b", "c"}}, 53 | {{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"a"}, {"c"}}, 54 | {{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"a", "b", "c"}, {"c"}, {"a"}}, 55 | {{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"c"}, {"a"}, {"a", "b", "c"}}, 56 | 
{{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b"}, {"c"}, {"a", "b", "c"}, {"a"}}, 57 | {{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b", "c"}, {"a"}, {"a", "b"}, {"c"}}, 58 | {{"d", "f"}, {"c", "b"}, {"a", "d"}, {"a", "b", "c"}, {"a"}, {"c"}, {"a", "b"}}, 59 | } 60 | for _, keysSequence := range keysSequences { 61 | var m hashmap.Map[string, *trie] 62 | originalDict := &trie{matches: m} 63 | for _, keys := range keysSequence { 64 | nestedDict(originalDict, keys) 65 | } 66 | // check each nested value 67 | //Top level c 68 | c, _ := originalDict.matches.Get("c") 69 | if c.matches.Len() != 1 { 70 | t.Errorf("Top level c must have matches map of length 1") 71 | } 72 | if _, ok := c.matches.Get("b"); !ok { 73 | t.Errorf("Top level c must have b in matches map") 74 | } 75 | if !c.end { 76 | t.Errorf("Top level c must have end = true") 77 | } 78 | // Top level a 79 | a, _ := originalDict.matches.Get("a") 80 | if a.matches.Len() != 2 { 81 | t.Errorf("Top level a must have matches map of length 2") 82 | } 83 | // a -> d 84 | aToD, ok := a.matches.Get("d") 85 | if !ok { 86 | t.Errorf("Top level a must have d in matches map") 87 | } 88 | if aToD.matches.Len() != 0 { 89 | t.Errorf("a -> d must have empty matches map") 90 | } 91 | // a -> b 92 | aToB, ok := a.matches.Get("b") 93 | if !ok { 94 | t.Errorf("Top level a must have b in matches map") 95 | } 96 | if !aToB.end { 97 | t.Errorf("a -> b must have end = true") 98 | } 99 | if aToB.matches.Len() != 1 { 100 | t.Errorf("a -> b must have matches map of length 1") 101 | } 102 | // a -> b -> c 103 | aToBToC, ok := aToB.matches.Get("c") 104 | if !ok { 105 | t.Errorf("a -> b must have c in matches map") 106 | } 107 | if aToBToC.matches.Len() != 0 { 108 | t.Errorf("a -> b -> c must have empty matches map") 109 | } 110 | if !a.end { 111 | t.Errorf("Top level a must have end = true") 112 | } 113 | // d -> f 114 | d, _ := originalDict.matches.Get("d") 115 | if d.end { 116 | t.Errorf("Top level d must have end = false") 117 | } 118 | 
dToF, _ := d.matches.Get("f")
		if !dToF.end {
			t.Errorf("d -> f must have end = true")
		}
		if dToF.matches.Len() != 0 {
			t.Errorf("d -> f must have empty matches map")
		}
	}
}

// TestTrieConstruct checks that trieConstruct reports an error for the cache
// file path test/this_file_does_not_exist.dat and no error for an empty path.
func TestTrieConstruct(t *testing.T) {
	if _, err := trieConstruct(false, fmt.Sprintf("test%sthis_file_does_not_exist.dat", string(os.PathSeparator))); err == nil {
		t.Errorf("error returned by trieConstruct should not be nil")
	}
	if _, err := trieConstruct(false, ""); err != nil {
		t.Errorf("error returned by trieConstruct should be nil")
	}
}

// TestTrie builds a trie from test/mini_public_suffix_list.dat and verifies the
// number of top-level entries as well as the shape of the "ac" and "ck" nodes.
func TestTrie(t *testing.T) {
	root, err := trieConstruct(false, fmt.Sprintf("test%smini_public_suffix_list.dat", string(os.PathSeparator)))
	if err != nil {
		// Every check below dereferences the trie, so stop here instead of
		// continuing into a nil pointer dereference.
		t.Fatalf("trieConstruct failed | %q", err)
	}
	if lenTrieMatches := root.matches.Len(); lenTrieMatches != 3 {
		t.Errorf("Expected top level Trie matches map length of 3. Got %d.", lenTrieMatches)
	}

	// The remaining assertions walk through these two nodes, so a missing
	// node is fatal for the test.
	ac, ok := root.matches.Get("ac")
	if !ok {
		t.Fatalf("Top level %q must exist", "ac")
	}
	ck, ok := root.matches.Get("ck")
	if !ok {
		t.Fatalf("Top level %q must exist", "ck")
	}

	if !ac.end {
		t.Errorf("Top level ac must have end = true")
	}
	if !ck.end {
		t.Errorf("Top level ck must have end = true")
	}
	if ck.matches.Len() != 2 {
		t.Errorf("Top level ck must have matches map of length 2")
	}

	// ck must have the children "*" and "!www", both without children of their own.
	for _, label := range []string{"*", "!www"} {
		child, ok := ck.matches.Get(label)
		if !ok {
			t.Errorf("Top level ck must have %s in matches map", label)
			continue
		}
		if child.matches.Len() != 0 {
			t.Errorf("ck -> %s must have empty matches map", label)
		}
	}

	// Each second-level label under ac must be present and have no children.
	for _, tld := range []string{"com", "edu", "gov", "net", "mil", "org"} {
		acToTld, ok := ac.matches.Get(tld)
		if !ok {
			t.Errorf("Top level ac must have %q in matches map", tld)
			continue
		}
		if acToTld.matches.Len() != 0 {
			t.Errorf("ac -> %q must have empty matches map", tld)
		}
	}
}

// newTest is one TestNew case: the suffix list file to load, whether private
// suffixes are requested, and the expected number of top-level trie keys.
type newTest struct {
	cacheFilePath        string
	includePrivateSuffix bool
	expected             int
}

// newTests lists the suffix list files exercised by TestNew.
var newTests = []newTest{
	{cacheFilePath: fmt.Sprintf("test%spublic_suffix_list.dat", string(os.PathSeparator)), includePrivateSuffix: false, expected: 1656},
	{cacheFilePath: fmt.Sprintf("test%spublic_suffix_list.dat", string(os.PathSeparator)), includePrivateSuffix: true, expected: 1656},
	{cacheFilePath: fmt.Sprintf("test%smini_public_suffix_list.dat", string(os.PathSeparator)), includePrivateSuffix: true, expected: 4},
}

// TestNew constructs an extractor for every entry in newTests and compares the
// number of top-level keys in its trie with the expected count.
func TestNew(t *testing.T) {
	for _, test := range newTests {
		cacheFilePath := test.cacheFilePath
		if cacheFilePath == "" {
			testPSLFilePath, ok := getTestPSLFilePath()
			if !ok {
				t.Errorf("Cannot get path to current module file")
				// Without a path this case cannot be set up; move on to the next one.
				continue
			}
			cacheFilePath = testPSLFilePath
		}
		extractor, err := New(SuffixListParams{
			CacheFilePath:        cacheFilePath,
			IncludePrivateSuffix: test.includePrivateSuffix,
		})
		if err != nil {
			// The extractor is dereferenced below, so it must not be used
			// after a failed construction.
			t.Errorf("New failed for cache file %q | %v", cacheFilePath, err)
			continue
		}
		if numTopLevelKeys := extractor.tldTrie.matches.Len(); numTopLevelKeys != test.expected {
			t.Errorf("Expected number of top level keys to be %d. Got %d.", test.expected, numTopLevelKeys)
		}
	}
}

// extractTest is one table-driven extraction case: the input urlParams, the
// expected ExtractResult and error, and a human-readable description.
// NOTE(review): includePrivateSuffix presumably selects an extractor built with
// private suffixes enabled; confirm against the code that runs these tables.
type extractTest struct {
	includePrivateSuffix bool
	urlParams            URLParams
	expected             ExtractResult
	err                  error
	description          string
}

// schemeTests holds URLs that start with an explicit scheme.
var schemeTests = []extractTest{
	{urlParams: URLParams{URL: "h://example.com"},
		expected: ExtractResult{
			Scheme: "h://", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Single character Scheme"},
	{urlParams: URLParams{URL: "hTtPs://example.com"},
		expected: ExtractResult{
			Scheme: "hTtPs://", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Capitalised Scheme"},
	{urlParams: URLParams{URL: "git-ssh://example.com"},
		expected: ExtractResult{
			Scheme: "git-ssh://", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Scheme with -"},
	{urlParams: URLParams{URL: "https://username:password@foo.example.com:999/some/path?param1=value1&param2=葡萄"},
		expected: ExtractResult{
			Scheme: "https://", UserInfo: "username:password", SubDomain: "foo",
			Domain: "example", Suffix: "com", RegisteredDomain: "example.com",
			Port: "999", Path: "/some/path?param1=value1&param2=葡萄", HostType: HostName}, description: "Full https URL with SubDomain"},
	{urlParams: URLParams{URL: "http://www.example.com"},
		expected: ExtractResult{
			Scheme: "http://", SubDomain: "www",
			Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName},
		description: "Full http URL with SubDomain no path"},
	{urlParams: URLParams{
		URL: "http://example.co.uk/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F"},
		expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "co.uk",
			RegisteredDomain: "example.co.uk",
			Path:             "/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F",
			HostType:         HostName},
		description: "Full http URL with no SubDomain"},
	{urlParams: URLParams{
		URL: "http://big.long.sub.domain.example.co.uk/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F"},
		expected: ExtractResult{Scheme: "http://", SubDomain: "big.long.sub.domain",
			Domain: "example", Suffix: "co.uk", RegisteredDomain: "example.co.uk",
			Path:     "/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F",
			HostType: HostName},
		description: "Full http URL with SubDomain"},
	{urlParams: URLParams{
		URL: "ftp://username名字:password@mail.example.co.uk:666/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F"},
		expected: ExtractResult{Scheme: "ftp://", UserInfo: "username名字:password", SubDomain: "mail",
			Domain: "example", Suffix: "co.uk", RegisteredDomain: "example.co.uk", Port: "666",
			Path:     "/path?param1=value1&param2=葡萄&param3=value3&param4=value4&src=https%3A%2F%2Fwww.example.net%2F",
			HostType: HostName},
		description: "Full ftp URL with SubDomain"},
	{urlParams: URLParams{URL: "git+ssh://www.example.com/"},
		expected: ExtractResult{Scheme: "git+ssh://", SubDomain: "www",
			Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/", HostType: HostName}, description: "Full git+ssh URL with SubDomain"},
	{urlParams: URLParams{URL: "ssh://server.example.com/"},
		expected: ExtractResult{Scheme: "ssh://", SubDomain: "server",
			Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/", HostType: HostName}, description: "Full ssh URL with SubDomain"},
	{urlParams: URLParams{URL: "http://www.www.net"},
		expected: ExtractResult{Scheme: "http://", SubDomain: "www",
			Domain: "www", Suffix: "net", RegisteredDomain: "www.net", HostType: HostName},
description: "Multiple www"}, 277 | } 278 | var noSchemeTests = []extractTest{ 279 | {urlParams: URLParams{URL: "localhost"}, expected: ExtractResult{Domain: "localhost", HostType: HostName}, description: "localhost"}, 280 | {urlParams: URLParams{URL: "16777215"}, expected: ExtractResult{Domain: "16777215", HostType: HostName}, description: "Number >= 0xFFFFFF"}, 281 | {urlParams: URLParams{URL: "org"}, expected: ExtractResult{Suffix: "org"}, err: errs[9], description: "Single eTLD | Suffix Only"}, 282 | {urlParams: URLParams{URL: "org."}, expected: ExtractResult{Suffix: "org"}, err: errs[9], description: "Single eTLD | Suffix Only with single trailing dot"}, // RFC 1034 - allow single trailing dot 283 | {urlParams: URLParams{URL: "org.."}, expected: ExtractResult{}, err: errs[8], description: "Single eTLD | Suffix Only with 2 trailing dots"}, 284 | {urlParams: URLParams{URL: "co.th"}, expected: ExtractResult{Suffix: "co.th"}, err: errs[9], description: "Double eTLD | Suffix Only"}, 285 | {urlParams: URLParams{URL: "co.th."}, expected: ExtractResult{Suffix: "co.th"}, err: errs[9], description: "Double eTLD | Suffix Only with single trailing dot"}, // RFC 1034 - allow single trailing dot 286 | {urlParams: URLParams{URL: "co.th.."}, expected: ExtractResult{}, err: errs[8], description: "Double eTLD | Suffix Only with 2 trailing dots"}, 287 | {urlParams: URLParams{URL: "users@example.com"}, expected: ExtractResult{UserInfo: "users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "UserInfo + Domain | No Scheme"}, 288 | {urlParams: URLParams{URL: "mailto:users@example.com"}, expected: ExtractResult{UserInfo: "mailto:users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Mailto | No Scheme"}, 289 | {urlParams: URLParams{URL: "example.com:999"}, expected: ExtractResult{Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Port: "999", HostType: 
HostName}, description: "Domain + Port | No Scheme"},
	{urlParams: URLParams{URL: "example.com"}, expected: ExtractResult{Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Domain | No Scheme"},
	{urlParams: URLParams{URL: "255.255.example.com"}, expected: ExtractResult{SubDomain: "255.255", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "Numeric SubDomain + Domain | No Scheme"},
	{urlParams: URLParams{URL: "server.example.com/path"}, expected: ExtractResult{SubDomain: "server", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/path", HostType: HostName}, description: "SubDomain, Domain and Path | No Scheme"},
}

// userInfoTests holds URLs whose authority carries a UserInfo component,
// including empty usernames or passwords and literal '@' characters inside
// the UserInfo and the Path.
var userInfoTests = []extractTest{
	{urlParams: URLParams{URL: "https://username@example.com"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: "username", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "username"},
	// A username and a password separated by a colon, as the description states.
	{urlParams: URLParams{URL: "https://username:password@example.com"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: "username:password", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "username + password"},
	{urlParams: URLParams{URL: "https://:password@example.com"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: ":password", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "colon but empty username"},
	{urlParams: URLParams{URL: "https://username:@example.com"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: "username:", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "colon but empty password"},
	{urlParams: URLParams{URL: "https://usern@me:password@example.com"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: "usern@me:password", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "@ in username"},
	{urlParams: URLParams{URL: "https://usern@me:p@ssword@example.com"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: "usern@me:p@ssword", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "@ in password"},
	{urlParams: URLParams{URL: "https://usern@me:@example.com"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: "usern@me:", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "colon but empty password; @ in username"},
	{urlParams: URLParams{URL: "https://:p@ssword@example.com"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: ":p@ssword", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", HostType: HostName}, description: "colon but empty username; @ in password"},
	{urlParams: URLParams{URL: "https://usern@m%40e:password@example.com/p@th?q=@go"}, expected: ExtractResult{Scheme: "https://",
		UserInfo: "usern@m%40e:password", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/p@th?q=@go", HostType: HostName}, description: "@ in UserInfo and Path"},
}
var ipv4Tests = []extractTest{
	{urlParams: URLParams{URL: "127.0.0.1"},
		expected: ExtractResult{Domain: "127.0.0.1",
			RegisteredDomain: "127.0.0.1", HostType: IPv4}, description: "Basic IPv4 Address"},
	{urlParams: URLParams{URL: "http://127.0.0.1:5000"},
		expected: ExtractResult{
			Scheme: "http://", Domain: "127.0.0.1", RegisteredDomain: "127.0.0.1", Port: "5000", HostType: IPv4},
		description: "Basic IPv4 Address with Scheme and Port"},
	{urlParams: URLParams{URL: "127\uff0e0\u30020\uff611"},
		expected: ExtractResult{Domain: "127\uff0e0\u30020\uff611",
			RegisteredDomain: "127\uff0e0\u30020\uff611",
HostType: IPv4}, description: "Basic IPv4 Address | Internationalised label separators"}, 325 | {urlParams: URLParams{URL: "http://127\uff0e0\u30020\uff611:5000"}, 326 | expected: ExtractResult{Scheme: "http://", Domain: "127\uff0e0\u30020\uff611", Port: "5000", 327 | RegisteredDomain: "127\uff0e0\u30020\uff611", HostType: IPv4}, description: "Basic IPv4 Address with Scheme and Port | Internationalised label separators"}, 328 | } 329 | var ipv6Tests = []extractTest{ 330 | {urlParams: URLParams{URL: "[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]"}, 331 | expected: ExtractResult{Domain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", 332 | RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", HostType: IPv6}, description: "Basic IPv6 Address"}, 333 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]:5000"}, 334 | expected: ExtractResult{ 335 | Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", Port: "5000", 336 | HostType: IPv6}, 337 | description: "Basic IPv6 Address with Scheme and Port"}, 338 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]:5000"}, 339 | expected: ExtractResult{ 340 | Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1", RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1", Port: "5000", 341 | HostType: IPv6}, 342 | description: "Basic IPv6 Address + trailing IPv4 address with Scheme and Port"}, 343 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611]:5000"}, 344 | expected: ExtractResult{ 345 | Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611", 346 | RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e0\u30020\uff611", Port: "5000", 347 | HostType: IPv6}, 348 | description: "Basic IPv6 Address + trailing IPv4 address with Scheme and Port | Internationalised label separators"}, 349 | {urlParams: URLParams{URL: 
"http://[::2345:6789:aBcD:ef01:2345:678]:5000"}, 350 | expected: ExtractResult{Scheme: "http://", Domain: "::2345:6789:aBcD:ef01:2345:678", 351 | RegisteredDomain: "::2345:6789:aBcD:ef01:2345:678", Port: "5000", HostType: IPv6}, 352 | description: "Basic IPv6 Address with Scheme and Port | have leading ellipsis"}, 353 | {urlParams: URLParams{URL: "http://[::]:5000"}, 354 | expected: ExtractResult{Scheme: "http://", Domain: "::", 355 | RegisteredDomain: "::", Port: "5000", HostType: IPv6}, 356 | description: "Basic IPv6 Address with Scheme and Port | only ellipsis"}, 357 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01::]:5000"}, 358 | expected: ExtractResult{Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01::", 359 | RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01::", Port: "5000", HostType: IPv6}, 360 | description: "Basic IPv6 Address with Scheme and Port bad IP with even number of trailing empty hextets"}, 361 | } 362 | var ignoreSubDomainsTests = []extractTest{ 363 | {urlParams: URLParams{URL: "maps.google.com.sg", 364 | IgnoreSubDomains: true}, 365 | expected: ExtractResult{ 366 | Domain: "google", Suffix: "com.sg", 367 | RegisteredDomain: "google.com.sg", HostType: HostName, 368 | }, description: "Ignore SubDomain", 369 | }, 370 | {urlParams: URLParams{URL: "example.za/en", 371 | IgnoreSubDomains: true}, 372 | expected: ExtractResult{ 373 | Domain: "za", Path: "/en", 374 | HostType: HostName}, 375 | description: "za has no 1st-level TLD | IgnoreSubDomains", 376 | }, 377 | {urlParams: URLParams{URL: "https://example.za/en", 378 | IgnoreSubDomains: true}, 379 | expected: ExtractResult{ 380 | Scheme: "https://", 381 | Domain: "za", Path: "/en", 382 | HostType: HostName}, 383 | description: "za has no 1st-level TLD | Scheme + IgnoreSubDomains", 384 | }, 385 | } 386 | var privateSuffixTests = []extractTest{ 387 | {includePrivateSuffix: true, 388 | urlParams: URLParams{URL: 
"https://brb.i.am.going.to.be.blogspot.com:5000/a/b/c/d.txt?id=42"}, 389 | expected: ExtractResult{ 390 | Scheme: "https://", SubDomain: "brb.i.am.going.to", Domain: "be", Suffix: "blogspot.com", 391 | RegisteredDomain: "be.blogspot.com", Port: "5000", Path: "/a/b/c/d.txt?id=42", HostType: HostName, 392 | }, description: "Include Private Suffix"}, 393 | {includePrivateSuffix: true, 394 | urlParams: URLParams{URL: "global.prod.fastly.net"}, 395 | expected: ExtractResult{ 396 | Suffix: "global.prod.fastly.net", 397 | }, err: errs[9], description: "Include Private Suffix | Suffix only"}, 398 | } 399 | var periodsAndWhiteSpacesTests = []extractTest{ 400 | {urlParams: URLParams{URL: "http://127.0.0.1.."}, 401 | expected: ExtractResult{Scheme: "http://", Domain: "127.0.0.1", RegisteredDomain: "127.0.0.1", HostType: IPv4}, description: "Consecutive label separators after IPv4 address", 402 | }, 403 | {urlParams: URLParams{URL: "http://127\uff0e0\u30020\uff611..:5000"}, 404 | expected: ExtractResult{Scheme: "http://", Domain: "127\uff0e0\u30020\uff611", 405 | Port: "5000", RegisteredDomain: "127\uff0e0\u30020\uff611", HostType: IPv4}, description: "Consecutive label separators between IPv4 address and Port", 406 | }, 407 | {urlParams: URLParams{URL: "http://127.0.0.1 "}, 408 | expected: ExtractResult{Scheme: "http://", Domain: "127.0.0.1", RegisteredDomain: "127.0.0.1", HostType: IPv4}, description: "Spaces after IPv4 address", 409 | }, 410 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789] "}, 411 | expected: ExtractResult{Scheme: "http://", Domain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", 412 | RegisteredDomain: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", HostType: IPv6}, description: "Spaces after IPv6 address", 413 | }, 414 | {urlParams: URLParams{URL: "localhost.\u3002"}, expected: ExtractResult{Domain: "localhost", HostType: HostName}, description: "localhost with trailing periods"}, 415 | {urlParams: URLParams{URL: 
"https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk\uff0e\u002e\u3002"}, 416 | expected: ExtractResult{Scheme: "https://", SubDomain: "brb\u002ei\u3002am\uff0egoing\uff61to", Domain: "be", 417 | Suffix: "a\uff61fk", RegisteredDomain: "be\u3002a\uff61fk", HostType: HostName}, 418 | description: "Consecutive label separators after Suffix", 419 | }, 420 | {urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk"}, 421 | expected: ExtractResult{ 422 | Scheme: "https://", SubDomain: "brb\u002ei\u3002am\uff0egoing\uff61to", Domain: "be", Suffix: "a\uff61fk", 423 | RegisteredDomain: "be\u3002a\uff61fk", HostType: HostName, 424 | }, description: "Internationalised label separators", 425 | }, 426 | {urlParams: URLParams{URL: "a\uff61fk"}, 427 | expected: ExtractResult{Suffix: "a\uff61fk"}, err: errs[9], description: "Internationalised label separators | Suffix only", 428 | }, 429 | {urlParams: URLParams{URL: " https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk/a/b/c. \uff61 "}, 430 | expected: ExtractResult{ 431 | Scheme: "https://", SubDomain: "brb\u002ei\u3002am\uff0egoing\uff61to", Domain: "be", Suffix: "a\uff61fk", 432 | RegisteredDomain: "be\u3002a\uff61fk", Path: "/a/b/c. \uff61", HostType: HostName, 433 | }, description: "Surrounded by extra whitespace"}, 434 | 435 | {urlParams: URLParams{URL: " https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk/a/B/c. \uff61 ", 436 | ConvertURLToPunyCode: true}, 437 | expected: ExtractResult{ 438 | Scheme: "https://", SubDomain: "brb.i.am.going.to", Domain: "be", Suffix: "a.fk", 439 | RegisteredDomain: "be.a.fk", Path: "/a/B/c. 
\uff61", HostType: HostName, 440 | }, description: "Surrounded by extra whitespace | PunyCode"}, 441 | {urlParams: URLParams{URL: "http://1.1.1.1 &@2.2.2.2:33/4.4.4.4?1.1.1.1# @3.3.3.3/"}, 442 | expected: ExtractResult{ 443 | Scheme: "http://", UserInfo: "1.1.1.1 &", Domain: "2.2.2.2", 444 | RegisteredDomain: "2.2.2.2", Port: "33", Path: "/4.4.4.4?1.1.1.1# @3.3.3.3/", HostType: IPv4, 445 | }, description: "Whitespace in UserInfo"}, 446 | {urlParams: URLParams{URL: "example.za./en"}, 447 | expected: ExtractResult{SubDomain: "example", Domain: "za", Path: "/en", HostType: HostName}, 448 | description: "za has no 1st-level TLD | One trailing label separator", 449 | }, 450 | {urlParams: URLParams{URL: "example.za.\u3002/en"}, 451 | expected: ExtractResult{SubDomain: "example", Domain: "za", Path: "/en", HostType: HostName}, 452 | description: "za has no 1st-level TLD | 2 trailing label separators", 453 | }, 454 | } 455 | var invalidTests = []extractTest{ 456 | {urlParams: URLParams{URL: "localhost!"}, expected: ExtractResult{}, err: errs[8], description: "localhost + invalid character !"}, 457 | {urlParams: URLParams{URL: "localhost+"}, expected: ExtractResult{}, err: errs[8], description: "localhost + invalid character +"}, 458 | {urlParams: URLParams{URL: "localhost-"}, expected: ExtractResult{}, err: errs[8], description: "localhost + invalid character -"}, 459 | {urlParams: URLParams{}, expected: ExtractResult{}, err: errs[9], description: "empty string"}, 460 | {urlParams: URLParams{URL: "https://"}, expected: ExtractResult{Scheme: "https://"}, err: errs[9], description: "Scheme only"}, 461 | {urlParams: URLParams{URL: "1b://example.com"}, expected: ExtractResult{}, err: errs[10], description: "Scheme beginning with non-alphabet (parser unsuccessfully tries to interpret runes after colon as port"}, 462 | {urlParams: URLParams{URL: "maps.google.com.sg:8589934592/this/path/will/not/be/parsed"}, expected: ExtractResult{}, err: errs[10], description: "Invalid Port 
number"}, 463 | {urlParams: URLParams{URL: "http://.\u3002127.0.0.1"}, 464 | expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Consecutive label separators before IPv4 address", 465 | }, 466 | {urlParams: URLParams{URL: "http://.\u3002[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]"}, 467 | expected: ExtractResult{Scheme: "http://"}, err: errs[0], description: "Consecutive label separators before IPv6 address", 468 | }, 469 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789].."}, 470 | expected: ExtractResult{Scheme: "http://"}, err: errs[5], description: "Consecutive label separators after IPv6 address", 471 | }, 472 | {urlParams: URLParams{URL: "http://example.com :50"}, 473 | expected: ExtractResult{Scheme: "http://", Port: "50"}, err: errs[8], description: "Spaces between domain and Port/Path", 474 | }, 475 | {urlParams: URLParams{URL: "http:// 127.0.0.1"}, 476 | expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Spaces before IPv4 address", 477 | }, 478 | {urlParams: URLParams{URL: "http://127.0.0.1 :50"}, 479 | expected: ExtractResult{Scheme: "http://", Port: "50"}, err: errs[8], description: "Spaces between IPv4 address and Port/Path", 480 | }, 481 | {urlParams: URLParams{URL: "http:// [aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]"}, 482 | expected: ExtractResult{Scheme: "http://"}, err: errs[0], description: "Spaces before IPv6 address", 483 | }, 484 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789] :50"}, 485 | expected: ExtractResult{Scheme: "http://"}, err: errs[5], description: "Spaces between IPv6 address and Port/Path", 486 | }, 487 | {urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe\u3002a\uff61\u3002fk"}, 488 | expected: ExtractResult{Scheme: "https://"}, err: errs[6], description: "Consecutive label separators within Suffix", 489 | }, 490 | {urlParams: URLParams{URL: "example.\u3002za/en"}, 491 | expected: ExtractResult{Path: 
"/en"}, err: errs[6], 492 | description: "za has no 1st-level TLD | Consecutive label separators between labels", 493 | }, 494 | {urlParams: URLParams{URL: ".\u3002a\uff61fk"}, expected: ExtractResult{}, err: errs[8], description: "eTLD only, multiple leading label separators"}, 495 | {urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to\uff0ebe.\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators between Domain and Suffix"}, 496 | {urlParams: URLParams{URL: "https://brb\u002ei\u3002am\uff0egoing\uff61to.\uff0ebe\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators between SubDomain and Domain"}, 497 | {urlParams: URLParams{URL: "https://brb\u002ei\u3002.am.\uff0egoing\uff61to\uff0ebe\u3002a\uff61fk"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Consecutive label separators within SubDomain"}, 498 | {urlParams: URLParams{URL: "https://\uff0eexample.com"}, expected: ExtractResult{Scheme: "https://"}, err: errs[8], description: "Hostname starting with label separator"}, 499 | {urlParams: URLParams{URL: "//server.example.com/path"}, expected: ExtractResult{Scheme: "//", SubDomain: "server", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/path", HostType: HostName}, description: "Double-slash only Scheme with subdomain"}, 500 | {urlParams: URLParams{URL: "http://temasek"}, expected: ExtractResult{Scheme: "http://", Suffix: "temasek"}, err: errs[9], description: "Basic URL with eTLD only"}, 501 | {urlParams: URLParams{URL: "http://temasek.this-tld-cannot-be-real"}, expected: ExtractResult{Scheme: "http://", SubDomain: "temasek", Domain: "this-tld-cannot-be-real", HostType: HostName}, description: "Basic URL with bad eTLD"}, 502 | {urlParams: URLParams{URL: "http://temasek.temasek.this-tld-cannot-be-real"}, expected: ExtractResult{Scheme: "http://", SubDomain: 
"temasek.temasek", Domain: "this-tld-cannot-be-real", HostType: HostName}, description: "Basic URL with subdomain and bad eTLD"}, 503 | {urlParams: URLParams{URL: "http://127.0.0.256"}, expected: ExtractResult{Scheme: "http://", SubDomain: "127.0.0", Domain: "256", HostType: HostName}, description: "Basic IPv4 Address URL with bad IP"}, 504 | {urlParams: URLParams{URL: "http://127\uff0e0\u30020\uff61256:5000"}, 505 | expected: ExtractResult{Scheme: "http://", SubDomain: "127\uff0e0\u30020", Port: "5000", 506 | Domain: "256", HostType: HostName}, description: "Basic IPv4 Address with Scheme and Port and bad IP | Internationalised label separators"}, 507 | {urlParams: URLParams{URL: "http://192.168.01.1:5000"}, 508 | expected: ExtractResult{Scheme: "http://", SubDomain: "192.168.01", Domain: "1", Port: "5000", HostType: HostName}, 509 | description: "Basic IPv4 Address with Scheme and Port and bad IP | octet with leading zero"}, 510 | {urlParams: URLParams{URL: "http://a:b@xn--tub-1m9d15sfkkhsifsbqygyujjrw60.com"}, 511 | expected: ExtractResult{Scheme: "http://", UserInfo: "a:b"}, err: errors.New("idna: invalid label \"tub-1m9d15sfkkhsifsbqygyujjrw60\""), description: "Invalid punycode Domain"}, 512 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789:5000"}, 513 | expected: ExtractResult{Scheme: "http://"}, err: errs[3], 514 | description: "Basic IPv6 Address with Scheme and Port with no closing bracket"}, 515 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:::]:5000"}, 516 | expected: ExtractResult{Scheme: "http://"}, err: errs[4], 517 | description: "Basic IPv6 Address with Scheme and Port and bad IP | odd number of empty hextets"}, 518 | {urlParams: URLParams{URL: "http://[aBcD:ef01:2345:6789:aBcD:ef01:2345:fffffffffffffffff]:5000"}, 519 | expected: ExtractResult{Scheme: "http://"}, err: errs[4], 520 | description: "Basic IPv6 Address with Scheme and Port and bad IP | hextet too big"}, 521 | {urlParams: URLParams{URL: 
"http://[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e256\u30020\uff611]:5000"}, 522 | expected: ExtractResult{Scheme: "http://"}, err: errs[4], 523 | description: "Basic IPv6 Address + trailing bad IPv4 address with Scheme and Port | Internationalised label separators"}, 524 | {urlParams: URLParams{URL: "http://[::aBcD:ef01:2345:6789:aBcD:ef01:2345:127.255.0.1]:5000"}, 525 | expected: ExtractResult{Scheme: "http://"}, err: errs[4], 526 | description: "Malformed IPv6 Address with leading ellipsis and extra 16-bit chunk + trailing IPv4 address with Scheme and Port"}, 527 | {urlParams: URLParams{URL: "[1::1::1:1:1:1:1:1]"}, 528 | expected: ExtractResult{}, err: errs[4], 529 | description: "Malformed IPv6 Address with 2 consecutive double-colon"}, 530 | {urlParams: URLParams{URL: "[1:1:1:1:1:1:1:1:::]"}, 531 | expected: ExtractResult{}, err: errs[4], 532 | description: "Malformed IPv6 Address with trailing triple colon"}, 533 | {urlParams: URLParams{URL: "http://["}, 534 | expected: ExtractResult{Scheme: "http://"}, err: errs[3], 535 | description: "Single opening square bracket"}, 536 | {urlParams: URLParams{URL: "http://a["}, 537 | expected: ExtractResult{Scheme: "http://"}, err: errs[0], 538 | description: "Single opening square bracket after alphabet"}, 539 | {urlParams: URLParams{URL: "http://]"}, 540 | expected: ExtractResult{Scheme: "http://"}, err: errs[1], 541 | description: "Single closing square bracket"}, 542 | {urlParams: URLParams{URL: "http://a]"}, 543 | expected: ExtractResult{Scheme: "http://"}, err: errs[1], 544 | description: "Single closing square bracket after alphabet"}, 545 | {urlParams: URLParams{URL: "http://]["}, 546 | expected: ExtractResult{Scheme: "http://"}, err: errs[1], 547 | description: "closing square bracket before opening square bracket"}, 548 | {urlParams: URLParams{URL: "http://a]["}, 549 | expected: ExtractResult{Scheme: "http://"}, err: errs[1], 550 | description: "closing square bracket before opening square bracket after 
alphabet"}, 551 | {urlParams: URLParams{URL: "http://[]"}, 552 | expected: ExtractResult{Scheme: "http://"}, err: errs[4], 553 | description: "Empty pair of square brackets"}, 554 | {urlParams: URLParams{URL: "http://a[]"}, 555 | expected: ExtractResult{Scheme: "http://"}, err: errs[0], 556 | description: "Empty pair of square brackets after alphabet"}, 557 | {urlParams: URLParams{URL: "http://a[127.0.0.1]"}, 558 | expected: ExtractResult{Scheme: "http://"}, err: errs[0], 559 | description: "IPv4 in square brackets after alphabet"}, 560 | {urlParams: URLParams{URL: "http://a[aBcD:ef01:2345:6789:aBcD:ef01:127\uff0e255\u30020\uff611]"}, 561 | expected: ExtractResult{Scheme: "http://"}, err: errs[0], 562 | description: "IPv6 in square brackets after alphabet"}, 563 | {urlParams: URLParams{URL: "http://[127.0.0.1]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "IPv4 in square brackets"}, 564 | {urlParams: URLParams{URL: "http://%78n--0.example.com"}, expected: ExtractResult{Scheme: "http://"}, err: errors.New(`idna: invalid label "0"`), description: "Bad percentage encoding"}, 565 | {urlParams: URLParams{URL: "http://%78n--0.example.com", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://"}, err: errs[9], description: "Bad percentage encoding"}, 566 | 567 | // Test cases from net/ip-test.go 568 | {urlParams: URLParams{URL: "http://[-0.0.0.0]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 569 | {urlParams: URLParams{URL: "http://[0.-1.0.0]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 570 | {urlParams: URLParams{URL: "http://[0.0.-2.0]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 571 | {urlParams: URLParams{URL: "http://[0.0.0.-3]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 572 | {urlParams: URLParams{URL: "http://[127.0.0.256]"}, 
expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 573 | {urlParams: URLParams{URL: "http://[abc]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 574 | {urlParams: URLParams{URL: "http://[123:]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 575 | {urlParams: URLParams{URL: "http://[fe80::1%lo0]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 576 | {urlParams: URLParams{URL: "http://[fe80::1%911]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 577 | {urlParams: URLParams{URL: "http://[a1:a2:a3:a4::b1:b2:b3:b4]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 578 | {urlParams: URLParams{URL: "http://[127.001.002.003]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 579 | {urlParams: URLParams{URL: "http://[::ffff:127.001.002.003]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 580 | {urlParams: URLParams{URL: "http://[123.000.000.000]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 581 | {urlParams: URLParams{URL: "http://[1.2..4]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 582 | {urlParams: URLParams{URL: "http://[0123.0.0.1]"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "net/ip-test.go"}, 583 | {urlParams: URLParams{URL: "git+ssh://www.!example.com/"}, expected: ExtractResult{Scheme: "git+ssh://", Path: "/"}, err: errs[8], description: "Full git+ssh URL with bad Domain"}, 584 | } 585 | var internationalTLDTests = []extractTest{ 586 | {urlParams: URLParams{URL: "https://𝖊𝖝𝖆𝖒𝖕𝖑𝖊.𝖈𝖔𝖒.𝖘𝖌", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "com.sg", 
RegisteredDomain: "example.com.sg", HostType: HostName}}, 587 | {urlParams: URLParams{URL: "http://example.敎育.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--lcvr32d.hk", RegisteredDomain: "example.xn--lcvr32d.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed international eTLD (result in punycode)"}, 588 | {urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full international eTLD (result in punycode)"}, 589 | {urlParams: URLParams{URL: "http://example.敎育.hk/地图/A/b/C?编号=42"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "敎育.hk", RegisteredDomain: "example.敎育.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed international eTLD (result in unicode)"}, 590 | {urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "обр.срб", RegisteredDomain: "example.обр.срб", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full international eTLD (result in unicode)"}, 591 | {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with mixed punycode international eTLD (result in punycode)"}, 592 | {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", 
RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "/地图/A/b/C?编号=42", HostType: HostName}, description: "Basic URL with full punycode international eTLD (result in punycode)"}, 593 | {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", HostType: HostName}, description: "Basic URL with mixed punycode international eTLD (no further conversion to punycode)"}, 594 | {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Basic URL with full punycode international eTLD (no further conversion to punycode)"}, 595 | {urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac"}, expected: ExtractResult{Scheme: "http://", Domain: "xN--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xN--h1alffa9f.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Mixed case Punycode Domain with full punycode international eTLD (no further conversion to punycode) See: https://github.com/golang/go/issues/48778"}, 596 | {urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac", ConvertURLToPunyCode: true}, expected: ExtractResult{Scheme: "http://", Domain: "xn--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xn--h1alffa9f.xn--90azh.xn--90a3ac", HostType: HostName}, description: "Mixed case Punycode Domain with full punycode international eTLD (with further conversion to punycode)"}, 597 | } 598 | var domainOnlySingleTLDTests = []extractTest{ 599 | {urlParams: URLParams{URL: "https://example.ai/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "ai", RegisteredDomain: "example.ai", Path: "/en", HostType: HostName}, description: "Domain only + ai"}, 600 | {urlParams: URLParams{URL: 
"https://example.co/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "co", RegisteredDomain: "example.co", Path: "/en", HostType: HostName}, description: "Domain only + co"}, 601 | {urlParams: URLParams{URL: "https://example.sg/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "sg", RegisteredDomain: "example.sg", Path: "/en", HostType: HostName}, description: "Domain only + sg"}, 602 | {urlParams: URLParams{URL: "https://example.tv/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "tv", RegisteredDomain: "example.tv", Path: "/en", HostType: HostName}, description: "Domain only + tv"}, 603 | {urlParams: URLParams{URL: "https://example.%63om/en"}, expected: ExtractResult{Scheme: "https://", Domain: "example", Suffix: "%63om", RegisteredDomain: "example.%63om", Path: "/en", HostType: HostName}, description: "Domain only + %63om"}, 604 | {urlParams: URLParams{URL: "https://example.za/en"}, expected: ExtractResult{Scheme: "https://", SubDomain: "example", Domain: "za", Path: "/en", HostType: HostName}, description: "Domain only + za | za has no 1st-level TLD"}, 605 | } 606 | var pathTests = []extractTest{ 607 | {urlParams: URLParams{URL: "http://www.example.com/this:that"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/this:that", HostType: HostName}, description: "Colon in Path"}, 608 | {urlParams: URLParams{URL: "http://example.com/oid/[order_id]"}, expected: ExtractResult{Scheme: "http://", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "/oid/[order_id]", HostType: HostName}, description: "Square brackets in Path"}, 609 | } 610 | var wildcardTests = []extractTest{ 611 | {urlParams: URLParams{URL: "https://asdf.wwe.ck"}, 612 | expected: ExtractResult{ 613 | Scheme: "https://", Domain: "asdf", Suffix: "wwe.ck", 614 | RegisteredDomain: "asdf.wwe.ck", HostType: HostName}, 615 
| description: "Wildcard rule | *.ck"}, 616 | {urlParams: URLParams{URL: "https://asdf.www.ck"}, 617 | expected: ExtractResult{ 618 | Scheme: "https://", SubDomain: "asdf", Domain: "www", Suffix: "ck", 619 | RegisteredDomain: "www.ck", HostType: HostName}, 620 | description: "Wildcard exception rule | !www.ck"}, 621 | {urlParams: URLParams{URL: "https://brb.i.am.going.to.be.a.fk"}, 622 | expected: ExtractResult{ 623 | Scheme: "https://", SubDomain: "brb.i.am.going.to", Domain: "be", Suffix: "a.fk", 624 | RegisteredDomain: "be.a.fk", HostType: HostName, 625 | }, description: "Wildcard rule | *.fk", 626 | }, 627 | } 628 | var lookoutTests = []extractTest{ // some tests from lookout.net 629 | {urlParams: URLParams{URL: "http://GOO\u200b\u2060\ufeffgoo.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"}, 630 | {urlParams: URLParams{URL: "http://\u0646\u0627\u0645\u0647\u200c\u0627\u06cc.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"}, 631 | {urlParams: URLParams{URL: "http://\u0000\u0dc1\u0dca\u200d\u0dbb\u0dd3.com.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"}, 632 | {urlParams: URLParams{URL: "http://\u0dc1\u0dca\u200d\u0dbb\u0dd3.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"}, 633 | {urlParams: URLParams{URL: "http://look\ufeffout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"}, 634 | {urlParams: URLParams{URL: "http://www\u00A0.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"}, 635 | {urlParams: URLParams{URL: "http://\u1680.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid chars"}, 636 | {urlParams: URLParams{URL: 
"%68%74%74%70%3a%2f%2f%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d%2f.urltest.lookout.net"}, expected: ExtractResult{ 637 | SubDomain: "%68%74%74%70%3a%2f%2f%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d%2f.urltest", Domain: "lookout", 638 | Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 639 | {urlParams: URLParams{URL: "http%3a%2f%2f%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d%2f.urltest.lookout.net"}, expected: ExtractResult{ 640 | SubDomain: "http%3a%2f%2f%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d%2f.urltest", Domain: "lookout", 641 | Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 642 | {urlParams: URLParams{URL: "http://%25.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", 643 | SubDomain: "%25.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 644 | {urlParams: URLParams{URL: "http://%25DOMAIN:foobar@urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", UserInfo: "%25DOMAIN:foobar", 645 | SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded UserInfo"}, 646 | {urlParams: URLParams{URL: "http://%30%78%63%30%2e%30%32%35%30.01%2e.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%30%78%63%30%2e%30%32%35%30.01%2e.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 647 | {urlParams: URLParams{URL: "http://%30%78%63%30%2e%30%32%35%30.01.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%30%78%63%30%2e%30%32%35%30.01.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 648 | {urlParams: 
URLParams{URL: "http://%3g%78%63%30%2e%30%32%35%30%2E.01.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errors.New(`invalid URL escape "%3g"`), description: "Invalid Percentage encoded SubDomain"}, 649 | {urlParams: URLParams{URL: "http://%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d.urltest.lookout.net%3a%38%30"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%77%77%77%2e%65%78%61%6d%70%6c%65%2e%63%6f%6d.urltest.lookout", Domain: "net%3a%38%30", HostType: HostName}, description: "Percentage encoded SubDomain and Domain"}, 650 | {urlParams: URLParams{URL: "http://%A1%C1.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%A1%C1.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 651 | {urlParams: URLParams{URL: "http://%E4%BD%A0%E5%A5%BD\u4f60\u597d.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%E4%BD%A0%E5%A5%BD\u4f60\u597d.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded and Unicode SubDomain"}, 652 | {urlParams: URLParams{URL: "http://%ef%b7%90zyx.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%ef%b7%90zyx.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 653 | {urlParams: URLParams{URL: "http://%ef%bc%85%ef%bc%90%ef%bc%90.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%ef%bc%85%ef%bc%90%ef%bc%90.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 654 | {urlParams: URLParams{URL: "http://%ef%bc%85%ef%bc%94%ef%bc%91.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "%ef%bc%85%ef%bc%94%ef%bc%91.urltest", Domain: "lookout", Suffix: 
"net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 655 | {urlParams: URLParams{URL: "http://%zz%66%a.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errors.New(`invalid URL escape "%zz"`), description: "Bad Percentage encoded SubDomain"}, 656 | {urlParams: URLParams{URL: "http://-foo.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Start with dash"}, 657 | {urlParams: URLParams{URL: "http:////////user:@urltest.lookout.net?foo"}, expected: ExtractResult{Scheme: "http:////////", UserInfo: "user:", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "?foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Multiple slashes in Scheme"}, 658 | {urlParams: URLParams{URL: "http://192.168.0.1 hello.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Space in SubDomain"}, 659 | {urlParams: URLParams{URL: "http://192.168.0.257.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "192.168.0.257.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "IPv4 Address in SubDomain"}, 660 | {urlParams: URLParams{URL: "http://B\u00fccher.de.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "B\u00fccher.de.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 661 | {urlParams: URLParams{URL: "http://GOO \u3000goo.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Space in SubDomain"}, 662 | {urlParams: URLParams{URL: "http://Goo%20 goo%7C|.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Space in SubDomain"}, 663 | {urlParams: URLParams{URL: "http://[google.com.].urltest.lookout.net"}, expected: ExtractResult{Scheme: 
"http://"}, err: errs[4], description: "Square Brackets in SubDomain"}, 664 | {urlParams: URLParams{URL: "http://[urltest.lookout.net]/"}, expected: ExtractResult{Scheme: "http://"}, err: errs[4], description: "Square brackets but not IPv6"}, 665 | {urlParams: URLParams{URL: "http://\u001f.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Control Character in SubDomain"}, 666 | {urlParams: URLParams{URL: "http://\u0378.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u0378.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode U+0378"}, 667 | {urlParams: URLParams{URL: "http://\u03b2\u03cc\u03bb\u03bf\u03c2.com.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u03b2\u03cc\u03bb\u03bf\u03c2.com.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 668 | {urlParams: URLParams{URL: "http://\u03b2\u03cc\u03bb\u03bf\u03c2.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u03b2\u03cc\u03bb\u03bf\u03c2.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 669 | {urlParams: URLParams{URL: "http://\u0442(.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Parenthesis in SubDomain"}, 670 | {urlParams: URLParams{URL: "http://\u04c0.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 671 | {urlParams: URLParams{URL: "http://\u06dd.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 672 | {urlParams: URLParams{URL: "http://\u09dc.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u09dc.urltest", Domain: "lookout", 
Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 673 | {urlParams: URLParams{URL: "http://\u15ef\u15ef\u15ef.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u15ef\u15ef\u15ef.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 674 | {urlParams: URLParams{URL: "http://\u180e.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 675 | {urlParams: URLParams{URL: "http://\u1e9e.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u1e9e.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 676 | {urlParams: URLParams{URL: "http://\u2183.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 677 | {urlParams: URLParams{URL: "http://\u2665.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u2665.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 678 | {urlParams: URLParams{URL: "http://\u4f60\u597d\u4f60\u597d.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\u4f60\u597d\u4f60\u597d.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 679 | {urlParams: URLParams{URL: "http://\ufdd0zyx.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 680 | {urlParams: URLParams{URL: "http://\uff05\uff10\uff10.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 681 | {urlParams: URLParams{URL: 
"http://\uff05\uff14\uff11.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 682 | {urlParams: URLParams{URL: "http://\uff10\uff38\uff43\uff10\uff0e\uff10\uff12\uff15\uff10\uff0e\uff10\uff11.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\uff10\uff38\uff43\uff10\uff0e\uff10\uff12\uff15\uff10\uff0e\uff10\uff11.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 683 | {urlParams: URLParams{URL: "http://\uff27\uff4f.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "\uff27\uff4f.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 684 | {urlParams: URLParams{URL: "http://ab--cd.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "ab--cd.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Bad double-hyphen in SubDomain (still accepted)"}, 685 | {urlParams: URLParams{URL: "http://fa\u00df.de.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "fa\u00df.de.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 686 | {urlParams: URLParams{URL: "http://foo-.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "foo-.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Bad SubDomain label end with dash (still accepted)"}, 687 | {urlParams: URLParams{URL: "http://foo\u0300.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "foo\u0300.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 688 | {urlParams: 
URLParams{URL: "http://gOoGle.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "gOoGle.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Mixed case letters"}, 689 | {urlParams: URLParams{URL: "http://hello%00.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "hello%00.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded SubDomain"}, 690 | {urlParams: URLParams{URL: "http://look\u0341out.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u0341out.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 691 | {urlParams: URLParams{URL: "http://look\u034fout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u034fout.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 692 | {urlParams: URLParams{URL: "http://look\u05beout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u05beout.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 693 | {urlParams: URLParams{URL: "http://look\u202eout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 694 | {urlParams: URLParams{URL: "http://look\u2060.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u2060.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 695 | {urlParams: URLParams{URL: "http://look\u206bout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "look\u206bout.urltest", Domain: 
"lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 696 | {urlParams: URLParams{URL: "http://look\u2ff0out.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 697 | {urlParams: URLParams{URL: "http://look\ufffaout.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Unicode in SubDomain"}, 698 | {urlParams: URLParams{URL: "http://uRLTest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "uRLTest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Mixed case letters"}, 699 | {urlParams: URLParams{URL: "http://urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Simple SubDomain+Domain"}, 700 | {urlParams: URLParams{URL: "http://urltest.lookout.net/%20foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%20foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 701 | {urlParams: URLParams{URL: "http://urltest.lookout.net/%3A%3a%3C%3c"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%3A%3a%3C%3c", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 702 | {urlParams: URLParams{URL: "http://urltest.lookout.net/%7Ffp3%3Eju%3Dduvgw%3Dd"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%7Ffp3%3Eju%3Dduvgw%3Dd", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 703 | {urlParams: URLParams{URL: "http://urltest.lookout.net/%A1%C1/?foo=%EF%BD%81"}, expected: ExtractResult{Scheme: "http://", 
SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%A1%C1/?foo=%EF%BD%81", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 704 | {urlParams: URLParams{URL: "http://urltest.lookout.net/%A1%C1/?foo=???"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%A1%C1/?foo=???", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 705 | {urlParams: URLParams{URL: "http://urltest.lookout.net/%EF%BD%81/?foo=%A1%C1"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/%EF%BD%81/?foo=%A1%C1", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 706 | {urlParams: URLParams{URL: "http://urltest.lookout.net/(%28:%3A%29)"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/(%28:%3A%29)", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Parentheses in Path"}, 707 | {urlParams: URLParams{URL: "http://urltest.lookout.net/././foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/././foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 708 | {urlParams: URLParams{URL: "http://urltest.lookout.net/./.foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/./.foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 709 | {urlParams: URLParams{URL: "http://urltest.lookout.net////../.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "////../..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 710 | {urlParams: URLParams{URL: "http://urltest.lookout.net/?%02hello%7f bye"}, expected: 
ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?%02hello%7f bye", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Space in Path"}, 711 | {urlParams: URLParams{URL: "http://urltest.lookout.net/?%40%41123"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?%40%41123", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 712 | {urlParams: URLParams{URL: "http://urltest.lookout.net/???/?foo=%A1%C1"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/???/?foo=%A1%C1", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Consecutive question marks"}, 713 | {urlParams: URLParams{URL: "http://urltest.lookout.net/?D%C3%BCrst"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?D%C3%BCrst", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 714 | {urlParams: URLParams{URL: "http://urltest.lookout.net/?D%FCrst"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?D%FCrst", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 715 | {urlParams: URLParams{URL: "http://urltest.lookout.net/?as?df"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?as?df", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Multiple question marks"}, 716 | {urlParams: URLParams{URL: "http://urltest.lookout.net/?foo=bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?foo=bar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Path with Query Parameters"}, 717 | {urlParams: URLParams{URL: 
"http://urltest.lookout.net/?q=<asdf>"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?q=<asdf>", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Path with Query Parameters"}, 718 | {urlParams: URLParams{URL: "http://urltest.lookout.net/?q=\"asdf\""}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?q=\"asdf\"", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Path with inverted commas"}, 719 | {urlParams: URLParams{URL: "http://urltest.lookout.net/?q=\u4f60\u597d"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/?q=\u4f60\u597d", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"}, 720 | {urlParams: URLParams{URL: "http://urltest.lookout.net/@asdf%40"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/@asdf%40", RegisteredDomain: "lookout.net", HostType: HostName}, description: "@ in Path"}, 721 | {urlParams: URLParams{URL: "http://urltest.lookout.net/D%C3%BCrst"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/D%C3%BCrst", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 722 | {urlParams: URLParams{URL: "http://urltest.lookout.net/D%FCrst"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/D%FCrst", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 723 | {urlParams: URLParams{URL: "http://urltest.lookout.net/\u2025/foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\u2025/foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"}, 724 | {urlParams: 
URLParams{URL: "http://urltest.lookout.net/\u202e/foo/\u202d/bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\u202e/foo/\u202d/bar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"}, 725 | {urlParams: URLParams{URL: "http://urltest.lookout.net/\u4f60\u597d\u4f60\u597d"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\u4f60\u597d\u4f60\u597d", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"}, 726 | {urlParams: URLParams{URL: "http://urltest.lookout.net/\ufdd0zyx"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\ufdd0zyx", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"}, 727 | {urlParams: URLParams{URL: "http://urltest.lookout.net/\ufeff/foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/\ufeff/foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"}, 728 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Simple SubDomain+Domain+Path"}, 729 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo bar/? foo = bar # foo"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo bar/? 
foo = bar # foo", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Space in Path"}, 730 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Trailing percentage sign in Path"}, 731 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%00%51"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%00%51", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 732 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 733 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2Ehtml"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2Ehtml", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 734 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2\u00c2\u00a9zbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2\u00c2\u00a9zbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in Path"}, 735 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2fbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%2fbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 736 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%2zbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", 
Suffix: "net", Path: "/foo%2zbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 737 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%3fbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%3fbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 738 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo%41%7a"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo%41%7a", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 739 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/%2e"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/%2e", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 740 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/%2e%2"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/%2e%2", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 741 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/%2e./%2e%2e/.%2e/%2e.bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/%2e./%2e%2e/.%2e/%2e.bar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Percentage encoded Path"}, 742 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/.", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 743 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/../../.."}, expected: ExtractResult{Scheme: "http://", 
SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/../../..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 744 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/../../../ton"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/../../../ton", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 745 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/..bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/..bar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 746 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/./"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/./", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 747 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar/.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar/..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 748 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar/../"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar/../", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 749 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar/../ton"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar/../ton", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 750 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar/../ton/../../a"}, expected: ExtractResult{Scheme: "http://", SubDomain: 
"urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar/../ton/../../a", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 751 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar//.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar//..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path, Multiple slashes"}, 752 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo/bar//../.."}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo/bar//../..", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Dots in Path"}, 753 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo?bar=baz#"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo?bar=baz#", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Query Parameters in Path"}, 754 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo\\tbar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo\\tbar", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Backslash in Path"}, 755 | {urlParams: URLParams{URL: "http://urltest.lookout.net/foo\t\ufffd%91"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Path: "/foo\t\ufffd%91", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Tab in Path"}, 756 | {urlParams: URLParams{URL: "http://urltest.lookout.net:80/"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", Port: "80", RegisteredDomain: "lookout.net", Path: "/", HostType: HostName}, description: "Port"}, 757 | {urlParams: URLParams{URL: "http://urltest.lookout.net::80::443/"}, expected: ExtractResult{Scheme: "http://"}, err: 
errs[10], description: "Bad Port"}, 758 | {urlParams: URLParams{URL: "http://urltest.lookout.net::==80::==443::/"}, expected: ExtractResult{Scheme: "http://"}, err: errs[10], description: "Bad Port"}, 759 | {urlParams: URLParams{URL: "http://urltest.lookout.net\\\\foo\\\\bar"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", Path: "\\\\foo\\\\bar", HostType: HostName}, description: "Multiple backslashes in Path"}, 760 | {urlParams: URLParams{URL: "http://urltest.lookout.net\u2a7480/"}, expected: ExtractResult{Scheme: "http://", SubDomain: "urltest.lookout", Domain: "net\u2a7480", Path: "/", HostType: HostName}, description: "Unicode in Domain"}, 761 | {urlParams: URLParams{URL: "http://urltest.lookout.net\uff0ffoo/"}, expected: ExtractResult{Scheme: "http://", Path: "/"}, err: errs[8], description: "Unicode in Domain"}, 762 | {urlParams: URLParams{URL: "http://www.foo\u3002bar.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www.foo\u3002bar.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 763 | {urlParams: URLParams{URL: "http://www.loo\u0138out.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www.loo\u0138out.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 764 | {urlParams: URLParams{URL: "http://www.lookout.\u0441\u043e\u043c.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www.lookout.\u0441\u043e\u043c.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 765 | {urlParams: URLParams{URL: "http://www.lookout.net\uff1a80.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Reject full-width colon"}, 
766 | {urlParams: URLParams{URL: "http://www.lookout\u2027net.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://", SubDomain: "www.lookout\u2027net.urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Unicode in SubDomain"}, 767 | {urlParams: URLParams{URL: "http://www\u2025urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errs[8], description: "Invalid Character"}, 768 | {urlParams: URLParams{URL: "http://xn--0.urltest.lookout.net"}, expected: ExtractResult{Scheme: "http://"}, err: errors.New("idna: invalid label \"0\""), description: "Invalid Punycode"}, 769 | {urlParams: URLParams{URL: "http:\\\\\\\\urltest.lookout.net\\\\foo"}, expected: ExtractResult{Scheme: "http:\\\\\\\\", SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", Path: "\\\\foo", HostType: HostName}, description: "Multiple forward slashes in Scheme"}, 770 | {urlParams: URLParams{URL: "http:///\\/\\/\\/\\/urltest.lookout.net"}, expected: ExtractResult{Scheme: "http:///\\/\\/\\/\\/", SubDomain: "urltest", Domain: "lookout", Suffix: "net", RegisteredDomain: "lookout.net", HostType: HostName}, description: "Multiple mixed slashes in Scheme"}, 771 | } 772 | 773 | func TestExtract(t *testing.T) { 774 | testPSLFilePath, ok := getTestPSLFilePath() 775 | if !ok { 776 | t.Errorf("Cannot get path to current module file") 777 | } 778 | extractorWithPrivateSuffix, _ := New(SuffixListParams{ 779 | CacheFilePath: testPSLFilePath, 780 | IncludePrivateSuffix: true, 781 | }) 782 | extractorWithoutPrivateSuffix, _ := New(SuffixListParams{ 783 | CacheFilePath: testPSLFilePath, 784 | IncludePrivateSuffix: false, 785 | }) 786 | for _, testCollection := range []([]extractTest){ 787 | schemeTests, 788 | noSchemeTests, 789 | userInfoTests, 790 | ipv4Tests, 791 | ipv6Tests, 792 | ignoreSubDomainsTests, 793 | privateSuffixTests, 794 | periodsAndWhiteSpacesTests, 795 | invalidTests, 796 | 
internationalTLDTests, 797 | domainOnlySingleTLDTests, 798 | pathTests, 799 | wildcardTests, 800 | lookoutTests, 801 | } { 802 | for _, test := range testCollection { 803 | var extractor *FastTLD 804 | if test.includePrivateSuffix { 805 | extractor = extractorWithPrivateSuffix 806 | } else { 807 | extractor = extractorWithoutPrivateSuffix 808 | } 809 | res, err := extractor.Extract(test.urlParams) 810 | 811 | if output := reflect.DeepEqual(res, 812 | test.expected); !output { 813 | t.Errorf("%+q | Output %q not equal to expected output %q | %q", 814 | test.urlParams.URL, res, test.expected, test.description) 815 | } 816 | 817 | if !(err == nil && test.err == nil) && 818 | ((err == nil && test.err != nil) || 819 | (err != nil && test.err == nil) || 820 | !reflect.DeepEqual(err.Error(), 821 | test.err.Error())) { 822 | t.Errorf("%+q | Error %v not equal to expected error %v | %q", 823 | test.urlParams.URL, err, test.err, test.description) 824 | } 825 | } 826 | } 827 | } 828 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/elliotwutingfeng/go-fasttld 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.24.3 6 | 7 | require ( 8 | github.com/fatih/color v1.18.0 9 | github.com/joeguo/tldextract v0.0.0-20220507100122-d83daa6adef8 10 | github.com/jpillora/go-tld v1.2.1 11 | github.com/karlseguin/intset v1.0.3-0.20221130142345-37ee0d7df651 12 | github.com/mjd2021usa/tldextract v0.9.2 13 | github.com/spf13/afero v1.14.0 14 | github.com/spf13/cobra v1.9.1 15 | github.com/tidwall/hashmap v1.8.1 16 | golang.org/x/net v0.40.0 17 | ) 18 | 19 | require ( 20 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 21 | github.com/klauspost/cpuid/v2 v2.2.9 // indirect 22 | github.com/mattn/go-colorable v0.1.14 // indirect 23 | github.com/mattn/go-isatty v0.0.20 // indirect 24 | github.com/spf13/pflag v1.0.6 // indirect 25 | github.com/zeebo/xxh3 
v1.0.2 // indirect 26 | golang.org/x/sys v0.33.0 // indirect 27 | golang.org/x/text v0.25.0 // indirect 28 | ) 29 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= 2 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 3 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= 5 | github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= 6 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 7 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 8 | github.com/joeguo/tldextract v0.0.0-20220507100122-d83daa6adef8 h1:Ig0ESdy6JtHI17vsb7L+UlUFpoZctKfvBZplcILeL6g= 9 | github.com/joeguo/tldextract v0.0.0-20220507100122-d83daa6adef8/go.mod h1:oGfutRjaB95239mjFVwofaOPTwuS3vb71ZLIGCEb36g= 10 | github.com/jpillora/go-tld v1.2.1 h1:kDKOkmXLlskqjcvNs7w5XHLep7c8WM7Xd4HQjxllVMk= 11 | github.com/jpillora/go-tld v1.2.1/go.mod h1:plzIl7xr5UWKGy7R+giuv+L/nOjrPjsoWxy/ST9OBUk= 12 | github.com/karlseguin/intset v1.0.3-0.20221130142345-37ee0d7df651 h1:bTfsnv9ZwdVc7mPWBEhd+F5pBeJ4P4WYVxaPuoZwmPE= 13 | github.com/karlseguin/intset v1.0.3-0.20221130142345-37ee0d7df651/go.mod h1:hJ3siwEnJbQ92zdVj7Q2OyyMrMZ7LZAIRYDZr0IAAqc= 14 | github.com/klauspost/cpuid/v2 v2.2.9 h1:66ze0taIn2H33fBvCkXuv9BmCwDfafmiIVpKV9kKGuY= 15 | github.com/klauspost/cpuid/v2 v2.2.9/go.mod h1:rqkxqrZ1EhYM9G+hXH7YdowN5R5RGN6NK4QwQ3WMXF8= 16 | github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= 17 | github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= 18 | 
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= 19 | github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 20 | github.com/mjd2021usa/tldextract v0.9.2 h1:Tkz+q0q4t4NvScACm3+bXZJY9lRlFeClopw0AkhAbA4= 21 | github.com/mjd2021usa/tldextract v0.9.2/go.mod h1:GB3fhxYasOChxf3Oo5Or6H4uzl8dhEx3wA7CQf8i4aI= 22 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 23 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 24 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 25 | github.com/spf13/afero v1.14.0 h1:9tH6MapGnn/j0eb0yIXiLjERO8RB6xIVZRDCX7PtqWA= 26 | github.com/spf13/afero v1.14.0/go.mod h1:acJQ8t0ohCGuMN3O+Pv0V0hgMxNYDlvdk+VTfyZmbYo= 27 | github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= 28 | github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= 29 | github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= 30 | github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 31 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 32 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 33 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 34 | github.com/tidwall/hashmap v1.8.1 h1:hXNzBfSJ2Jwvt0lbkWD59O/r3OfatSIcbuWT0VKEVns= 35 | github.com/tidwall/hashmap v1.8.1/go.mod h1:v+0qJrJn7l+l2dB8+fAFpC62p2G0SMP2Teu8ejkebg8= 36 | github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= 37 | github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= 38 | github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= 39 | github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= 40 | 
golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= 41 | golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= 42 | golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds= 43 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 44 | golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 45 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 46 | golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= 47 | golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 48 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 49 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 50 | golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= 51 | golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= 52 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 53 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 54 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 55 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 56 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 57 | -------------------------------------------------------------------------------- /net.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import "unicode/utf8" 4 | 5 | // IP address lengths (bytes). 
6 | const ( 7 | iPv4len int = 4 8 | iPv6len int = 16 9 | lenDiff = iPv6len - iPv4len 10 | ) 11 | 12 | // Bigger than we need, not too big to worry about overflow 13 | const big int = 0xFFFFFF 14 | 15 | // Decimal to integer. 16 | // Returns number, characters consumed, success. 17 | func dtoi(s string) (n int, i int, ok bool) { 18 | n = 0 19 | for i = 0; i < len(s) && '0' <= s[i] && s[i] <= '9'; i++ { 20 | n = n*10 + int(s[i]-'0') 21 | if n >= big { 22 | return big, i, false 23 | } 24 | } 25 | if i == 0 { 26 | return 0, 0, false 27 | } 28 | return n, i, true 29 | } 30 | 31 | // Hexadecimal to integer. 32 | // Returns number, characters consumed, success. 33 | func xtoi(s string) (n int, i int, ok bool) { 34 | n = 0 35 | for i = 0; i < len(s); i++ { 36 | if '0' <= s[i] && s[i] <= '9' { 37 | n *= 16 38 | n += int(s[i] - '0') 39 | } else if 'a' <= s[i] && s[i] <= 'f' { 40 | n *= 16 41 | n += int(s[i]-'a') + 10 42 | } else if 'A' <= s[i] && s[i] <= 'F' { 43 | n *= 16 44 | n += int(s[i]-'A') + 10 45 | } else { 46 | break 47 | } 48 | if n >= big { 49 | return 0, i, false 50 | } 51 | } 52 | if i == 0 { 53 | return 0, i, false 54 | } 55 | return n, i, true 56 | } 57 | 58 | // isIPv4 returns true if s is a literal IPv4 address 59 | // 60 | // trailing label separators are accepted 61 | func isIPv4(s string) bool { 62 | s = fastTrim(s, labelSeparatorsRuneSet, trimRight) 63 | for i := 0; i < iPv4len; i++ { 64 | if len(s) == 0 { 65 | // Missing octets. 66 | return false 67 | } 68 | if i > 0 { 69 | r, size := utf8.DecodeRuneInString(s) 70 | if !labelSeparatorsRuneSet.Exists(r) { 71 | return false 72 | } 73 | s = s[size:] 74 | } 75 | n, c, ok := dtoi(s) 76 | if !ok || n > 0xFF { 77 | return false 78 | } 79 | if c > 1 && s[0] == '0' { 80 | // Reject non-zero components with leading zeroes. 81 | return false 82 | } 83 | s = s[c:] 84 | } 85 | return len(s) == 0 86 | } 87 | 88 | // isIPv6 returns true if s is a literal IPv6 address as described in RFC 4291 89 | // and RFC 5952. 
90 | func isIPv6(s string) bool { 91 | ellipsis := -1 // position of ellipsis in ip 92 | 93 | // Might have leading ellipsis 94 | if len(s) >= 2 && s[0] == ':' && s[1] == ':' { 95 | ellipsis = 0 96 | s = s[2:] 97 | // Might be only ellipsis 98 | if len(s) == 0 { 99 | return true 100 | } 101 | } 102 | 103 | // Loop, parsing hex numbers followed by colon. 104 | i := 0 105 | for i < iPv6len { 106 | // Hex number. 107 | n, c, ok := xtoi(s) 108 | if !ok || n > 0xFFFF { 109 | return false 110 | } 111 | 112 | // If followed by any separator in labelSeparators, might be in trailing IPv4. 113 | if c < len(s) && labelSeparatorsRuneSet.Exists([]rune(s[c:])[0]) { 114 | if ellipsis < 0 && i != lenDiff { 115 | // Not the right place. 116 | return false 117 | } 118 | if i > lenDiff { 119 | // Not enough room. 120 | return false 121 | } 122 | if !isIPv4(s) { 123 | return false 124 | } 125 | s = "" 126 | i += iPv4len 127 | break 128 | } 129 | 130 | // Save this 16-bit chunk. 131 | i += 2 132 | 133 | // Stop at end of string. 134 | s = s[c:] 135 | if len(s) == 0 { 136 | break 137 | } 138 | 139 | // Otherwise must be followed by colon and more. 140 | if s[0] != ':' || len(s) == 1 { 141 | return false 142 | } 143 | s = s[1:] 144 | 145 | // Look for ellipsis. 146 | if s[0] == ':' { 147 | if ellipsis >= 0 { // already have one 148 | return false 149 | } 150 | ellipsis = i 151 | s = s[1:] 152 | if len(s) == 0 { // can be at end 153 | break 154 | } 155 | } 156 | } 157 | 158 | // Must have used entire string. 159 | if len(s) != 0 { 160 | return false 161 | } 162 | 163 | // If didn't parse enough, expand ellipsis. 164 | if i < iPv6len { 165 | if ellipsis < 0 { 166 | return false 167 | } 168 | } else if ellipsis >= 0 { 169 | // Ellipsis must represent at least one 0 group. 
170 | return false 171 | } 172 | return true 173 | } 174 | -------------------------------------------------------------------------------- /net_test.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import "testing" 4 | 5 | type looksLikeIPAddressTest struct { 6 | maybeIPAddress string 7 | isIPAddress bool 8 | } 9 | 10 | var looksLikeIPv4AddressTests = []looksLikeIPAddressTest{ 11 | {maybeIPAddress: "", 12 | isIPAddress: false, 13 | }, 14 | {maybeIPAddress: " ", 15 | isIPAddress: false, 16 | }, 17 | {maybeIPAddress: "google.com", 18 | isIPAddress: false, 19 | }, 20 | {maybeIPAddress: "1google.com", 21 | isIPAddress: false, 22 | }, 23 | {maybeIPAddress: "127.0.0.1", 24 | isIPAddress: true, 25 | }, 26 | {maybeIPAddress: "127.0.0.256", 27 | isIPAddress: false, 28 | }, 29 | } 30 | 31 | var looksLikeIPv6AddressTests = []looksLikeIPAddressTest{ 32 | {maybeIPAddress: "", 33 | isIPAddress: false, 34 | }, 35 | {maybeIPAddress: " ", 36 | isIPAddress: false, 37 | }, 38 | {maybeIPAddress: "google.com", 39 | isIPAddress: false, 40 | }, 41 | {maybeIPAddress: "1google.com", 42 | isIPAddress: false, 43 | }, 44 | {maybeIPAddress: "aBcD:ef01:2345:6789:aBcD:ef01:2345:6789", 45 | isIPAddress: true, 46 | }, 47 | {maybeIPAddress: "gGgG:ef01:2345:6789:aBcD:ef01:2345:6789", 48 | isIPAddress: false, 49 | }, 50 | {maybeIPAddress: "aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1", 51 | isIPAddress: true, 52 | }, 53 | {maybeIPAddress: "aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.256", 54 | isIPAddress: false, 55 | }, 56 | } 57 | 58 | func TestIsIPv4(t *testing.T) { 59 | for _, test := range looksLikeIPv4AddressTests { 60 | isIPv4Address := isIPv4(test.maybeIPAddress) 61 | if isIPv4Address != test.isIPAddress { 62 | t.Errorf("Output %t not equal to expected %t", 63 | isIPv4Address, test.isIPAddress) 64 | } 65 | } 66 | } 67 | 68 | func TestIsIPv6(t *testing.T) { 69 | for _, test := range looksLikeIPv6AddressTests { 70 | isIPv6Address := 
isIPv6(test.maybeIPAddress) 71 | if isIPv6Address != test.isIPAddress { 72 | t.Errorf("Output %t not equal to expected %t", 73 | isIPv6Address, test.isIPAddress) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /print.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "github.com/fatih/color" 5 | ) 6 | 7 | // PrintRes pretty-prints URL components from ExtractResult 8 | func PrintRes(url string, res ExtractResult) { 9 | var leftAttrsFilled = []color.Attribute{color.FgHiYellow, color.Bold} 10 | var leftAttrsBlank = []color.Attribute{color.FgHiBlack} 11 | var rightAttrs = []color.Attribute{color.FgHiWhite} 12 | 13 | if len(url) != 0 { 14 | color.New(leftAttrsFilled...).Print(" url: ") 15 | } else { 16 | color.New(leftAttrsBlank...).Print(" url: ") 17 | } 18 | color.New(rightAttrs...).Println(url) 19 | 20 | if len(res.Scheme) != 0 { 21 | color.New(leftAttrsFilled...).Print(" scheme: ") 22 | } else { 23 | color.New(leftAttrsBlank...).Print(" scheme: ") 24 | } 25 | color.New(rightAttrs...).Println(res.Scheme) 26 | 27 | if len(res.UserInfo) != 0 { 28 | color.New(leftAttrsFilled...).Print(" userinfo: ") 29 | } else { 30 | color.New(leftAttrsBlank...).Print(" userinfo: ") 31 | } 32 | color.New(rightAttrs...).Println(res.UserInfo) 33 | 34 | if len(res.SubDomain) != 0 { 35 | color.New(leftAttrsFilled...).Print(" subdomain: ") 36 | } else { 37 | color.New(leftAttrsBlank...).Print(" subdomain: ") 38 | } 39 | color.New(rightAttrs...).Println(res.SubDomain) 40 | 41 | if len(res.Domain) != 0 { 42 | color.New(leftAttrsFilled...).Print(" domain: ") 43 | } else { 44 | color.New(leftAttrsBlank...).Print(" domain: ") 45 | } 46 | color.New(rightAttrs...).Println(res.Domain) 47 | 48 | if len(res.Suffix) != 0 { 49 | color.New(leftAttrsFilled...).Print(" suffix: ") 50 | } else { 51 | color.New(leftAttrsBlank...).Print(" suffix: ") 52 | } 53 | 
color.New(rightAttrs...).Println(res.Suffix) 54 | 55 | if len(res.RegisteredDomain) != 0 { 56 | color.New(leftAttrsFilled...).Print("registered domain: ") 57 | } else { 58 | color.New(leftAttrsBlank...).Print("registered domain: ") 59 | } 60 | color.New(rightAttrs...).Println(res.RegisteredDomain) 61 | 62 | if len(res.Port) != 0 { 63 | color.New(leftAttrsFilled...).Print(" port: ") 64 | } else { 65 | color.New(leftAttrsBlank...).Print(" port: ") 66 | } 67 | color.New(rightAttrs...).Println(res.Port) 68 | 69 | if len(res.Path) != 0 { 70 | color.New(leftAttrsFilled...).Print(" path: ") 71 | } else { 72 | color.New(leftAttrsBlank...).Print(" path: ") 73 | } 74 | color.New(rightAttrs...).Println(res.Path) 75 | 76 | if res.HostType != 0 { 77 | color.New(color.FgHiBlue, color.Bold).Print(" host type: ") 78 | } else { 79 | color.New(leftAttrsBlank...).Print(" host type: ") 80 | } 81 | switch res.HostType { 82 | case HostName: 83 | color.New(rightAttrs...).Println("hostname") 84 | case IPv4: 85 | color.New(rightAttrs...).Println("ipv4 address") 86 | case IPv6: 87 | color.New(rightAttrs...).Println("ipv6 address") 88 | default: 89 | color.New(rightAttrs...).Println() 90 | } 91 | 92 | color.New().Println() 93 | } 94 | -------------------------------------------------------------------------------- /print_test.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestPrintRes(t *testing.T) { 8 | PrintRes("", ExtractResult{}) 9 | res := ExtractResult{} 10 | res.Scheme = "https://" 11 | res.UserInfo = "user" 12 | res.SubDomain = "a.subdomain" 13 | res.Domain = "example" 14 | res.Suffix = "a%63.uk" 15 | res.RegisteredDomain = "example.a%63.uk" 16 | res.Port = "5000" 17 | res.Path = "/a/b?id=42" 18 | res.HostType = HostName 19 | PrintRes("https://user@a.subdomain.example.a%63.uk:5000/a/b?id=42", res) 20 | res = ExtractResult{} 21 | res.HostType = IPv4 22 | PrintRes("1.1.1.1", res) 
23 | res.HostType = IPv6 24 | PrintRes("[aBcD:ef01:2345:6789:aBcD:ef01:2345:6789]", res) 25 | } 26 | -------------------------------------------------------------------------------- /psl.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "fmt" 7 | "log" 8 | "net/http" 9 | "os" 10 | "path/filepath" 11 | "runtime" 12 | "strings" 13 | "time" 14 | 15 | "github.com/spf13/afero" 16 | "golang.org/x/net/idna" 17 | ) 18 | 19 | var publicSuffixListSources = []string{ 20 | "https://publicsuffix.org/list/public_suffix_list.dat", 21 | "https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat", 22 | } 23 | 24 | type suffixes struct { 25 | publicSuffixes []string 26 | privateSuffixes []string 27 | allSuffixes []string 28 | } 29 | 30 | func processLine(rawLine string, psl suffixes, isPrivateSuffix bool) (suffixes, bool) { 31 | line := strings.TrimSpace(rawLine) 32 | if "// ===BEGIN PRIVATE DOMAINS===" == line { 33 | isPrivateSuffix = true 34 | } 35 | if len(line) == 0 || strings.HasPrefix(line, "//") { 36 | return psl, isPrivateSuffix 37 | } 38 | suffix, err := idna.ToASCII(line) 39 | if err != nil { 40 | // skip line if unable to convert to ascii 41 | log.Println(line, '|', err) 42 | return psl, isPrivateSuffix 43 | } 44 | if isPrivateSuffix { 45 | psl.privateSuffixes = append(psl.privateSuffixes, suffix) 46 | if suffix != line { 47 | // add non-punycode version if it is different from punycode version 48 | psl.privateSuffixes = append(psl.privateSuffixes, line) 49 | } 50 | } else { 51 | psl.publicSuffixes = append(psl.publicSuffixes, suffix) 52 | if suffix != line { 53 | // add non-punycode version if it is different from punycode version 54 | psl.publicSuffixes = append(psl.publicSuffixes, line) 55 | } 56 | } 57 | psl.allSuffixes = append(psl.allSuffixes, suffix) 58 | if suffix != line { 59 | // add non-punycode version if it is different from punycode 
version 60 | psl.allSuffixes = append(psl.allSuffixes, line) 61 | } 62 | return psl, isPrivateSuffix 63 | } 64 | 65 | // getPublicSuffixList retrieves Public Suffixes and Private Suffixes from Public Suffix list located at cacheFilePath. 66 | // 67 | // publicSuffixes: ICANN domains. Example: com, net, org etc. 68 | // 69 | // privateSuffixes: PRIVATE domains. Example: blogspot.co.uk, appspot.com etc. 70 | // 71 | // allSuffixes: Both ICANN and PRIVATE domains. 72 | func getPublicSuffixList(cacheFilePath string) (suffixes, error) { 73 | var psl suffixes 74 | b, err := os.ReadFile(cacheFilePath) 75 | if err != nil { 76 | log.Println(err) 77 | return psl, err 78 | } 79 | var isPrivateSuffix bool 80 | for _, line := range strings.Split(string(b), "\n") { 81 | psl, isPrivateSuffix = processLine(line, psl, isPrivateSuffix) 82 | } 83 | return psl, nil 84 | } 85 | 86 | // getHardcodedPublicSuffixList retrieves Public Suffixes and Private Suffixes from hardcoded Public Suffix list. 87 | // 88 | // publicSuffixes: ICANN domains. Example: com, net, org etc. 89 | // 90 | // privateSuffixes: PRIVATE domains. Example: blogspot.co.uk, appspot.com etc. 91 | // 92 | // allSuffixes: Both ICANN and PRIVATE domains. 93 | func getHardcodedPublicSuffixList() (suffixes, error) { 94 | var psl suffixes 95 | var isPrivateSuffix bool 96 | for _, line := range strings.Split(hardcodedPSL, "\n") { 97 | psl, isPrivateSuffix = processLine(line, psl, isPrivateSuffix) 98 | } 99 | return psl, nil 100 | } 101 | 102 | // newHardcodedPSL creates a new *FastTLD using data from a hardcoded Public Suffix List file. 
103 | func newHardcodedPSL(err error, n SuffixListParams) (*FastTLD, error) { 104 | log.Println(err, "Fallback to hardcoded Public Suffix List") 105 | tldTrie, err := trieConstruct(n.IncludePrivateSuffix, "") 106 | return &FastTLD{cacheFilePath: "", tldTrie: tldTrie, includePrivateSuffix: n.IncludePrivateSuffix}, err 107 | } 108 | 109 | // downloadFile downloads file from url as byte slice 110 | func downloadFile(url string) ([]byte, error) { 111 | // Make HTTP GET request 112 | var bodyBytes []byte 113 | resp, err := http.Get(url) 114 | if err != nil { 115 | return bodyBytes, err 116 | } 117 | defer resp.Body.Close() 118 | 119 | if resp.StatusCode == http.StatusOK { 120 | bodyBytes, err = afero.ReadAll(resp.Body) 121 | } else { 122 | err = errors.New("Download failed, HTTP status code : " + fmt.Sprint(resp.StatusCode)) 123 | } 124 | return bodyBytes, err 125 | } 126 | 127 | // getCurrentFilePath returns path to current module file 128 | // 129 | // Similar to os.path.dirname(os.path.realpath(__file__)) in Python 130 | // 131 | // Credits: https://andrewbrookins.com/tech/golang-get-directory-of-the-current-file 132 | func getCurrentFilePath() (string, bool) { 133 | _, file, _, ok := runtime.Caller(0) 134 | return filepath.Dir(file), ok 135 | } 136 | 137 | // Number of hours elapsed since last modified time of fileinfo. 
138 | func fileLastModifiedHours(fileinfo os.FileInfo) float64 { 139 | return time.Now().Sub(fileinfo.ModTime()).Hours() 140 | } 141 | 142 | // update updates the local cache of Public Suffix List 143 | func update(file afero.File, 144 | publicSuffixListSources []string) error { 145 | for _, publicSuffixListSource := range publicSuffixListSources { 146 | // Write GET request body to local file 147 | if bodyBytes, err := downloadFile(publicSuffixListSource); err != nil { 148 | log.Println(err) 149 | } else { 150 | if !validPSLDelimiters(bodyBytes) { 151 | continue 152 | } 153 | if _, err := file.Seek(0, 0); err != nil { 154 | log.Println(err) 155 | continue 156 | } 157 | if _, err := file.Write(bodyBytes); err != nil { 158 | log.Println(err) 159 | continue 160 | } 161 | log.Println("Public Suffix List updated.") 162 | return nil 163 | } 164 | } 165 | return errors.New("failed to fetch any Public Suffix List from all mirrors") 166 | } 167 | 168 | func validPSLDelimiters(contents []byte) bool { 169 | return bytes.Contains(contents, []byte("// ===BEGIN ICANN DOMAINS===")) && 170 | bytes.Contains(contents, []byte("// ===END ICANN DOMAINS===")) && 171 | bytes.Contains(contents, []byte("// ===BEGIN PRIVATE DOMAINS===")) && 172 | bytes.Contains(contents, []byte("// ===END PRIVATE DOMAINS===")) 173 | } 174 | 175 | func checkCacheFile(cacheFilePath string) (bool, float64) { 176 | cacheFilePath, pathValidErr := filepath.Abs(strings.TrimSpace(cacheFilePath)) 177 | stat, fileinfoErr := os.Stat(cacheFilePath) 178 | var lastModifiedHours float64 179 | if fileinfoErr == nil { 180 | lastModifiedHours = fileLastModifiedHours(stat) 181 | } 182 | 183 | var validDelimiters bool 184 | if contents, err := os.ReadFile(cacheFilePath); err == nil { 185 | validDelimiters = validPSLDelimiters(contents) 186 | } 187 | return pathValidErr == nil && fileinfoErr == nil && !stat.IsDir() && validDelimiters, lastModifiedHours 188 | } 189 | 190 | // Update updates the default Public Suffix list file 
and updates its suffix trie using the updated file. 191 | // If cache file path is not the same as the default cache file path, this will be a no-op. 192 | func (f *FastTLD) Update() error { 193 | filesystem := new(afero.OsFs) 194 | defaultCacheFilePath := afero.GetTempDir(filesystem, "") + defaultPSLFileName 195 | 196 | if f.cacheFilePath != defaultCacheFilePath { 197 | return errors.New("No-op. Only default Public Suffix list file can be updated") 198 | } 199 | file, err := os.OpenFile(defaultCacheFilePath, os.O_CREATE|os.O_WRONLY, 0644) 200 | if err != nil { 201 | return err 202 | } 203 | defer file.Close() 204 | if updateErr := update(file, publicSuffixListSources); updateErr != nil { 205 | return updateErr 206 | } 207 | tldTrie, err := trieConstruct(f.includePrivateSuffix, defaultCacheFilePath) 208 | if err == nil { 209 | f.tldTrie = tldTrie 210 | f.cacheFilePath = defaultCacheFilePath 211 | } 212 | return err 213 | } 214 | -------------------------------------------------------------------------------- /psl_test.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "net/http/httptest" 7 | "os" 8 | "reflect" 9 | "testing" 10 | 11 | "github.com/spf13/afero" 12 | ) 13 | 14 | type getPublicSuffixListTest struct { 15 | cacheFilePath string 16 | expectedLists suffixes 17 | hasError bool 18 | } 19 | 20 | var getPublicSuffixListTests = []getPublicSuffixListTest{ 21 | {cacheFilePath: fmt.Sprintf("test%spublic_suffix_list.dat", string(os.PathSeparator)), 22 | expectedLists: pslTestLists, 23 | hasError: false, 24 | }, 25 | {cacheFilePath: fmt.Sprintf("test%smini_public_suffix_list.dat", string(os.PathSeparator)), 26 | expectedLists: suffixes{[]string{"ac", "com.ac", "edu.ac", "gov.ac", "net.ac", 27 | "mil.ac", "org.ac", "*.ck", "!www.ck", "org.sg"}, []string{"blogspot.com"}, 28 | []string{"ac", "com.ac", "edu.ac", "gov.ac", "net.ac", "mil.ac", 29 | "org.ac", "*.ck", 
"!www.ck", "org.sg", "blogspot.com"}}, 30 | hasError: false, 31 | }, 32 | {cacheFilePath: fmt.Sprintf("test%spublic_suffix_list.dat.noexist", string(os.PathSeparator)), 33 | expectedLists: suffixes{[]string{}, []string{}, []string{}}, 34 | hasError: true, 35 | }, 36 | } 37 | 38 | func TestGetPublicSuffixList(t *testing.T) { 39 | for _, test := range getPublicSuffixListTests { 40 | suffixLists, err := getPublicSuffixList(test.cacheFilePath) 41 | if test.hasError && err == nil { 42 | t.Errorf("Expected an error. Got no error.") 43 | } 44 | if !test.hasError && err != nil { 45 | t.Errorf("Expected no error. Got an error.") 46 | } 47 | if output := reflect.DeepEqual(suffixLists, 48 | test.expectedLists); !output && (len(suffixLists.publicSuffixes)+ 49 | len(suffixLists.privateSuffixes)+ 50 | len(suffixLists.allSuffixes)+ 51 | len(test.expectedLists.publicSuffixes)+ 52 | len(test.expectedLists.privateSuffixes)+ 53 | len(test.expectedLists.allSuffixes)) != 0 { 54 | t.Errorf("Output %q not equal to expected %q", 55 | suffixLists, test.expectedLists) 56 | } 57 | } 58 | } 59 | 60 | func TestGetHardcodedPublicSuffixList(t *testing.T) { 61 | suffixLists, err := getHardcodedPublicSuffixList() 62 | if err != nil { 63 | t.Errorf("Expected no error. 
Got an error.") 64 | } 65 | if len(suffixLists.publicSuffixes) == 0 { 66 | t.Errorf("len(suffixLists.publicSuffixes) should be more than 0.") 67 | } 68 | if len(suffixLists.privateSuffixes) == 0 { 69 | t.Errorf("len(suffixLists.privateSuffixes) should be more than 0.") 70 | } 71 | if len(suffixLists.allSuffixes) == 0 { 72 | t.Errorf("len(suffixLists.allSuffixes) should be more than 0.") 73 | } 74 | } 75 | 76 | func TestNewHardcodedPSL(t *testing.T) { 77 | f, err := newHardcodedPSL(nil, SuffixListParams{}) 78 | if err != nil { 79 | t.Errorf("newHardcodedPSL error: %q", err) 80 | } 81 | if f.tldTrie.matches.Len() == 0 { 82 | t.Errorf("tldTrie should not be empty") 83 | } 84 | } 85 | 86 | func TestDownloadFile(t *testing.T) { 87 | expectedResponse := []byte(`{"isItSunday": true}`) 88 | goodServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 89 | w.Write(expectedResponse) 90 | r.Header.Get("") // removes unused parameter warning 91 | })) 92 | defer goodServer.Close() 93 | badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 94 | w.WriteHeader(404) 95 | r.Header.Get("") // removes unused parameter warning 96 | })) 97 | defer badServer.Close() 98 | 99 | // HTTP Status Code 200 100 | res, _ := downloadFile(goodServer.URL) 101 | if output := reflect.DeepEqual(expectedResponse, 102 | res); !output { 103 | t.Errorf("Output %q not equal to expected %q", 104 | res, expectedResponse) 105 | } 106 | 107 | // HTTP Status Code 404 108 | res, _ = downloadFile(badServer.URL) 109 | if len(res) != 0 { 110 | t.Errorf("Response should be empty.") 111 | } 112 | 113 | // Malformed URL 114 | res, _ = downloadFile("!example.com") 115 | if len(res) != 0 { 116 | t.Errorf("Response should be empty.") 117 | } 118 | } 119 | 120 | type updateTest struct { 121 | mainServerAvailable, fallbackServerAvailable, expectError bool 122 | } 123 | 124 | var updateTests = []updateTest{ 125 | {true, true, false}, 126 | {true, 
false, false}, 127 | {false, true, false}, 128 | {false, false, true}, 129 | } 130 | 131 | func TestUpdate(t *testing.T) { 132 | requiredComments := "// ===BEGIN ICANN DOMAINS===\n// ===END ICANN DOMAINS===\n// ===BEGIN PRIVATE DOMAINS===\n// ===END PRIVATE DOMAINS===" 133 | goodServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 134 | w.Write([]byte(requiredComments)) 135 | r.Header.Get("") // removes unused parameter warning 136 | })) 137 | defer goodServer.Close() 138 | emptyServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 139 | w.Write([]byte("")) 140 | r.Header.Get("") // removes unused parameter warning 141 | })) 142 | defer emptyServer.Close() 143 | badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 144 | w.WriteHeader(404) 145 | r.Header.Get("") // removes unused parameter warning 146 | })) 147 | defer badServer.Close() 148 | 149 | filesystem := new(afero.MemMapFs) 150 | file, _ := afero.TempFile(filesystem, "", "ioutil-test") 151 | defer file.Close() 152 | 153 | for _, test := range updateTests { 154 | var primarySource, fallbackSource string 155 | if test.mainServerAvailable { 156 | primarySource = goodServer.URL 157 | } else { 158 | primarySource = badServer.URL 159 | } 160 | if test.fallbackServerAvailable { 161 | fallbackSource = goodServer.URL 162 | } else { 163 | fallbackSource = badServer.URL 164 | } 165 | 166 | // error should only be returned if Public Suffix List with requiredComments cannot 167 | // be downloaded from any of the sources. 
168 | err := update(file, []string{primarySource, fallbackSource}) 169 | if test.expectError && err == nil { 170 | t.Errorf("Expected update() error, got no error.") 171 | } 172 | if !test.expectError && err != nil { 173 | t.Errorf("Expected no update() error, got an error.") 174 | } 175 | } 176 | 177 | // None of the servers return content with requiredComments 178 | if err := update(file, []string{emptyServer.URL, emptyServer.URL}); err == nil { 179 | t.Errorf("Expected update() error, got no error.") 180 | } 181 | } 182 | 183 | func TestFileLastModifiedHours(t *testing.T) { 184 | filesystem := new(afero.MemMapFs) 185 | file, _ := afero.TempFile(filesystem, "", "ioutil-test") 186 | fileinfo, _ := filesystem.Stat(file.Name()) 187 | if hours := fileLastModifiedHours(fileinfo); int(hours) != 0 { 188 | t.Errorf("Expected hours elapsed since last modification to be 0 immediately after file creation. %f", hours) 189 | } 190 | defer file.Close() 191 | } 192 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:recommended"], 3 | "ignoreTests": false, 4 | "packageRules": [ 5 | { 6 | "matchUpdateTypes": ["minor", "patch", "pin", "digest"], 7 | "automerge": true 8 | }, 9 | { 10 | "description": "Opt-out of minimum Go version updates", 11 | "matchManagers": ["gomod"], 12 | "matchDepTypes": ["golang"], 13 | "enabled": false 14 | } 15 | ], 16 | "gomod": { 17 | "postUpdateOptions": ["gomodUpdateImportPaths", "gomodTidy"] 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /strings.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "log" 5 | "strings" 6 | "unicode/utf8" 7 | 8 | "github.com/karlseguin/intset" 9 | "golang.org/x/net/idna" 10 | ) 11 | 12 | // const string 
----------------------------------------------------------- 13 | 14 | const alphabets string = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 15 | const numbers string = "0123456789" 16 | 17 | // IETF RFC 3490 18 | const labelSeparators string = "\u002e\u3002\uff0e\uff61" 19 | 20 | const controlChars string = "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\t\n\v\f\r\u000e\u000f" + 21 | "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f" 22 | const whitespace string = controlChars + " \u0085\u0086\u00a0\u1680\u200b\u200c\u200d\uFEFF" 23 | const invalidHostNameChars string = whitespace + "!\"#$&'()*+,/:;<=>?@[\\]^_`{|}~\u0378\u04c0\u06dd\u180e\u2025\u202e\u206b\u2183\u2a74\u2ff0\ufdd0\uff05\uff0f\uff1a\ufffa" 24 | 25 | const endOfHostWithPortDelimiters string = `/\?#` 26 | const endOfHostDelimiters string = endOfHostWithPortDelimiters + ":" 27 | const invalidUserInfoChars string = endOfHostWithPortDelimiters + "[]" 28 | 29 | // asciiSet --------------------------------------------------------------- 30 | 31 | var numericSet asciiSet = makeASCIISet(numbers) 32 | var alphaNumericSet asciiSet = makeASCIISet(alphabets + numbers) 33 | var endOfHostWithPortDelimitersSet asciiSet = makeASCIISet(endOfHostWithPortDelimiters) 34 | var endOfHostDelimitersSet asciiSet = makeASCIISet(endOfHostDelimiters) 35 | var invalidUserInfoCharsSet asciiSet = makeASCIISet(invalidUserInfoChars) 36 | 37 | var schemeFirstCharSet asciiSet = makeASCIISet(alphabets) 38 | var schemeRemainingCharSet asciiSet = makeASCIISet(alphabets + numbers + "+-.") 39 | var slashes asciiSet = makeASCIISet(`/\`) 40 | 41 | // asciiSet is a 32-byte value, where each bit represents the presence of a 42 | // given ASCII character in the set. The 128-bits of the lower 16 bytes, 43 | // starting with the least-significant bit of the lowest word to the 44 | // most-significant bit of the highest word, map to the full range of all 45 | // 128 ASCII characters. 
The 128-bits of the upper 16 bytes will be zeroed, 46 | // ensuring that any non-ASCII character will be reported as not in the set. 47 | // This allocates a total of 32 bytes even though the upper half 48 | // is unused to avoid bounds checks in asciiSet.contains. 49 | type asciiSet [8]uint32 50 | 51 | // makeASCIISet creates a set of ASCII characters from runes in chars. 52 | // Non-ASCII runes are skipped. Similar to strings.makeASCIISet. 53 | func makeASCIISet(chars string) (as asciiSet) { 54 | for _, c := range chars { 55 | if c < utf8.RuneSelf { 56 | as[c/32] |= 1 << (c % 32) 57 | } 58 | } 59 | return as 60 | } 61 | 62 | // contains reports whether c is inside the set. 63 | // 64 | // same as strings.contains. 65 | func (as *asciiSet) contains(c byte) bool { 66 | return (as[c/32] & (1 << (c % 32))) != 0 67 | } 68 | 69 | // *intset.Rune ----------------------------------------------------------- 70 | 71 | var labelSeparatorsRuneSet *intset.Rune = makeRuneSet(labelSeparators) 72 | var whitespaceRuneSet *intset.Rune = makeRuneSet(whitespace) 73 | var invalidHostNameCharsRuneSet *intset.Rune = makeRuneSet(invalidHostNameChars) 74 | 75 | // makeRuneSet converts a string to a set of unique runes 76 | func makeRuneSet(s string) (iset *intset.Rune) { 77 | var biggestRune rune 78 | for idx, r := range s { 79 | if idx == 0 || r > biggestRune { 80 | biggestRune = r 81 | } 82 | } 83 | // optimal target capacity 84 | iset = intset.NewRune(biggestRune) 85 | for _, r := range s { 86 | iset.Set(r) 87 | } 88 | return 89 | } 90 | 91 | // ------------------------------------------------------------------------ 92 | 93 | // getSchemeEndIndex checks if string s begins with a URL Scheme and 94 | // returns its last index. Returns -1 if no Scheme exists. 
95 | func getSchemeEndIndex(s string) int { 96 | var colon bool 97 | var slashCount int 98 | 99 | for i := 0; i < len(s); i++ { 100 | // first character 101 | if i == 0 { 102 | // expecting schemeFirstCharSet or slash 103 | if schemeFirstCharSet.contains(s[i]) { 104 | continue 105 | } 106 | if slashes.contains(s[i]) { 107 | slashCount++ 108 | continue 109 | } 110 | return -1 111 | } 112 | // second character onwards 113 | // if no slashes yet, look for schemeRemainingCharSet or colon 114 | // otherwise look for slashes 115 | if slashCount == 0 { 116 | if !colon { 117 | if schemeRemainingCharSet.contains(s[i]) { 118 | continue 119 | } 120 | if s[i] == ':' { 121 | colon = true 122 | continue 123 | } 124 | } 125 | if slashes.contains(s[i]) { 126 | slashCount++ 127 | continue 128 | } 129 | return -1 130 | } 131 | // expecting only slashes 132 | if slashes.contains(s[i]) { 133 | slashCount++ 134 | continue 135 | } 136 | if slashCount < 2 { 137 | return -1 138 | } 139 | return i 140 | } 141 | if slashCount >= 2 { 142 | return len(s) 143 | } 144 | return -1 145 | } 146 | 147 | // indexAnyASCII returns the index of the first instance of any Unicode code point 148 | // from asciiSet in s, or -1 if no Unicode code point from asciiSet is present in s. 149 | // 150 | // Similar to strings.IndexAny but takes in an asciiSet instead of a string 151 | // and skips input validation. 
152 | func indexAnyASCII(s string, as asciiSet) int { 153 | for i, b := range []byte(s) { 154 | if as.contains(b) { 155 | return i 156 | } 157 | } 158 | return -1 159 | } 160 | 161 | // hasInvalidChars checks s for invalid runes 162 | // 163 | // or leading/consecutive label separators 164 | // 165 | // or leading/trailing dash 166 | func hasInvalidChars(s string) bool { 167 | var isLabelSeparator bool 168 | lastByteIdx := len(s) - 1 169 | for idx, c := range s { 170 | if alphaNumericSet.contains(byte(c)) { 171 | // check for alphanumeric characters early to avoid expensive intset search 172 | isLabelSeparator = false 173 | continue 174 | } 175 | if idx == 0 && (c == '-' || labelSeparatorsRuneSet.Exists(c)) { 176 | // starts with a dash or label separator 177 | return true 178 | } 179 | if idx == lastByteIdx && c == '-' { 180 | // ends with a dash 181 | return true 182 | } 183 | if labelSeparatorsRuneSet.Exists(c) { 184 | if isLabelSeparator { 185 | // reject consecutive label separators 186 | return true 187 | } 188 | isLabelSeparator = true 189 | } else { 190 | isLabelSeparator = false 191 | } 192 | if invalidHostNameCharsRuneSet.Exists(c) { 193 | return true 194 | } 195 | } 196 | return false 197 | } 198 | 199 | // lastIndexAny returns the index of the last instance of any Unicode code 200 | // point from chars in s, or -1 if no Unicode code point from chars is 201 | // present in s. 202 | // 203 | // Similar to strings.LastIndexAny but skips input validation and uses *intset.Rune. 204 | func lastIndexAny(s string, chars *intset.Rune) int { 205 | for i := len(s); i > 0; { 206 | r, size := utf8.DecodeLastRuneInString(s[0:i]) 207 | i -= size 208 | if chars.Exists(r) { 209 | return i 210 | } 211 | } 212 | return -1 213 | } 214 | 215 | // reverse reverses a slice of strings in-place. 
216 | func reverse(input []string) { 217 | for i, j := 0, len(input)-1; i < j; i, j = i+1, j-1 { 218 | input[i], input[j] = input[j], input[i] 219 | } 220 | } 221 | 222 | // sepSize returns byte length of an sep rune, given the rune's first byte. 223 | func sepSize(r byte) int { 224 | // r is the first byte of any of the runes in labelSeparators 225 | if r == 46 { 226 | // First byte of '.' is 46 227 | // size of '.' is 1 228 | return 1 229 | } 230 | // First byte of any label separator other than '.' is not 46 231 | // size of separator is 3 232 | return 3 233 | } 234 | 235 | var idnaToPuny *idna.Profile = idna.New(idna.MapForLookup(), idna.Transitional(true), idna.BidiRule(), idna.CheckHyphens(true)) 236 | 237 | // formatAsPunycode formats s as punycode. 238 | func formatAsPunycode(s string) string { 239 | asPunyCode, err := idnaToPuny.ToASCII(s) 240 | if err != nil { 241 | log.Println(strings.SplitAfterN(err.Error(), "idna: invalid label", 2)[0]) 242 | return "" 243 | } 244 | return asPunyCode 245 | } 246 | 247 | // indexLastByteBefore returns the index of the last instance of byte b 248 | // before any byte in notAfterCharsSet, otherwise -1 249 | func indexLastByteBefore(s string, b byte, notAfterCharsSet asciiSet) int { 250 | if firstNotAfterCharIdx := indexAnyASCII(s, notAfterCharsSet); firstNotAfterCharIdx != -1 { 251 | return strings.LastIndexByte(s[0:firstNotAfterCharIdx], b) 252 | } 253 | return strings.LastIndexByte(s, b) 254 | } 255 | 256 | // trimMode specifies which parts of string to trim for fastTrim() 257 | type trimMode int 258 | 259 | const ( 260 | trimBoth trimMode = iota 261 | trimLeft 262 | trimRight 263 | ) 264 | 265 | // fastTrim works like strings.Trim but uses *intset.Rune 266 | func fastTrim(s string, charsToTrim *intset.Rune, mode trimMode) string { 267 | var startIdx, endIdx int 268 | if mode != trimRight { 269 | // Trim left-hand side 270 | var trimCharsExist bool 271 | var broken bool 272 | for idx, c := range s { 273 | startIdx = idx 
274 | if !charsToTrim.Exists(c) { 275 | broken = true 276 | break 277 | } 278 | trimCharsExist = true 279 | } 280 | if trimCharsExist && !broken { 281 | // Return empty string if every character in s exists in charsToTrim 282 | return "" 283 | } 284 | } 285 | if mode != trimLeft { 286 | // Trim right-hand side 287 | var trimCharsExist bool 288 | var broken bool 289 | for i := len(s); i > 0; { 290 | endIdx = i 291 | r, size := utf8.DecodeLastRuneInString(s[0:i]) 292 | i -= size 293 | if !charsToTrim.Exists(r) { 294 | broken = true 295 | break 296 | } 297 | trimCharsExist = true 298 | } 299 | if trimCharsExist && !broken { 300 | // Return empty string if every character in s exists in charsToTrim 301 | return "" 302 | } 303 | } else { 304 | endIdx = len(s) 305 | } 306 | return s[startIdx:endIdx] 307 | } 308 | -------------------------------------------------------------------------------- /strings_test.go: -------------------------------------------------------------------------------- 1 | package fasttld 2 | 3 | import ( 4 | "reflect" 5 | "strings" 6 | "testing" 7 | 8 | "github.com/karlseguin/intset" 9 | ) 10 | 11 | type punyCodeTest struct { 12 | url string 13 | expected string 14 | } 15 | 16 | var punyCodeTests = []punyCodeTest{ 17 | {"google.com", "google.com"}, 18 | {"hello.世界.com", "hello.xn--rhqv96g.com"}, 19 | {strings.Repeat("x", 65536) + "\uff00", ""}, // int32 overflow. 
20 | } 21 | 22 | func TestPunyCode(t *testing.T) { 23 | for _, test := range punyCodeTests { 24 | converted := formatAsPunycode(test.url) 25 | if output := reflect.DeepEqual(converted, test.expected); !output { 26 | t.Errorf("Output %q not equal to expected %q", converted, test.expected) 27 | } 28 | } 29 | } 30 | 31 | type reverseTest struct { 32 | original []string 33 | expected []string 34 | } 35 | 36 | var reverseTests = []reverseTest{ 37 | {[]string{}, []string{}}, 38 | {[]string{"ab"}, []string{"ab"}}, 39 | {[]string{"ab", "cd", "gh", "ij"}, []string{"ij", "gh", "cd", "ab"}}, 40 | {[]string{"ab", "cd", "ef", "gh", "ij"}, []string{"ij", "gh", "ef", "cd", "ab"}}, 41 | } 42 | 43 | func TestReverse(t *testing.T) { 44 | for _, test := range reverseTests { 45 | reverse(test.original) 46 | if output := reflect.DeepEqual(test.original, test.expected); !output { 47 | t.Errorf("Output %q not equal to expected %q", test.original, test.expected) 48 | } 49 | } 50 | } 51 | 52 | func TestFastTrim(t *testing.T) { 53 | const charsToTrim string = ".@新" 54 | var charsToTrimRuneSet *intset.Rune = makeRuneSet(charsToTrim) 55 | 56 | ss := []string{".abc.", ".abc", "abc.", "..abc.", ".abc..", "..abc..", 57 | "@abc@", "@abc", "abc@", "@@abc@", "@abc@@", "@@abc@@", 58 | "新abc新", "新abc", "abc新", "新新abc新", "新abc新新", "新新abc新新", 59 | "新@abc新.", "新.abc", "abc@新", "新新.abc新", "新abc新@新", "新新.abc.新新", 60 | ".", "..", 61 | ".@", "@.", 62 | ".@新", "新@.", 63 | " ", " .@ ", ". 
.@ ", " .@ 新", 64 | "abc"} 65 | 66 | for _, s := range ss { 67 | expectedTrimBoth := strings.Trim(s, charsToTrim) 68 | if output := fastTrim(s, charsToTrimRuneSet, trimBoth); output != expectedTrimBoth { 69 | t.Errorf("Output %q not equal to expected %q", output, expectedTrimBoth) 70 | } 71 | expectedTrimLeft := strings.TrimLeft(s, charsToTrim) 72 | if output := fastTrim(s, charsToTrimRuneSet, trimLeft); output != expectedTrimLeft { 73 | t.Errorf("Output %q not equal to expected %q", output, expectedTrimLeft) 74 | } 75 | expectedTrimRight := strings.TrimRight(s, charsToTrim) 76 | if output := fastTrim(s, charsToTrimRuneSet, trimRight); output != expectedTrimRight { 77 | t.Errorf("Output %q not equal to expected %q", output, expectedTrimRight) 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /test/mini_public_suffix_list.dat: -------------------------------------------------------------------------------- 1 | // ===BEGIN ICANN DOMAINS=== 2 | ac 3 | com.ac 4 | edu.ac 5 | gov.ac 6 | net.ac 7 | mil.ac 8 | org.ac 9 | *.ck 10 | !www.ck 11 | org.sg 12 | // ===END ICANN DOMAINS=== 13 | // ===BEGIN PRIVATE DOMAINS=== 14 | blogspot.com 15 | // the following line is invalid punycode 16 | xn--0.com 17 | // ===END PRIVATE DOMAINS=== 18 | --------------------------------------------------------------------------------