├── .gitignore ├── tools ├── private_domains_checker │ ├── .gitignore │ ├── requirements.txt │ ├── TestPSLPrivateDomainsProcessor.py │ └── README.md ├── convert_tests ├── patchnewgtlds ├── go.mod ├── internal │ ├── domain │ │ ├── update_idna_testdata.go │ │ └── domain_test.go │ ├── parser │ │ ├── file_test.go │ │ ├── write.go │ │ ├── unicode.go │ │ ├── write_test.go │ │ ├── text_test.go │ │ ├── text.go │ │ ├── metadata_test.go │ │ ├── errors.go │ │ ├── validate_test.go │ │ ├── metadata.go │ │ ├── diff.go │ │ ├── parser_test.go │ │ ├── file.go │ │ └── parser.go │ ├── githistory │ │ └── history.go │ └── github │ │ └── pr.go ├── go.sum └── psltool │ └── psltool.go ├── linter ├── test_section1.input ├── test_allowedchars.input ├── test_section2.expected ├── test_section3.expected ├── test_NFKC.expected ├── test_section1.expected ├── test_punycode.expected ├── test_section4.input ├── test_section4.expected ├── test_allowedchars.expected ├── test_punycode.input ├── test_section2.input ├── test_wildcard.expected ├── test_section3.input ├── test_dots.expected ├── test_wildcard.input ├── test_NFKC.input ├── test_dots.input ├── test_spaces.expected ├── test_exception.expected ├── test_duplicate.expected ├── test_duplicate.input ├── test_exception.input ├── test_spaces.input ├── README.md ├── pslint_selftest.sh └── pslint.py ├── tests ├── README ├── test_psl.js ├── test_bug414122.js ├── tests.txt ├── prepare_tlds.py └── test_psl.txt ├── .github ├── workflows │ ├── deploy-site.yml │ ├── validate.yml │ ├── psltool_pr_check.yml │ ├── test.yml │ ├── psltool-fmt.yml │ └── tld-update.yml └── pull_request_template.md ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── SECURITY.md ├── Makefile ├── CONTRIBUTING.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | linter/log 2 | libpsl 3 | coverage.out 4 | -------------------------------------------------------------------------------- /tools/private_domains_checker/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | __pycache__ 3 | data/*.csv -------------------------------------------------------------------------------- /tools/private_domains_checker/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | requests 3 | whoisdomain -------------------------------------------------------------------------------- /linter/test_section1.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - no section at all 3 | 4 | example.com 5 | -------------------------------------------------------------------------------- /linter/test_allowedchars.input: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lovablelabs/list/main/linter/test_allowedchars.input -------------------------------------------------------------------------------- /linter/test_section2.expected: -------------------------------------------------------------------------------- 1 | 11: warning: 2 ICANN sections found 2 | 11: warning: No PRIVATE section found 3 | -------------------------------------------------------------------------------- /linter/test_section3.expected: -------------------------------------------------------------------------------- 1 | 11: warning: No ICANN section found 2 | 11: warning: 2 PRIVATE sections found 3 | -------------------------------------------------------------------------------- 
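The `test_*.input` / `test_*.expected` pairs above follow one convention: each `.input` file is run through the linter, and the diagnostics it prints (lines of the form `<line>: <severity>: <message>`) are diffed against the matching `.expected` file. Below is a minimal Python sketch of that harness logic; it mirrors what `linter/pslint_selftest.sh` does in shell, and assumes only that `pslint.py` prints diagnostics to stdout/stderr and that the fixtures sit in the current directory. (The real shell script also temporarily adds a CR to `test_spaces.input` to exercise CRLF handling, which this sketch omits.)

```
#!/usr/bin/env python3
# Sketch of the linter selftest harness: lint each test_*.input and
# compare the output against the matching test_*.expected file.
import glob
import subprocess
import sys

def run_selftest(linter="./pslint.py"):
    failures = 0
    for inp in sorted(glob.glob("test_*.input")):
        name = inp.rsplit(".", 1)[0]
        proc = subprocess.run([linter, inp], capture_output=True, text=True)
        got = proc.stdout + proc.stderr
        with open(name + ".expected", encoding="utf-8") as f:
            want = f.read()
        ok = got == want
        print(f"{name}: {'OK' if ok else 'FAILED'}")
        failures += not ok
    return 1 if failures else 0

if __name__ == "__main__":
    sys.exit(run_selftest())
```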
/linter/test_NFKC.expected: -------------------------------------------------------------------------------- 1 | 9: error: Rule must be NFKC: 'südtirol.it' 2 | 11: warning: No PRIVATE section found 3 | -------------------------------------------------------------------------------- /linter/test_section1.expected: -------------------------------------------------------------------------------- 1 | 4: error: Rule outside of section: 'example.com' 2 | 4: warning: No ICANN section found 3 | 4: warning: No PRIVATE section found 4 | -------------------------------------------------------------------------------- /linter/test_punycode.expected: -------------------------------------------------------------------------------- 1 | 7: error: Punycode found: 'a.xn--0zwm56d' 2 | 8: error: Double minus found: 'a.ex--ample.com' 3 | 10: warning: No PRIVATE section found 4 | -------------------------------------------------------------------------------- /linter/test_section4.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - ICANN section improperly closed 3 | 4 | // ===BEGIN ICANN DOMAINS=== 5 | 6 | example.com 7 | 8 | // ===END PRIVATE DOMAINS=== 9 | -------------------------------------------------------------------------------- /linter/test_section4.expected: -------------------------------------------------------------------------------- 1 | 8: error: Unexpected end of section: '// ===END PRIVATE DOMAINS===' 2 | 8: error: ICANN section not closed 3 | 8: warning: No PRIVATE section found 4 | -------------------------------------------------------------------------------- /linter/test_allowedchars.expected: -------------------------------------------------------------------------------- 1 | 10: error: Illegal character: 'a.exam#ple.com' 2 | 11: error: Illegal character: 'b.exam ple.com' 3 | 13: error: Invalid UTF-8 character 4 | 15: warning: No PRIVATE section found 5 | -------------------------------------------------------------------------------- /linter/test_punycode.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - label is punycode 3 | // - label has double minus 4 | 5 | // ===BEGIN ICANN DOMAINS=== 6 | 7 | a.xn--0zwm56d 8 | a.ex--ample.com 9 | 10 | // ===END ICANN DOMAINS=== 11 | -------------------------------------------------------------------------------- /linter/test_section2.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - two ICANN sections 3 | 4 | // ===BEGIN ICANN DOMAINS=== 5 | 6 | example.com 7 | 8 | // ===END ICANN DOMAINS=== 9 | 10 | // ===BEGIN ICANN DOMAINS=== 11 | // ===END ICANN DOMAINS=== 12 | -------------------------------------------------------------------------------- /linter/test_wildcard.expected: -------------------------------------------------------------------------------- 1 | 11: error: Illegal character: '**.com' 2 | 12: error: Illegal character: 'a*.com' 3 | 13: error: Illegal character: 'b.*.com' 4 | 14: error: Illegal character: 'a.b.*' 5 | 16: warning: No PRIVATE section found 6 | -------------------------------------------------------------------------------- /linter/test_section3.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - two PRIVATE sections 3 | 4 | // ===BEGIN PRIVATE DOMAINS=== 5 | 6 | example.com 7 | 8 | // ===END PRIVATE DOMAINS=== 9 | 10 | // ===BEGIN PRIVATE DOMAINS=== 11 | // ===END PRIVATE DOMAINS=== 12 | 
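The `test_section1` through `test_section4` fixtures all exercise the same bookkeeping: rules must appear between `// ===BEGIN ... DOMAINS===` and `// ===END ... DOMAINS===` markers, each of the ICANN and PRIVATE sections must appear exactly once, and an END marker must match the currently open section. A compact sketch of that state machine follows, under the assumption that this is roughly what `pslint.py` checks; the real linter's exact messages and line-number reporting differ in detail.

```
# Sketch of the section checks the test_section* fixtures exercise
# (not the real pslint.py implementation).
import re

BEGIN = re.compile(r"^// ===BEGIN (ICANN|PRIVATE) DOMAINS===$")
END = re.compile(r"^// ===END (ICANN|PRIVATE) DOMAINS===$")

def check_sections(lines):
    problems = []
    counts = {"ICANN": 0, "PRIVATE": 0}
    current = None  # name of the currently open section, if any
    for n, line in enumerate(lines, 1):
        if m := BEGIN.match(line):
            counts[m.group(1)] += 1
            current = m.group(1)
        elif m := END.match(line):
            if current == m.group(1):
                current = None  # properly closed
            else:
                problems.append((n, f"Unexpected end of section: {line!r}"))
        elif line and not line.startswith("//") and current is None:
            problems.append((n, f"Rule outside of section: {line!r}"))
    if current is not None:
        problems.append((len(lines), f"{current} section not closed"))
    for name, seen in counts.items():
        if seen == 0:
            problems.append((len(lines), f"No {name} section found"))
        elif seen > 1:
            problems.append((len(lines), f"{seen} {name} sections found"))
    return problems
```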
-------------------------------------------------------------------------------- /tools/convert_tests: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Written 2016 by Tim Ruehsen (tim dot ruehsen at gmx dot de) 4 | # 5 | # Convert test_psl.txt to tests.txt (sed hack) 6 | 7 | sed -e "s/checkPublicSuffix('*\([^']*\)'*, '*\([^']*\)'*);/\1 \2/g" test_psl.txt >tests.txt 8 | -------------------------------------------------------------------------------- /linter/test_dots.expected: -------------------------------------------------------------------------------- 1 | 9: error: Leading/trailing or multiple dot: '.a.example.com' 2 | 10: error: Leading/trailing or multiple dot: 'b.example.com.' 3 | 11: error: Leading/trailing or multiple dot: 'c..example.com' 4 | 13: warning: No PRIVATE section found 5 | -------------------------------------------------------------------------------- /linter/test_wildcard.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - valid wildcard usage 3 | // - invalid wildcard usage 4 | 5 | // ===BEGIN ICANN DOMAINS=== 6 | 7 | // valid 8 | *.com 9 | 10 | // invalid 11 | **.com 12 | a*.com 13 | b.*.com 14 | a.b.* 15 | 16 | // ===END ICANN DOMAINS=== 17 | -------------------------------------------------------------------------------- /tests/README: -------------------------------------------------------------------------------- 1 | prepare_tlds.py: 2 | 3 | This is a copy of a file mastered in Mozilla's Hg repo at: 4 | https://hg.mozilla.org/mozilla-central/file/default/netwerk/dns/prepare_tlds.py 5 | We include it here so we can check that it still produces valid output when 6 | the PSL changes. 7 | 8 | -------------------------------------------------------------------------------- /linter/test_NFKC.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - label contains non-NFKC character(s) 3 | // 4 | // best viewed with 'LC_ALL=C.UTF-8 vi ' (or any other UTF-8 locale) 5 | 6 | // ===BEGIN ICANN DOMAINS=== 7 | 8 | südtirol.it 9 | südtirol.it 10 | 11 | // ===END ICANN DOMAINS=== 12 | -------------------------------------------------------------------------------- /linter/test_dots.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - leading dot 3 | // - trailing dot 4 | // - consecutive dots 5 | 6 | // ===BEGIN ICANN DOMAINS=== 7 | 8 | // example.com: https://www.iana.org/domains/reserved 9 | .a.example.com 10 | b.example.com. 
11 | c..example.com 12 | 13 | // ===END ICANN DOMAINS=== 14 | -------------------------------------------------------------------------------- /linter/test_spaces.expected: -------------------------------------------------------------------------------- 1 | 12: warning: Leading/Trailing whitespace: ' a.example.com' 2 | 13: warning: Leading/Trailing whitespace: 'b.example.com ' 3 | 14: warning: Leading/Trailing whitespace: '\tc.example.com' 4 | 15: warning: Leading/Trailing whitespace: 'd.example.com\t' 5 | 17: warning: Leading/Trailing whitespace: ' ' 6 | 19: warning: No PRIVATE section found 7 | -------------------------------------------------------------------------------- /.github/workflows/deploy-site.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: deploy website 3 | on: 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | 9 | permissions: {} 10 | 11 | jobs: 12 | upload-website: 13 | uses: publicsuffix/publicsuffix.org/.github/workflows/deploy-site.yaml@main 14 | permissions: 15 | contents: read 16 | id-token: write 17 | secrets: inherit 18 | -------------------------------------------------------------------------------- /linter/test_exception.expected: -------------------------------------------------------------------------------- 1 | 17: error: Leading/trailing or multiple dot: '!.example.com' 2 | 18: error: Illegal character: 'w!w.example.com' 3 | 19: error: Found doublette/ambiguity (previous line was 12): '!www.example.com' 4 | 20: error: Exception without previous wildcard: '!a.b.example.com' 5 | 21: error: Exception without previous wildcard: '!a.c.example.com' 6 | 23: warning: No PRIVATE section found 7 | -------------------------------------------------------------------------------- /tools/patchnewgtlds: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -o pipefail 5 | set -x 6 | 7 | if ! [ -x "$(command -v go)" ] 8 | then 9 | echo "error: a go binary in \$PATH is required to run newgtlds.go" >&2 10 | exit 1 11 | fi 12 | 13 | SCRIPT=$(realpath "$0") 14 | BASEDIR=$(dirname "$SCRIPT") 15 | 16 | go run -C "$BASEDIR/" . 
\ 17 | -overwrite \ 18 | -psl-dat-file="$BASEDIR/../public_suffix_list.dat" 19 | -------------------------------------------------------------------------------- /linter/test_duplicate.expected: -------------------------------------------------------------------------------- 1 | 9: error: Found doublette/ambiguity (previous line was 8): '*.com' 2 | 13: error: Found doublette/ambiguity (previous line was 12): '!www.com' 3 | 17: error: Found doublette/ambiguity (previous line was 16): '*.example.com' 4 | 21: error: Found doublette/ambiguity (previous line was 20): 'example1.com' 5 | 24: error: Found doublette/ambiguity (previous line was 17): 'example.com' 6 | 26: warning: No PRIVATE section found 7 | -------------------------------------------------------------------------------- /linter/test_duplicate.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - duplicate rules (plain, wildcard, exception) 3 | // - redundant/overlapping rules 4 | 5 | // ===BEGIN ICANN DOMAINS=== 6 | 7 | // *.com implicitly includes .com 8 | com 9 | *.com 10 | 11 | // double exception 12 | !www.com 13 | !www.com 14 | 15 | // double wildcard 16 | *.example.com 17 | *.example.com 18 | 19 | // double plain rule 20 | example1.com 21 | example1.com 22 | 23 | // redundant/overlapping rule 24 | example.com 25 | 26 | // ===END ICANN DOMAINS=== 27 | -------------------------------------------------------------------------------- /linter/test_exception.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - valid exception 3 | // - invalid exceptions 4 | // - same exception twice 5 | // - exception without wildcard 6 | // - exception with prevailing '*' rule (!localhost) 7 | 8 | // ===BEGIN ICANN DOMAINS=== 9 | 10 | // valid 11 | *.example.com 12 | !www.example.com 13 | !localhost 14 | c.example.com 15 | 16 | // invalid 17 | !.example.com 18 | w!w.example.com 19 | !www.example.com 20 | !a.b.example.com 21 | !a.c.example.com 22 | 23 | // ===END ICANN DOMAINS=== 24 | -------------------------------------------------------------------------------- /linter/test_spaces.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - leading space 3 | // - trailing space, empty line with spaces 4 | // - leading tab 5 | // - trailing tab 6 | // - line ends with CRLF (pslint_selftest will add one to e.example.com and remove it after testing) 7 | // - empty line with spaces 8 | 9 | // ===BEGIN ICANN DOMAINS=== 10 | 11 | // example.com: https://www.iana.org/domains/reserved 12 | a.example.com 13 | b.example.com 14 | c.example.com 15 | d.example.com 16 | e.example.com 17 | 18 | 19 | // ===END ICANN DOMAINS=== 20 | -------------------------------------------------------------------------------- /.github/workflows/validate.yml: -------------------------------------------------------------------------------- 1 | name: validate 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | validate: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 12 | - uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5.4.0 13 | with: 14 | go-version: "stable" 15 | - name: run validations 16 | run: | 17 | cd tools 18 | go run ./psltool validate ../public_suffix_list.dat 19 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1
| language: c 2 | compiler: gcc 3 | 4 | script: 5 | - make 6 | - go test -v -coverprofile=coverage.out tools/*.go 7 | 8 | go: 9 | - "1.15.x" 10 | 11 | addons: 12 | apt: 13 | packages: 14 | - python3 15 | - autoconf 16 | - automake 17 | - autopoint 18 | - libtool 19 | - gettext 20 | - libidn11-dev 21 | - libidn2-0 22 | - libidn2-0-dev 23 | - libicu-dev 24 | - libunistring0 25 | - libunistring-dev 26 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ### Community Participation Guidelines 2 | Your participation in the Public Suffix List project should follow the [Mozilla Community Participation Guidelines](https://www.mozilla.org/en-US/about/governance/policies/participation/ "Mozilla Community Participation Guidelines") as well as the [GitHub Community Participation Guidelines](https://help.github.com/en/github/site-policy/github-community-guidelines "GitHub Community Participation Guidelines"). Behavior that falls into the areas forbidden by either document is unwelcome and will result in further escalation. 3 | -------------------------------------------------------------------------------- /tools/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/publicsuffix/list/tools 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.24.2 6 | 7 | require ( 8 | github.com/google/go-cmp v0.6.0 9 | golang.org/x/net v0.38.0 10 | golang.org/x/text v0.23.0 11 | ) 12 | 13 | require ( 14 | github.com/creachadair/command v0.1.13 15 | github.com/creachadair/flax v0.0.0-20240525192034-44db93b3a8ad 16 | github.com/creachadair/mds v0.15.2 17 | github.com/creachadair/taskgroup v0.9.0 18 | github.com/google/go-github/v63 v63.0.0 19 | github.com/natefinch/atomic v1.0.1 20 | ) 21 | 22 | require github.com/google/go-querystring v1.1.0 // indirect 23 | -------------------------------------------------------------------------------- /.github/workflows/psltool_pr_check.yml: -------------------------------------------------------------------------------- 1 | name: psltool PR check 2 | 3 | on: 4 | pull_request: 5 | 6 | permissions: {} 7 | 8 | jobs: 9 | validate: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 14 | - uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5.4.0 15 | with: 16 | go-version: "stable" 17 | - name: run validations 18 | run: | 19 | cd tools 20 | go run ./psltool fmt -d ../public_suffix_list.dat && go run ./psltool check-pr --gh-owner ${{ github.event.repository.owner.login }} --gh-repo ${{ github.event.repository.name }} --online-checks ${{ github.event.pull_request.number }} 21 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: [push, pull_request] 3 | 4 | permissions: 5 | contents: read 6 | 7 | jobs: 8 | make-test: 9 | name: Unit tests 10 | runs-on: ubuntu-22.04 11 | steps: 12 | - name: Check out code 13 | uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 14 | 15 | - name: Set up Go 16 | uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0 17 | with: 18 | go-version: 'stable' 19 | 20 | - name: Run Go unit tests 21 | run: go test -C ./tools -v . 
22 | 23 | - name: Install dependencies 24 | run: sudo apt install -y autopoint 25 | 26 | - name: Run makefile tests 27 | run: make test 28 | -------------------------------------------------------------------------------- /linter/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a linter for the Public Suffix List. 2 | 3 | Before you commit any changes to the PSL, please use the 4 | linter to check the syntax. 5 | 6 | Usage 7 | ===== 8 | 9 | (from the repo's main directory) 10 | 11 | $ linter/pslint.py public_suffix_list.dat 12 | 13 | $? is set to 0 on success, else it is set to 1. 14 | 15 | 16 | Selftest 17 | ======== 18 | 19 | Every change to pslint.py should be followed by a self-test. 20 | 21 | ``` 22 | $ cd linter 23 | $ ./pslint_selftest.sh 24 | test_allowedchars: OK 25 | test_dots: OK 26 | test_duplicate: OK 27 | test_exception: OK 28 | test_punycode: OK 29 | test_section1: OK 30 | test_section2: OK 31 | test_section3: OK 32 | test_section4: OK 33 | test_spaces: OK 34 | test_wildcard: OK 35 | ``` 36 | -------------------------------------------------------------------------------- /linter/pslint_selftest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rc=0 4 | rm -rf log 5 | mkdir -p log 6 | 7 | # add CR if missing, since a literal CR may not survive git 8 | sed -i -e 's/^e.example.com$/e.example.com\r/g' test_spaces.input 9 | 10 | for file in `ls *.input|cut -d'.' -f1`; do 11 | echo -n "${file}: " 12 | ./pslint.py ${file}.input >log/${file}.log 2>&1 13 | diff -u ${file}.expected log/${file}.log >log/${file}.diff 14 | if [ $? -eq 0 ]; then 15 | echo OK 16 | rm log/${file}.diff log/${file}.log 17 | else 18 | echo FAILED 19 | cat log/${file}.diff 20 | rc=1 21 | fi 22 | done 23 | 24 | # remove CR, to not appear as changed to git 25 | sed -i -e 's/^e.example.com\r$/e.example.com/g' test_spaces.input 26 | 27 | if [ $rc -eq 0 ]; then 28 | rmdir log 29 | fi 30 | 31 | exit $rc 32 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | Security updates are applied only to the repository itself. 4 | 5 | ## Reporting a Vulnerability 6 | 7 | Reports are limited to repo matters. Any vulnerability reports related to the addition or removal of PSL entries in the .dat file shall be rejected and referred to filing pull requests, which should mention the alleged urgency. 8 | 9 | If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. 10 | 11 | Please disclose it at [security advisory](https://github.com/publicsuffix/list/security/advisories/new) and send an email with the link to the newly filed issue to [security@mozilla.org](mailto:security@mozilla.org) to expedite the review on our end. 12 | 13 | This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
14 | -------------------------------------------------------------------------------- /.github/workflows/psltool-fmt.yml: -------------------------------------------------------------------------------- 1 | name: psltool-fmt 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: 7 | pull-requests: write 8 | contents: write # This should be okay since you cannot easily run this on 9 | # something like an untrusted PR. PRs are not offered in the GUI. 10 | 11 | jobs: 12 | validate: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | - uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5.4.0 18 | with: 19 | go-version: "stable" 20 | - name: run validations 21 | run: | 22 | cd tools 23 | go run ./psltool fmt ../public_suffix_list.dat 24 | - name: create PR 25 | uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8 26 | with: 27 | commit-message: Apply formatting using `psltool fmt` 28 | branch: psltool-fmt 29 | title: 'Automatic PR for workflow `psltool-fmt`' 30 | body: 'Automatic PR for formatting workflow using `psltool fmt`' 31 | -------------------------------------------------------------------------------- /tools/internal/domain/update_idna_testdata.go: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | 3 | // This script is run by `go generate` (see domains_test.go) to 4 | // download a new copy of the IDNA test inputs. They are stored 5 | // verbatim as provided by the Unicode Consortium to make it easy to 6 | // verify that it's an unaltered file, and gets parsed for the 7 | // information relevant to this package in domains_test.go. 8 | package main 9 | 10 | import ( 11 | "fmt" 12 | "log" 13 | "net/http" 14 | 15 | "github.com/natefinch/atomic" 16 | "golang.org/x/net/idna" 17 | ) 18 | 19 | const ( 20 | idnaTestVectorsURLPattern = "https://www.unicode.org/Public/idna/%s/IdnaTestV2.txt" 21 | idnaTestVectorsPath = "testdata/idna_test_vectors.txt" 22 | ) 23 | 24 | func main() { 25 | // New releases of Unicode can alter the outcome of existing 26 | // tests, so it's very important to use the test vectors for the 27 | // specific version of Unicode that x/net/idna uses. 
28 | url := fmt.Sprintf(idnaTestVectorsURLPattern, idna.UnicodeVersion) 29 | 30 | resp, err := http.Get(url) 31 | if err != nil { 32 | log.Fatal(err) 33 | } else if resp.StatusCode != http.StatusOK { 34 | log.Fatalf("Fetching %q: %s", url, resp.Status) 35 | } 36 | defer resp.Body.Close() 37 | 38 | if err := atomic.WriteFile(idnaTestVectorsPath, resp.Body); err != nil { 39 | log.Fatalf("Writing %q: %v", idnaTestVectorsPath, err) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/test_psl.js: -------------------------------------------------------------------------------- 1 | var etld = Cc["@mozilla.org/network/effective-tld-service;1"] 2 | .getService(Ci.nsIEffectiveTLDService); 3 | 4 | var idna = Cc["@mozilla.org/network/idn-service;1"] 5 | .getService(Ci.nsIIDNService); 6 | 7 | var Cr = Components.results; 8 | 9 | function run_test() 10 | { 11 | var file = do_get_file("data/test_psl.txt"); 12 | var ios = Cc["@mozilla.org/network/io-service;1"] 13 | .getService(Ci.nsIIOService); 14 | var uri = ios.newFileURI(file); 15 | var scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"] 16 | .getService(Ci.mozIJSSubScriptLoader); 17 | var srvScope = {}; 18 | scriptLoader.loadSubScript(uri.spec, srvScope, "utf-8"); 19 | } 20 | 21 | function checkPublicSuffix(host, expectedSuffix) 22 | { 23 | var actualSuffix = null; 24 | try { 25 | actualSuffix = etld.getBaseDomainFromHost(host); 26 | } catch (e if e.result == Cr.NS_ERROR_INSUFFICIENT_DOMAIN_LEVELS || 27 | e.result == Cr.NS_ERROR_ILLEGAL_VALUE) { 28 | } 29 | // The EffectiveTLDService always gives back punycoded labels. 30 | // The test suite wants to get back what it put in. 31 | if (actualSuffix !== null && expectedSuffix !== null && 32 | /(^|\.)xn--/.test(actualSuffix) && !/(^|\.)xn--/.test(expectedSuffix)) { 33 | actualSuffix = idna.convertACEtoUTF8(actualSuffix); 34 | } 35 | do_check_eq(actualSuffix, expectedSuffix); 36 | } 37 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | Dir = $(PWD) 2 | Options = --with-psl-file=$(Dir)/public_suffix_list.dat --with-psl-testfile=$(Dir)/tests/tests.txt 3 | 4 | all: test 5 | 6 | test: test-syntax test-rules 7 | 8 | test-rules: libpsl-libicu 9 | 10 | test-syntax: 11 | @ \ 12 | cd linter; \ 13 | ./pslint_selftest.sh; \ 14 | ./pslint.py ../public_suffix_list.dat; 15 | 16 | libpsl-config: 17 | @ \ 18 | test -d libpsl || git clone --depth=1 https://github.com/rockdaboot/libpsl; \ 19 | cd libpsl; \ 20 | git pull; \ 21 | echo "EXTRA_DIST =" > gtk-doc.make; \ 22 | echo "CLEANFILES =" >> gtk-doc.make; \ 23 | autoreconf --install --force --symlink; 24 | 25 | # Test PSL data with libicu (IDNA2008 UTS#46) 26 | libpsl-libicu: libpsl-config 27 | cd libpsl && ./configure -q -C --enable-runtime=libicu --enable-builtin=libicu $(Options) && make -s clean && make -s check -j4 28 | 29 | # Test PSL data with libidn2 (IDNA2008) 30 | libpsl-libidn2: libpsl-config 31 | cd libpsl && ./configure -q -C --enable-runtime=libidn2 --enable-builtin=libidn2 $(Options) && make -s clean && make -s check -j4 32 | 33 | # Test PSL data with libidn (IDNA2003) 34 | libpsl-libidn: libpsl-config 35 | cd libpsl && ./configure -q -C --enable-runtime=libidn --enable-builtin=libidn $(Options) && make -s clean && make -s check -j4 -------------------------------------------------------------------------------- /.github/workflows/tld-update.yml:
-------------------------------------------------------------------------------- 1 | name: tld-update 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | # Run once a day at 15:00 UTC 6 | - cron: '0 15 * * *' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | psl-gtld-update: 13 | name: Check for TLD data updates 14 | runs-on: ubuntu-latest 15 | permissions: 16 | contents: write 17 | pull-requests: write 18 | steps: 19 | 20 | - name: Check out code 21 | uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0 25 | with: 26 | go-version: 'stable' 27 | 28 | - name: Run unit tests 29 | run: go test -C ./tools -v . 30 | 31 | - name: Set current date 32 | id: get-date 33 | run: echo "NOW=$(date +'%Y-%m-%dT%H:%M:%S %Z')" >> $GITHUB_OUTPUT 34 | 35 | - name: Run patchnewgtlds 36 | run: tools/patchnewgtlds 37 | 38 | - name: Create pull-request 39 | id: cpr 40 | uses: peter-evans/create-pull-request@38e0b6e68b4c852a5500a94740f0e535e0d7ba54 # v4.2.4 41 | with: 42 | commit-message: "util: gTLD data autopull updates for ${{ steps.get-date.outputs.NOW }}" 43 | title: "util: gTLD autopull updates for ${{ steps.get-date.outputs.now }}" 44 | body: "Public suffix list gTLD data updates from `tools/patchnewgtlds` for ${{ steps.get-date.outputs.now }}." 45 | committer: "GitHub " 46 | author: "GitHub " 47 | branch: psl-gtld-update 48 | labels: | 49 | ✅ autopull 50 | 🚩ICANN (IANA/ICP-3) Section 51 | delete-branch: true 52 | 53 | - name: Check outputs 54 | run: | 55 | echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" 56 | echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" 57 | -------------------------------------------------------------------------------- /tests/test_bug414122.js: -------------------------------------------------------------------------------- 1 | const PR_RDONLY = 0x1; 2 | 3 | var etld = Cc["@mozilla.org/network/effective-tld-service;1"] 4 | .getService(Ci.nsIEffectiveTLDService); 5 | var idn = Cc["@mozilla.org/network/idn-service;1"] 6 | .getService(Ci.nsIIDNService); 7 | 8 | function run_test() 9 | { 10 | var fis = Cc["@mozilla.org/network/file-input-stream;1"] 11 | .createInstance(Ci.nsIFileInputStream); 12 | fis.init(do_get_file("effective_tld_names.dat"), 13 | PR_RDONLY, 0444, Ci.nsIFileInputStream.CLOSE_ON_EOF); 14 | 15 | var lis = Cc["@mozilla.org/intl/converter-input-stream;1"] 16 | .createInstance(Ci.nsIConverterInputStream); 17 | lis.init(fis, "UTF-8", 1024, 0); 18 | lis.QueryInterface(Ci.nsIUnicharLineInputStream); 19 | 20 | var out = { value: "" }; 21 | do 22 | { 23 | var more = lis.readLine(out); 24 | var line = out.value; 25 | 26 | line = line.replace(/^\s+/, ""); 27 | var firstTwo = line.substring(0, 2); // a misnomer, but whatever 28 | if (firstTwo == "" || firstTwo == "//") 29 | continue; 30 | 31 | var space = line.search(/[ \t]/); 32 | line = line.substring(0, space == -1 ? line.length : space); 33 | 34 | if ("*." == firstTwo) 35 | { 36 | let rest = line.substring(2); 37 | checkPublicSuffix("foo.SUPER-SPECIAL-AWESOME-PREFIX." + rest, 38 | "SUPER-SPECIAL-AWESOME-PREFIX." + rest); 39 | } 40 | else if ("!" == line.charAt(0)) 41 | { 42 | checkPublicSuffix(line.substring(1), 43 | line.substring(line.indexOf(".") + 1)); 44 | } 45 | else 46 | { 47 | checkPublicSuffix("SUPER-SPECIAL-AWESOME-PREFIX." 
+ line, line); 48 | } 49 | } 50 | while (more); 51 | } 52 | 53 | function checkPublicSuffix(host, expectedSuffix) 54 | { 55 | expectedSuffix = idn.convertUTF8toACE(expectedSuffix).toLowerCase(); 56 | var actualSuffix = etld.getPublicSuffixFromHost(host); 57 | do_check_eq(actualSuffix, expectedSuffix); 58 | } 59 | -------------------------------------------------------------------------------- /tools/go.sum: -------------------------------------------------------------------------------- 1 | github.com/creachadair/command v0.1.13 h1:UDKPF3QYPRS/quZPVYZ7sW1JLxLLOgiyVSLQ+7wwI2o= 2 | github.com/creachadair/command v0.1.13/go.mod h1:YKwUE49nAi8qxLl8jCQ0GMPvwdxmIBkJW3LqxgZ7ljk= 3 | github.com/creachadair/flax v0.0.0-20240525192034-44db93b3a8ad h1:Fv6FRWgCJTHsslL0qRhhO7Jj7cL78YW8s1c8UxFGIIo= 4 | github.com/creachadair/flax v0.0.0-20240525192034-44db93b3a8ad/go.mod h1:K8bFvn8hMdAljQkaKNc7I3os5Wk36JxkyCkfdZ7S8d4= 5 | github.com/creachadair/mds v0.15.2 h1:es1qGKgRGSaztpvrSQcZ0B9I6NsHYJ1Sa9naD/3OfCM= 6 | github.com/creachadair/mds v0.15.2/go.mod h1:4vrFYUzTXMJpMBU+OA292I6IUxKWCCfZkgXg+/kBZMo= 7 | github.com/creachadair/taskgroup v0.9.0 h1:kzXSea5C7R5DtnKFBOTEW3hvmCkiVnRkODMVDMgSS6k= 8 | github.com/creachadair/taskgroup v0.9.0/go.mod h1:+1hJc8zL1rQkxcMVqEYJ0UPGtwl6Iz1+fd4zcOLtt+A= 9 | github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= 10 | github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= 11 | github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 12 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 13 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 14 | github.com/google/go-github/v63 v63.0.0 h1:13xwK/wk9alSokujB9lJkuzdmQuVn2QCPeck76wR3nE= 15 | github.com/google/go-github/v63 v63.0.0/go.mod h1:IqbcrgUmIcEaioWrGYei/09o+ge5vhffGOcxrO0AfmA= 16 | github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= 17 | github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= 18 | github.com/natefinch/atomic v1.0.1 h1:ZPYKxkqQOx3KZ+RsbnP/YsgvxWQPGxjC0oBt2AhwV0A= 19 | github.com/natefinch/atomic v1.0.1/go.mod h1:N/D/ELrljoqDyT3rZrsUmtsuzvHkeB/wWjHV22AZRbM= 20 | golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= 21 | golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= 22 | golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= 23 | golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= 24 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 25 | -------------------------------------------------------------------------------- /tests/tests.txt: -------------------------------------------------------------------------------- 1 | // Any copyright is dedicated to the Public Domain. 2 | // https://creativecommons.org/publicdomain/zero/1.0/ 3 | 4 | // null input. 5 | null null 6 | // Mixed case. 7 | COM null 8 | example.COM example.com 9 | WwW.example.COM example.com 10 | // Leading dot. 11 | .com null 12 | .example null 13 | .example.com null 14 | .example.example null 15 | // Unlisted TLD. 16 | example null 17 | example.example example.example 18 | b.example.example example.example 19 | a.b.example.example example.example 20 | // Listed, but non-Internet, TLD. 
21 | //local null 22 | //example.local null 23 | //b.example.local null 24 | //a.b.example.local null 25 | // TLD with only 1 rule. 26 | biz null 27 | domain.biz domain.biz 28 | b.domain.biz domain.biz 29 | a.b.domain.biz domain.biz 30 | // TLD with some 2-level rules. 31 | com null 32 | example.com example.com 33 | b.example.com example.com 34 | a.b.example.com example.com 35 | uk.com null 36 | example.uk.com example.uk.com 37 | b.example.uk.com example.uk.com 38 | a.b.example.uk.com example.uk.com 39 | test.ac test.ac 40 | // TLD with only 1 (wildcard) rule. 41 | mm null 42 | c.mm null 43 | b.c.mm b.c.mm 44 | a.b.c.mm b.c.mm 45 | // More complex TLD. 46 | jp null 47 | test.jp test.jp 48 | www.test.jp test.jp 49 | ac.jp null 50 | test.ac.jp test.ac.jp 51 | www.test.ac.jp test.ac.jp 52 | kyoto.jp null 53 | test.kyoto.jp test.kyoto.jp 54 | ide.kyoto.jp null 55 | b.ide.kyoto.jp b.ide.kyoto.jp 56 | a.b.ide.kyoto.jp b.ide.kyoto.jp 57 | c.kobe.jp null 58 | b.c.kobe.jp b.c.kobe.jp 59 | a.b.c.kobe.jp b.c.kobe.jp 60 | city.kobe.jp city.kobe.jp 61 | www.city.kobe.jp city.kobe.jp 62 | // TLD with a wildcard rule and exceptions. 63 | ck null 64 | test.ck null 65 | b.test.ck b.test.ck 66 | a.b.test.ck b.test.ck 67 | www.ck www.ck 68 | www.www.ck www.ck 69 | // US K12. 70 | us null 71 | test.us test.us 72 | www.test.us test.us 73 | ak.us null 74 | test.ak.us test.ak.us 75 | www.test.ak.us test.ak.us 76 | k12.ak.us null 77 | test.k12.ak.us test.k12.ak.us 78 | www.test.k12.ak.us test.k12.ak.us 79 | // IDN labels. 80 | 食狮.com.cn 食狮.com.cn 81 | 食狮.公司.cn 食狮.公司.cn 82 | www.食狮.公司.cn 食狮.公司.cn 83 | shishi.公司.cn shishi.公司.cn 84 | 公司.cn null 85 | 食狮.中国 食狮.中国 86 | www.食狮.中国 食狮.中国 87 | shishi.中国 shishi.中国 88 | 中国 null 89 | // Same as above, but punycoded. 90 | xn--85x722f.com.cn xn--85x722f.com.cn 91 | xn--85x722f.xn--55qx5d.cn xn--85x722f.xn--55qx5d.cn 92 | www.xn--85x722f.xn--55qx5d.cn xn--85x722f.xn--55qx5d.cn 93 | shishi.xn--55qx5d.cn shishi.xn--55qx5d.cn 94 | xn--55qx5d.cn null 95 | xn--85x722f.xn--fiqs8s xn--85x722f.xn--fiqs8s 96 | www.xn--85x722f.xn--fiqs8s xn--85x722f.xn--fiqs8s 97 | shishi.xn--fiqs8s shishi.xn--fiqs8s 98 | xn--fiqs8s null 99 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Submitting Amendments 2 | 3 | Before submitting any change to the list, please make sure to read the [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines). 4 | 5 | A properly formatted and validated patch will decrease the review time, and increase the chances your request will be reviewed and perhaps accepted. Any patch that doesn't follow the Guidelines will be rejected or, in the best scenario, left pending for follow-up. 
6 | 7 | The most common time loss comes from not following the sorting guidelines: 8 | - Sorting / Placement needs to comply with the [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines) 9 | - *PLEASE* order within the existing entries in the PRIVATE DOMAINS section so that the organization listed on your first comment line is alphabetically sorted 10 | - Do NOT append your PRIVATE DOMAINS entry to the end of the file 11 | - If there is more than one domain within your PR, order your entries alphabetically, ascending by TLD, then SLD, then 3LD and deeper (if present) 12 | 13 | Other common mistakes that may cause the request to be rejected include: 14 | 15 | - Invalid patch formatting, rule sorting or changeset position (see [Wiki:Formatting](https://github.com/publicsuffix/list/wiki/Format)) 16 | - Missing validation records 17 | - Lack of proper domain ownership, or expiry dates less than 2 years away 18 | - Attempts to work around vendor limits (see [#1245](https://github.com/publicsuffix/list/issues/1245) as an example) 19 | - Submissions with TLDs non-compliant with [ICP-3](https://www.icann.org/resources/pages/unique-authoritative-root-2012-02-25-en) or on the [ICANN PSL](https://github.com/publicsuffix/list/wiki/Security-Considerations#icann-public-suffix-list) 20 | - Insufficient or incomplete rationale (be verbose!) 21 | - Smaller, private projects with <2000 stakeholders 22 | 23 | Frequently, PR submissions overlook the sort ordering guidelines, which delays the processing of all open requests. 24 | 25 | Make sure to review the [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines) before you open a new pull request. 26 | 27 | Please also note that there is no guarantee of inclusion, nor are we able to provide an ETA for any inclusion request. This is also true of projects that incorporate the PSL downstream. This is described, outlined and diagrammed [here]( 28 | https://github.com/publicsuffix/list/wiki/Guidelines#appropriate-expectations-on-derivative-propagation-use-or-inclusion). 29 | 30 | Before you attempt to make a contribution or comment, please read the [Mozilla Community Participation Guidelines](https://www.mozilla.org/en-US/about/governance/policies/participation/). 31 | 32 | ## PSL Mailing List 33 | 34 | We suggest that submitters and users/integrators of the PSL join the (low-traffic) mailing list to stay aware of changes to structure, processes or formatting. 35 | 36 | Some future changes may include automated DNS tests for the presence of `_PSL` records on `#PRIVATE` section entries (to confirm ongoing inclusion, or to remove entries that do not have them), as well as possible file structure or other changes.
The "list list" is located [HERE](https://groups.google.com/g/publicsuffix-discuss) 37 | -------------------------------------------------------------------------------- /tools/private_domains_checker/TestPSLPrivateDomainsProcessor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import uuid 3 | 4 | from PSLPrivateDomainsProcessor import PSLPrivateDomainsProcessor, check_dns_status, get_whois_data, check_psl_txt_record 5 | 6 | 7 | class TestPSLPrivateDomainsProcessor(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.processor = PSLPrivateDomainsProcessor() 11 | # Populate icann_domains for testing 12 | self.processor.icann_domains = { 13 | "com", "co.uk", "ac.uk", "net", "org" 14 | } 15 | 16 | def test_parse_domain_icann_domain(self): 17 | # Test cases where domains should be parsed correctly 18 | test_cases = [ 19 | ("*.example.com", "example.com"), 20 | ("sub.example.com", "example.com"), 21 | ("*.sub.example.com", "example.com"), 22 | ("example.com", "example.com"), 23 | ("example.co.uk", "example.co.uk"), 24 | ("sub.example.co.uk", "example.co.uk"), 25 | ("*.example.co.uk", "example.co.uk"), 26 | ("*.sub.example.co.uk", "example.co.uk"), 27 | ("abc.ac.uk", "abc.ac.uk"), 28 | ("a.b.com", "b.com") 29 | ] 30 | 31 | for domain, expected in test_cases: 32 | with self.subTest(domain=domain): 33 | result = self.processor.parse_domain(domain) 34 | self.assertEqual(expected, result) 35 | 36 | def test_parse_domain_no_icann(self): 37 | # Test case where no valid ICANN domain is found 38 | self.processor.icann_domains.remove("com") 39 | with self.assertRaises(ValueError): 40 | self.processor.parse_domain("example.com") 41 | 42 | def test_parse_domain_edge_cases(self): 43 | # Additional edge case testing 44 | self.assertEqual("example.org", self.processor.parse_domain("sub.example.org")) 45 | self.assertEqual("example.com", self.processor.parse_domain("example.com")) 46 | self.assertEqual("example.ac.uk", self.processor.parse_domain("sub.example.ac.uk")) 47 | 48 | def test_parse_domain_invalid(self): 49 | # Test invalid domains which should raise ValueError 50 | invalid_domains = ["invalid.test", "*.invalid.test", "sub.invalid.test"] 51 | for domain in invalid_domains: 52 | with self.subTest(domain=domain): 53 | with self.assertRaises(ValueError): 54 | self.processor.parse_domain(domain) 55 | 56 | def test_check_dns_status(self): 57 | # Test with a known good domain 58 | self.assertEqual("ok", check_dns_status("mozilla.org")) 59 | # Test with a likely non-existent domain 60 | random_domain = "nxdomain-" + str(uuid.uuid4()) + ".edu" 61 | self.assertEqual("NXDOMAIN", check_dns_status(random_domain)) 62 | 63 | def test_check_psl_txt_record(self): 64 | # Test with a known domain having a valid _psl TXT record 65 | self.assertEqual("valid", check_psl_txt_record("cdn.cloudflare.net")) 66 | # Test with a domain without a _psl TXT record 67 | random_domain = "invalid-" + str(uuid.uuid4()) + ".edu" 68 | self.assertEqual("invalid", check_psl_txt_record(random_domain)) 69 | 70 | def test_get_whois_data(self): 71 | whois_data = get_whois_data("example.com") 72 | self.assertEqual("ok", whois_data[2]) 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /tools/internal/parser/file_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "testing" 5 | 6 | 
"github.com/publicsuffix/list/tools/internal/domain" 7 | ) 8 | 9 | func TestPublicSuffix(t *testing.T) { 10 | lst := list( 11 | section(1, 1, "PRIVATE DOMAINS", 12 | suffixes(1, 1, noInfo, 13 | suffix(1, "example.com"), 14 | wildcard(2, 3, "baz.net", "except", "other"), 15 | suffix(4, "com"), 16 | 17 | // Wildcards and exceptions nested inside each 18 | // other. This doesn't appear in the PSL in practice, 19 | // and is implicitly forbidden by the format spec, but 20 | // the parser/validator does not currently reject such 21 | // files, so we want PublicSuffix/RegisteredDomain to 22 | // be well-defined for such inputs. 23 | wildcard(5, 6, "nested.org", "except"), 24 | wildcard(7, 8, "in.except.nested.org", "other-except"), 25 | ), 26 | ), 27 | ) 28 | 29 | tests := []struct { 30 | in string 31 | pubSuffix string 32 | regDomain string 33 | }{ 34 | {"www.example.com", "example.com", "www.example.com"}, 35 | {"www.public.example.com", "example.com", "public.example.com"}, 36 | {"example.com", "example.com", ""}, 37 | 38 | {"www.other.com", "com", "other.com"}, 39 | {"other.com", "com", "other.com"}, 40 | {"com", "com", ""}, 41 | 42 | {"qux.bar.baz.net", "bar.baz.net", "qux.bar.baz.net"}, 43 | {"bar.baz.net", "bar.baz.net", ""}, 44 | {"baz.net", "net", "baz.net"}, // Implicit * rule 45 | {"qux.except.baz.net", "baz.net", "except.baz.net"}, 46 | {"except.baz.net", "baz.net", "except.baz.net"}, 47 | {"other.other.baz.net", "baz.net", "other.baz.net"}, 48 | 49 | // Tests for nested wildcards+exceptions. Does not appear in 50 | // the real PSL, and implicitly disallowed by the format spec, 51 | // but necessary to make PublicSuffix and RegisteredDomain's 52 | // outputs well defined for all inputs. 53 | {"qux.bar.foo.nested.org", "foo.nested.org", "bar.foo.nested.org"}, 54 | {"bar.foo.nested.org", "foo.nested.org", "bar.foo.nested.org"}, 55 | {"foo.nested.org", "foo.nested.org", ""}, 56 | {"nested.org", "org", "nested.org"}, 57 | {"bar.except.nested.org", "nested.org", "except.nested.org"}, 58 | {"except.nested.org", "nested.org", "except.nested.org"}, 59 | {"in.except.nested.org", "nested.org", "except.nested.org"}, 60 | // Matches both nested wildcard and also outer exception, 61 | // outer exception wins. 62 | {"other.in.except.nested.org", "nested.org", "except.nested.org"}, 63 | // Matches both outer and inner exceptions, inner exception 64 | // wins. 
65 | {"qux.other-except.in.except.nested.org", "in.except.nested.org", "other-except.in.except.nested.org"}, 66 | } 67 | 68 | for _, tc := range tests { 69 | in := mustParseDomain(tc.in) 70 | wantSuffix := mustParseDomain(tc.pubSuffix) 71 | 72 | gotSuffix := lst.PublicSuffix(in) 73 | if !gotSuffix.Equal(wantSuffix) { 74 | t.Errorf("PublicSuffix(%q) = %q, want %q", in, gotSuffix, wantSuffix) 75 | } 76 | 77 | gotReg, ok := lst.RegisteredDomain(in) 78 | if ok && tc.regDomain == "" { 79 | t.Errorf("RegisteredDomain(%q) = %q, want none", in, gotReg) 80 | } else if ok { 81 | wantReg := mustParseDomain(tc.regDomain) 82 | if !gotReg.Equal(wantReg) { 83 | t.Errorf("RegisteredDomain(%q) = %q, want %q", in, gotReg, wantReg) 84 | } 85 | } 86 | } 87 | } 88 | 89 | func mustParseDomain(s string) domain.Name { 90 | d, err := domain.Parse(s) 91 | if err != nil { 92 | panic(err) 93 | } 94 | return d 95 | } 96 | -------------------------------------------------------------------------------- /tests/prepare_tlds.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import codecs 6 | import encodings.idna 7 | import re 8 | import sys 9 | 10 | """ 11 | Processes a file containing effective TLD data. See the following URL for a 12 | description of effective TLDs and of the file format that this script 13 | processes (although for the latter you're better off just reading this file's 14 | short source code). 15 | 16 | https://wiki.mozilla.org/Gecko:Effective_TLD_Service 17 | """ 18 | 19 | def getEffectiveTLDs(path): 20 | file = codecs.open(path, "r", "UTF-8") 21 | domains = set() 22 | while True: 23 | line = file.readline() 24 | # line always contains a line terminator unless the file is empty 25 | if len(line) == 0: 26 | raise StopIteration 27 | line = line.rstrip() 28 | # comment, empty, or superfluous line for explicitness purposes 29 | if line.startswith("//") or "." not in line: 30 | continue 31 | line = re.split(r"[ \t\n]", line, 1)[0] 32 | entry = EffectiveTLDEntry(line) 33 | domain = entry.domain() 34 | assert domain not in domains, \ 35 | "repeating domain %s makes no sense" % domain 36 | domains.add(domain) 37 | yield entry 38 | 39 | def _normalizeHostname(domain): 40 | """ 41 | Normalizes the given domain, component by component. ASCII components are 42 | lowercased, while non-ASCII components are processed using the ToASCII 43 | algorithm. 44 | """ 45 | def convertLabel(label): 46 | if _isASCII(label): 47 | return label.lower() 48 | return encodings.idna.ToASCII(label) 49 | return ".".join(map(convertLabel, domain.split("."))) 50 | 51 | def _isASCII(s): 52 | "True if s consists entirely of ASCII characters, false otherwise." 53 | for c in s: 54 | if ord(c) > 127: 55 | return False 56 | return True 57 | 58 | class EffectiveTLDEntry: 59 | """ 60 | Stores an entry in an effective-TLD name file. 61 | """ 62 | 63 | _exception = False 64 | _wild = False 65 | 66 | def __init__(self, line): 67 | """ 68 | Creates a TLD entry from a line of data, which must have been stripped of 69 | the line ending. 
70 | """ 71 | if line.startswith("!"): 72 | self._exception = True 73 | domain = line[1:] 74 | elif line.startswith("*."): 75 | self._wild = True 76 | domain = line[2:] 77 | else: 78 | domain = line 79 | self._domain = _normalizeHostname(domain) 80 | 81 | def domain(self): 82 | "The domain this represents." 83 | return self._domain 84 | 85 | def exception(self): 86 | "True if this entry's domain denotes does not denote an effective TLD." 87 | return self._exception 88 | 89 | def wild(self): 90 | "True if this entry represents a class of effective TLDs." 91 | return self._wild 92 | 93 | 94 | ################# 95 | # DO EVERYTHING # 96 | ################# 97 | 98 | def main(output, effective_tld_filename): 99 | """ 100 | effective_tld_filename is the effective TLD file to parse. 101 | A C++ array of { domain, exception, wild } entries representing the 102 | eTLD file is then printed to output. 103 | """ 104 | 105 | def boolStr(b): 106 | if b: 107 | return "true" 108 | return "false" 109 | 110 | for etld in getEffectiveTLDs(effective_tld_filename): 111 | exception = boolStr(etld.exception()) 112 | wild = boolStr(etld.wild()) 113 | output.write('ETLD_ENTRY("%s", %s, %s)\n' % (etld.domain(), exception, wild)) 114 | 115 | if __name__ == '__main__': 116 | main(sys.stdout, sys.argv[1]) 117 | -------------------------------------------------------------------------------- /tools/internal/parser/write.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strconv" 8 | "strings" 9 | ) 10 | 11 | // MarshalPSL returns the list serialized to standard PSL text format. 12 | func (l *List) MarshalPSL() []byte { 13 | var ret bytes.Buffer 14 | writeBlockPSL(&ret, l) 15 | return ret.Bytes() 16 | } 17 | 18 | func writeBlockPSL(w io.Writer, b Block) { 19 | f := func(msg string, args ...any) { 20 | fmt.Fprintf(w, msg+"\n", args...) 21 | } 22 | 23 | switch v := b.(type) { 24 | case *List: 25 | for i, child := range v.Blocks { 26 | if i > 0 { 27 | f("") 28 | } 29 | writeBlockPSL(w, child) 30 | } 31 | case *Section: 32 | f("// ===BEGIN %s===", v.Name) 33 | for _, child := range v.Blocks { 34 | f("") 35 | writeBlockPSL(w, child) 36 | } 37 | f("") 38 | f("// ===END %s===", v.Name) 39 | case *Suffixes: 40 | for _, child := range v.Blocks { 41 | writeBlockPSL(w, child) 42 | } 43 | case *Suffix: 44 | f("%s", v.Domain) 45 | case *Wildcard: 46 | base := v.Domain 47 | f("*.%s", base) 48 | for _, exc := range v.Exceptions { 49 | f("!%s.%s", exc, base) 50 | } 51 | case *Comment: 52 | for _, line := range v.Text { 53 | f("// %s", line) 54 | } 55 | default: 56 | panic("unknown ast node") 57 | } 58 | } 59 | 60 | // MarhsalDebug returns the list serialized to a verbose debugging 61 | // format. This format is private to this package and for development 62 | // use only. The format may change drastically without notice. 63 | func (l *List) MarshalDebug() []byte { 64 | var ret bytes.Buffer 65 | writeBlockDebug(&ret, l, "") 66 | return ret.Bytes() 67 | } 68 | 69 | func writeBlockDebug(w io.Writer, b Block, indent string) { 70 | changemark := "" 71 | if b.Changed() { 72 | changemark = "!!" 73 | } 74 | f := func(msg string, args ...any) { 75 | fmt.Fprintf(w, indent+msg+"\n", args...) 
76 | } 77 | 78 | src := b.SrcRange() 79 | loc := fmt.Sprintf("%d-%d", src.FirstLine, src.LastLine) 80 | if src.FirstLine+1 == src.LastLine { 81 | loc = strconv.Itoa(src.FirstLine) 82 | } 83 | 84 | const extraIndent = " " 85 | nextIndent := indent + extraIndent 86 | 87 | switch v := b.(type) { 88 | case *List: 89 | f("%sList(%s) {", changemark, loc) 90 | for _, child := range v.Blocks { 91 | writeBlockDebug(w, child, nextIndent) 92 | } 93 | f("} // List") 94 | case *Section: 95 | f("%sSection(%s, name=%q) {", changemark, loc, v.Name) 96 | for _, child := range v.Blocks { 97 | writeBlockDebug(w, child, nextIndent) 98 | } 99 | f("} // Section(name=%q)", v.Name) 100 | case *Suffixes: 101 | items := []string{loc, fmt.Sprintf("editable=%v", v.Info.MachineEditable)} 102 | if v.Info.Name != "" { 103 | items = append(items, fmt.Sprintf("name=%q", v.Info.Name)) 104 | } 105 | for _, u := range v.Info.URLs { 106 | items = append(items, fmt.Sprintf("url=%q", u)) 107 | } 108 | for _, e := range v.Info.Maintainers { 109 | email := strings.TrimSpace(fmt.Sprintf("%s <%s>", e.Name, e.Address)) 110 | items = append(items, fmt.Sprintf("contact=%q", email)) 111 | } 112 | for _, o := range v.Info.Other { 113 | items = append(items, fmt.Sprintf("other=%q", o)) 114 | } 115 | 116 | const open = "SuffixBlock(" 117 | pad := strings.Repeat(" ", len(open)) 118 | f("%s%s%s) {", changemark, open, strings.Join(items, fmt.Sprintf(",\n%s%s", indent, pad))) 119 | for _, child := range v.Blocks { 120 | writeBlockDebug(w, child, nextIndent) 121 | } 122 | f("} // SuffixBlock(name=%q)", v.Info.Name) 123 | case *Suffix: 124 | f("%sSuffix(%s, %q)", changemark, loc, v.Domain) 125 | case *Wildcard: 126 | w := fmt.Sprintf("*.%s", v.Domain) 127 | if len(v.Exceptions) > 0 { 128 | f("%sWildcard(%s, %q, except=%v)", changemark, loc, w, v.Exceptions) 129 | } else { 130 | f("%sWildcard(%s, %q)", changemark, loc, w) 131 | } 132 | case *Comment: 133 | f("%sComment(%s) {", changemark, loc) 134 | for _, line := range v.Text { 135 | f("%s%s", extraIndent, line) 136 | } 137 | f("}") 138 | default: 139 | panic("unknown ast node") 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /tools/internal/parser/unicode.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "bytes" 5 | "sync" 6 | 7 | "golang.org/x/text/collate" 8 | "golang.org/x/text/language" 9 | ) 10 | 11 | // How do you sort strings? The answer is surprisingly complex. 12 | // 13 | // "Collation" is the technical term for putting things in a specific 14 | // order. For strings of human text, there is no universal agreement 15 | // on what order is "correct". 16 | // 17 | // Different languages have different sorting conventions: in English 18 | // Ä is an accented A and comes before B, but in Swedish Ä is the 28th 19 | // letter of the alphabet and comes after Z. 20 | // 21 | // A single language also sorts differently sometimes: a phonebook 22 | // written in the German language is in a slightly different order in 23 | // Germany vs. Austria. Or even within a single country: in Germany, a 24 | // list of names can be in "standard" order, or it can be in 25 | // "phonebook" order, with different choices for ä, ö and ü. 26 | // 27 | // Finally, there are style choices available that are considered 28 | // equally valid, depending on the application. 
A common example is 29 | // "numeric sort", which orders numbers inside strings according to 30 | // mathematics: "3" > "24" in "standard" lexicographic order, but if a 31 | // collation uses numeric sort, "3" < "24". 32 | // 33 | // Whitespace and punctuation are another example of a style choice: 34 | // in some applications they participate in the ordering, and in 35 | // others they are ignored and only "real" letters determine the 36 | // order. 37 | // 38 | // Fortunately, the Unicode Consortium has simplified all this for us: 39 | // there is a single universal Unicode Collation Algorithm 40 | // (http://www.unicode.org/reports/tr10/) that handles all of this 41 | // complexity. We just have to tell it which 42 | // language/dialect/country/style we want to use, and now we can 43 | // compare strings. 44 | // 45 | // For non-suffix text, the PSL uses the "basic" English 46 | // collation. Specifically, we use the collation defined in the 47 | // Unicode CLDR (Common Locale Data Repository, 48 | // https://cldr.unicode.org/), described by the BCP 47 language tag 49 | // "en": "global" English, with no country or dialect modifications, 50 | // and "default" style choices for English: ordering is 51 | // case-sensitive, whitespace-sensitive and punctuation-sensitive, and 52 | // numbers are compared in lexicographic order, not numeric order. 53 | 54 | // compareCommentText compares the strings of comment text a and b, 55 | // using the PSL's chosen collation. It returns -1 if a < b, +1 if a > 56 | // b, or 0 if a == b. 57 | // 58 | // This function MUST NOT be used to compare domain name or DNS label 59 | // strings. For that, use domain.Name.Compare or domain.Label.Compare. 60 | func compareCommentText(a string, b string) int { 61 | // golang.org/x/text/collate has a few bugs, and in particular the 62 | // "CompareString" method uses a special "incremental collation" 63 | // codepath that sometimes returns incorrect results (see 64 | // https://github.com/golang/go/issues/68166). 65 | // 66 | // To be safe, we instead use the "slower" (still pretty fast) 67 | // codepath: we explicitly convert the strings into the 68 | // corresponding "sort keys", and then bytes.Compare those. There 69 | // are more exhaustive tests for sort key computation, so there is 70 | // higher confidence that it works correctly. 71 | // 72 | // Unfortunately individual collators are also not safe for 73 | // concurrent use. Wrap them in a global mutex. We could also 74 | // construct a new collator for each use, but that ends up being 75 | // more expensive and less performant than sharing one collator 76 | // with a mutex. 77 | commentCollatorMu.Lock() 78 | defer commentCollatorMu.Unlock() 79 | var buf collate.Buffer 80 | ka := commentCollator.KeyFromString(&buf, a) 81 | kb := commentCollator.KeyFromString(&buf, b) 82 | return bytes.Compare(ka, kb) 83 | } 84 | 85 | // commentCollator compares strings in the PSL's chosen collation for 86 | // non-suffix text. See the comment at the start of this file for more 87 | // details.
88 | var commentCollator = collate.New(language.MustParse("en")) 89 | var commentCollatorMu sync.Mutex 90 | -------------------------------------------------------------------------------- /tools/internal/parser/write_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | ) 7 | 8 | func TestMarshalPSL(t *testing.T) { 9 | tests := []struct { 10 | name string 11 | in *List 12 | want []byte 13 | }{ 14 | { 15 | name: "empty", 16 | in: list(), 17 | want: byteLines(""), 18 | }, 19 | 20 | { 21 | name: "comments_and_empty_sections", 22 | in: list( 23 | comment(0, "This is a two", "line comment"), 24 | comment(0, "Another separate comment"), 25 | section(0, 0, "ICANN DOMAINS", 26 | comment(0, "Inside icann domains"), 27 | ), 28 | comment(0, "Between sections"), 29 | section(0, 0, "PRIVATE DOMAINS", 30 | comment(0, "Private domains here"), 31 | comment(0, "More private domains"), 32 | ), 33 | ), 34 | want: byteLines( 35 | "// This is a two", 36 | "// line comment", 37 | "", 38 | "// Another separate comment", 39 | "", 40 | "// ===BEGIN ICANN DOMAINS===", 41 | "", 42 | "// Inside icann domains", 43 | "", 44 | "// ===END ICANN DOMAINS===", 45 | "", 46 | "// Between sections", 47 | "", 48 | "// ===BEGIN PRIVATE DOMAINS===", 49 | "", 50 | "// Private domains here", 51 | "", 52 | "// More private domains", 53 | "", 54 | "// ===END PRIVATE DOMAINS===", 55 | "", 56 | ), 57 | }, 58 | 59 | { 60 | name: "some_suffixes", 61 | in: list( 62 | comment(1, "Test list"), 63 | section(2, 2, "ICANN DOMAINS", 64 | suffixes(1, 1, noInfo, 65 | suffix(1, "aaa"), 66 | suffix(2, "bbb"), 67 | wildcard(3, 3, "ccc", "d", "e", "f"), 68 | ), 69 | suffixes(2, 2, noInfo, 70 | suffix(1, "xxx"), 71 | suffix(2, "yyy"), 72 | suffix(3, "zzz"), 73 | ), 74 | ), 75 | ), 76 | want: byteLines( 77 | "// Test list", 78 | "", 79 | "// ===BEGIN ICANN DOMAINS===", 80 | "", 81 | "aaa", 82 | "bbb", 83 | "*.ccc", 84 | "!d.ccc", 85 | "!e.ccc", 86 | "!f.ccc", 87 | "", 88 | "xxx", 89 | "yyy", 90 | "zzz", 91 | "", 92 | "// ===END ICANN DOMAINS===", 93 | "", 94 | ), 95 | }, 96 | } 97 | 98 | for _, tc := range tests { 99 | t.Run(tc.name, func(t *testing.T) { 100 | got := tc.in.MarshalPSL() 101 | checkDiff(t, "MarshalPSL output", got, tc.want) 102 | 103 | // Does the marshaled output parse? 104 | in2, errs := Parse(got) 105 | if len(errs) > 0 { 106 | t.Logf("failed to parse MarshalPSL output:") 107 | for _, err := range errs { 108 | t.Error(err) 109 | } 110 | t.FailNow() 111 | } 112 | 113 | // Parse result should be identical to the original, 114 | // modulo source ranges.
115 | zeroSourceRange(tc.in) 116 | zeroSourceRange(in2) 117 | checkDiff(t, "MarshalPSL then Parse", in2, tc.in) 118 | if t.Failed() { 119 | t.FailNow() 120 | } 121 | }) 122 | } 123 | } 124 | 125 | func TestRoundtripRealPSL(t *testing.T) { 126 | bs, err := os.ReadFile("../../../public_suffix_list.dat") 127 | if err != nil { 128 | t.Fatal(err) 129 | } 130 | 131 | psl, errs := Parse(bs) 132 | if len(errs) > 0 { 133 | t.Logf("PSL parse failed, skipping round-trip test:") 134 | for _, err := range errs { 135 | t.Error(err) 136 | } 137 | t.FailNow() 138 | } 139 | 140 | suffixCnt1 := len(BlocksOfType[*Suffix](psl)) 141 | wildCnt1 := len(BlocksOfType[*Wildcard](psl)) 142 | if got, wantMin := suffixCnt1, 1000; got < wantMin { 143 | t.Fatalf("PSL doesn't have enough suffixes, got %d want at least %d", got, wantMin) 144 | } 145 | if got, wantMin := wildCnt1, 2; got < wantMin { 146 | t.Fatalf("PSL doesn't have enough wildcards, got %d want at least %d", got, wantMin) 147 | } 148 | 149 | bs2 := psl.MarshalPSL() 150 | psl2, errs := Parse(bs2) 151 | if len(errs) > 0 { 152 | t.Logf("PSL parse after MarshalPSL failed:") 153 | for _, err := range errs { 154 | t.Error(err) 155 | } 156 | t.FailNow() 157 | } 158 | 159 | suffixCnt2 := len(BlocksOfType[*Suffix](psl2)) 160 | wildCnt2 := len(BlocksOfType[*Wildcard](psl2)) 161 | if got, want := suffixCnt2, suffixCnt1; got != want { 162 | t.Errorf("MarshalPSL changed suffix count, got %d want %d", got, want) 163 | } 164 | if got, want := wildCnt2, wildCnt1; got != want { 165 | t.Errorf("MarshalPSL changed wildcard count, got %d want %d", got, want) 166 | } 167 | 168 | zeroSourceRange(psl) 169 | zeroSourceRange(psl2) 170 | checkDiff(t, "PSL roundtrip through MarshalPSL", psl2, psl) 171 | } 172 | -------------------------------------------------------------------------------- /tests/test_psl.txt: -------------------------------------------------------------------------------- 1 | // Any copyright is dedicated to the Public Domain. 2 | // https://creativecommons.org/publicdomain/zero/1.0/ 3 | 4 | // null input. 5 | checkPublicSuffix(null, null); 6 | // Mixed case. 7 | checkPublicSuffix('COM', null); 8 | checkPublicSuffix('example.COM', 'example.com'); 9 | checkPublicSuffix('WwW.example.COM', 'example.com'); 10 | // Leading dot. 11 | checkPublicSuffix('.com', null); 12 | checkPublicSuffix('.example', null); 13 | checkPublicSuffix('.example.com', null); 14 | checkPublicSuffix('.example.example', null); 15 | // Unlisted TLD. 16 | checkPublicSuffix('example', null); 17 | checkPublicSuffix('example.example', 'example.example'); 18 | checkPublicSuffix('b.example.example', 'example.example'); 19 | checkPublicSuffix('a.b.example.example', 'example.example'); 20 | // Listed, but non-Internet, TLD. 21 | //checkPublicSuffix('local', null); 22 | //checkPublicSuffix('example.local', null); 23 | //checkPublicSuffix('b.example.local', null); 24 | //checkPublicSuffix('a.b.example.local', null); 25 | // TLD with only 1 rule. 26 | checkPublicSuffix('biz', null); 27 | checkPublicSuffix('domain.biz', 'domain.biz'); 28 | checkPublicSuffix('b.domain.biz', 'domain.biz'); 29 | checkPublicSuffix('a.b.domain.biz', 'domain.biz'); 30 | // TLD with some 2-level rules. 
31 | checkPublicSuffix('com', null); 32 | checkPublicSuffix('example.com', 'example.com'); 33 | checkPublicSuffix('b.example.com', 'example.com'); 34 | checkPublicSuffix('a.b.example.com', 'example.com'); 35 | checkPublicSuffix('uk.com', null); 36 | checkPublicSuffix('example.uk.com', 'example.uk.com'); 37 | checkPublicSuffix('b.example.uk.com', 'example.uk.com'); 38 | checkPublicSuffix('a.b.example.uk.com', 'example.uk.com'); 39 | checkPublicSuffix('test.ac', 'test.ac'); 40 | // TLD with only 1 (wildcard) rule. 41 | checkPublicSuffix('mm', null); 42 | checkPublicSuffix('c.mm', null); 43 | checkPublicSuffix('b.c.mm', 'b.c.mm'); 44 | checkPublicSuffix('a.b.c.mm', 'b.c.mm'); 45 | // More complex TLD. 46 | checkPublicSuffix('jp', null); 47 | checkPublicSuffix('test.jp', 'test.jp'); 48 | checkPublicSuffix('www.test.jp', 'test.jp'); 49 | checkPublicSuffix('ac.jp', null); 50 | checkPublicSuffix('test.ac.jp', 'test.ac.jp'); 51 | checkPublicSuffix('www.test.ac.jp', 'test.ac.jp'); 52 | checkPublicSuffix('kyoto.jp', null); 53 | checkPublicSuffix('test.kyoto.jp', 'test.kyoto.jp'); 54 | checkPublicSuffix('ide.kyoto.jp', null); 55 | checkPublicSuffix('b.ide.kyoto.jp', 'b.ide.kyoto.jp'); 56 | checkPublicSuffix('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp'); 57 | checkPublicSuffix('c.kobe.jp', null); 58 | checkPublicSuffix('b.c.kobe.jp', 'b.c.kobe.jp'); 59 | checkPublicSuffix('a.b.c.kobe.jp', 'b.c.kobe.jp'); 60 | checkPublicSuffix('city.kobe.jp', 'city.kobe.jp'); 61 | checkPublicSuffix('www.city.kobe.jp', 'city.kobe.jp'); 62 | // TLD with a wildcard rule and exceptions. 63 | checkPublicSuffix('ck', null); 64 | checkPublicSuffix('test.ck', null); 65 | checkPublicSuffix('b.test.ck', 'b.test.ck'); 66 | checkPublicSuffix('a.b.test.ck', 'b.test.ck'); 67 | checkPublicSuffix('www.ck', 'www.ck'); 68 | checkPublicSuffix('www.www.ck', 'www.ck'); 69 | // US K12. 70 | checkPublicSuffix('us', null); 71 | checkPublicSuffix('test.us', 'test.us'); 72 | checkPublicSuffix('www.test.us', 'test.us'); 73 | checkPublicSuffix('ak.us', null); 74 | checkPublicSuffix('test.ak.us', 'test.ak.us'); 75 | checkPublicSuffix('www.test.ak.us', 'test.ak.us'); 76 | checkPublicSuffix('k12.ak.us', null); 77 | checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us'); 78 | checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us'); 79 | // IDN labels. 80 | checkPublicSuffix('食狮.com.cn', '食狮.com.cn'); 81 | checkPublicSuffix('食狮.公司.cn', '食狮.公司.cn'); 82 | checkPublicSuffix('www.食狮.公司.cn', '食狮.公司.cn'); 83 | checkPublicSuffix('shishi.公司.cn', 'shishi.公司.cn'); 84 | checkPublicSuffix('公司.cn', null); 85 | checkPublicSuffix('食狮.中国', '食狮.中国'); 86 | checkPublicSuffix('www.食狮.中国', '食狮.中国'); 87 | checkPublicSuffix('shishi.中国', 'shishi.中国'); 88 | checkPublicSuffix('中国', null); 89 | // Same as above, but punycoded. 
90 | checkPublicSuffix('xn--85x722f.com.cn', 'xn--85x722f.com.cn'); 91 | checkPublicSuffix('xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); 92 | checkPublicSuffix('www.xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); 93 | checkPublicSuffix('shishi.xn--55qx5d.cn', 'shishi.xn--55qx5d.cn'); 94 | checkPublicSuffix('xn--55qx5d.cn', null); 95 | checkPublicSuffix('xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); 96 | checkPublicSuffix('www.xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); 97 | checkPublicSuffix('shishi.xn--fiqs8s', 'shishi.xn--fiqs8s'); 98 | checkPublicSuffix('xn--fiqs8s', null); 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Public Suffix List 2 | 3 | A "public suffix" is one under which Internet users can (or historically could) 4 | directly register names. Some examples of public suffixes are `com`, `co.uk` and 5 | `pvt.k12.ma.us`. The Public Suffix List is a list of all known public suffixes. 6 | 7 | See https://publicsuffix.org/ and the [Wiki](https://github.com/publicsuffix/list/wiki) link above for more information. 8 | 9 | ## Are you here to add or update something? 10 | 11 | All submissions must conform to the [validation and acceptance factors](https://github.com/publicsuffix/list/wiki/Guidelines#validation-and-non-acceptance-factors) and provide sufficient rationale or basically be as complete as possible, and follow the [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines), especially as they relate to format and [sorting](https://github.com/publicsuffix/list/wiki/Guidelines#sort-your-submission-correctly-important). 12 | 13 | The list is currently maintained by people who are volunteering their time towards universal acceptance and ensuring there is a bridge between the ICANN world of domain names and the crucial last mile - the world of developers and human users. 14 | 15 | Iteration back and forth will delay PR review or inclusion. Be extremely thorough, and patient. 16 | 17 | ## Important Notices 18 | 19 | ### 2025-05-27 20 | Were you directed here to be able to add a subdomain to your **Cloudflare** account? If so, please work directly with Cloudflare for these account limitations. The PSL is **NOT** intended as a workaround for Cloudflare's subdomain restrictions. 21 | 22 | Consult [Cloudflare's subdomain setup documentation](https://developers.cloudflare.com/dns/zone-setups/subdomain-setup/) or contact Cloudflare directly for subdomain setup questions. Only submit a request to the PSL if your domain truly meets our criteria outlined in [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines). 23 | 24 | ### 2024-07-26 25 | We are sending emails asking for confirmation if certain entries are still required or need updating. 26 | 27 | Currently, this process is purely manual and extremely low volume but if you do get an email, please respond. 28 | 29 | Please see the [Email Communication Policy](#email-communication-policy) to see how we will often communicate these changes. 30 | 31 | ### 2023-02-20 32 | Did [guidance from Google related to the changes that they are making to adsense subdomains](https://support.google.com/adsense/answer/12170421) bring you here? Work with Google Adsense [Help Link](https://support.google.com/adsense/gethelp) with any support questions you have. 
The PSL is thinly resourced, and the volunteer maintainers are unable to answer questions about Adsense changes or support Adsense. 33 | 34 | The PSL is volunteer-resourced and is absolutely not resourced to answer questions or support changes. Guidance is in the form of self-help (READ THE [WIKI](https://github.com/publicsuffix/list/wiki)), THERE IS NO PSL CUSTOMER SERVICE RESOURCE TO ASSIST YOU. *Please work directly with Google to ensure your domain does in fact need an entry, and they should help you know what the benefits and consequences are. __IT IS POSSIBLE TO HARM YOUR WEBSITE'S COOKIES BY REQUESTING A MALFORMED PSL ENTRY__. Also, understand what propagation delays and rollback processing entail before making requests.* 35 | 36 | ### 2021-04-23 37 | Did guidance related to an issue with Facebook or Apple bring you here? [Read this before submitting requests](https://github.com/publicsuffix/list/issues/1245). We are not approving workaround requests per the validation and acceptance standards, but we do have an open discussion with Facebook on the matter. 38 | 39 | ## Email Communication Policy 40 | 41 | We tend to use the subject line tag "[PSL notification]" in all Public Suffix List communications. For effective spam filtering, you can create a case-insensitive filter that allows only emails with the exact tag "[PSL notification]" in the subject line. If you choose to set up such a filter in your email application, please verify the filter is implemented correctly and test it thoroughly to ensure you don't accidentally miss important communications from us. 42 | 43 | ## Code of Conduct 44 | 45 | Your participation in the Public Suffix List project should follow the [Mozilla Community Participation Guidelines](https://www.mozilla.org/en-US/about/governance/policies/participation/ "Mozilla Community Participation Guidelines") as well as the [GitHub Community Participation Guidelines](https://help.github.com/en/github/site-policy/github-community-guidelines "GitHub Community Participation Guidelines"). Behavior that falls into the areas forbidden by either document is unwelcome and will result in further escalation. 46 | -------------------------------------------------------------------------------- /tools/private_domains_checker/README.md: -------------------------------------------------------------------------------- 1 | # PSL Private Section Domains WHOIS Checker 2 | 3 | ## Overview 4 | 5 | The `PSLPrivateDomainsProcessor` is a Python script designed to fetch data from the Public Suffix List (PSL) and check the domain status, expiry dates, and `_psl` TXT records of the private section domains. 6 | 7 | It performs WHOIS checks on these domains and saves the results into CSV files for manual review. 8 | 9 | ## Requirements 10 | 11 | - Python 3.x 12 | - `requests` 13 | - `pandas` 14 | - `whoisdomain` 15 | 16 | You can install the required packages using pip: 17 | 18 | ```sh 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | Ensure that `whois` is installed on your operating system. 23 | 24 | ```sh 25 | sudo apt install whois  # Debian/Ubuntu 26 | sudo yum install whois  # Fedora/Centos/Rocky 27 | ``` 28 | 29 | ## Usage 30 | 31 | `PSLPrivateDomainsProcessor.py`: The main script containing the `PSLPrivateDomainsProcessor` class and functions for DNS and WHOIS checks.
32 | 33 | Run the script using Python: 34 | 35 | ```sh 36 | cd private_domains_checker 37 | mkdir data 38 | python PSLPrivateDomainsProcessor.py 39 | ``` 40 | 41 | ## Main Components 42 | 43 | ### Functions 44 | 45 | - `make_dns_request(domain, record_type)`: Makes DNS requests to both Google and Cloudflare DNS APIs. 46 | - `check_dns_status(domain)`: Checks the DNS status of a domain using Google and Cloudflare DNS APIs. 47 | - `get_whois_data(domain)`: Retrieves WHOIS data for a domain using the whoisdomain package. 48 | - `check_psl_txt_record(domain)`: Checks the `_psl` TXT record for a domain using Google and Cloudflare DNS APIs. 49 | 50 | ### Class 51 | 52 | #### PSLPrivateDomainsProcessor 53 | 54 | - `fetch_psl_data()`: Fetches the PSL data from the specified URL. 55 | - `parse_domain(domain)`: Parses and normalizes a domain. 56 | - `parse_psl_data(psl_data)`: Parses the fetched PSL data and separates ICANN and private domains. 57 | - `process_domains(raw_domains, domains)`: Processes each domain, performing DNS, WHOIS, and `_psl` TXT record checks. 58 | - `save_results()`: Saves all processed domain data to `data/all.csv`. 59 | - `save_invalid_results()`: Saves domains with invalid DNS or expired WHOIS data to `data/nxdomain.csv` and `data/expired.csv`. 60 | - `save_hold_results()`: Saves domains with WHOIS status containing any form of "hold" to `data/hold.csv`. 61 | - `save_missing_psl_txt_results()`: Saves domains with invalid `_psl` TXT records to `data/missing_psl_txt.csv`. 62 | - `save_expiry_less_than_2yrs_results()`: Saves domains with WHOIS expiry date less than 2 years from now to `data/expiry_less_than_2yrs.csv`. 63 | - `run()`: Executes the entire processing pipeline. 64 | 65 | ## Output 66 | 67 | The script generates the following CSV files in the `data` directory: 68 | 69 | - `all.csv`: Contains all processed domain data. 70 | - `nxdomain.csv`: Contains domains that could not be resolved (`NXDOMAIN`). 71 | - `expired.csv`: Contains domains with expired WHOIS records. 72 | - `hold.csv`: Contains domains with WHOIS status indicating any kind of "hold". 73 | - `missing_psl_txt.csv`: Contains domains with invalid `_psl` TXT records. 74 | - `expiry_less_than_2yrs.csv`: Contains domains with WHOIS expiry date less than 2 years from now. 75 | 76 | ## Example 77 | 78 | An example CSV entry: 79 | 80 | | psl_entry | top_level_domain | dns_status | whois_status | whois_domain_expiry_date | whois_domain_status | psl_txt_status | expiry_check_status | 81 | | -------------- | ---------------- | ---------- | ------------ | ----------------------- | ---------------------------- | -------------- | ------------------- | 82 | | example.com | example.com | ok | ok | 2024-12-31 | "clientTransferProhibited" | "valid" | ok | 83 | 84 | ## Publicly Registrable Namespace Determination 85 | 86 | The script determines the publicly registrable namespace from private domains by using the ICANN section. 87 | 88 | Here's how it works: 89 | 90 | 1. **ICANN Domains Set**: ICANN domains are stored in a set for quick lookup. 91 | 2. **Domain Parsing**: For each private domain, the script splits the domain into parts. It then checks if any suffix of these parts exists in the ICANN domains set. 92 | 3. **Normalization**: The private domain is normalized to its publicly registrable form using the ICANN domains set. 
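A minimal sketch of that lookup, written in Go for illustration (the tool itself is Python; `icannSet` here is a hypothetical stand-in for the parsed ICANN section, and wildcard ICANN rules are ignored for brevity):

```go
package checker

import "strings"

// registrableDomain returns the publicly registrable form of domain,
// assuming icannSet holds every suffix from the PSL's ICANN section.
func registrableDomain(domain string, icannSet map[string]bool) string {
	domain = strings.TrimPrefix(domain, "*.")
	labels := strings.Split(domain, ".")
	// Try candidate suffixes from longest to shortest; on the first
	// ICANN match, keep exactly one label to its left.
	for i := 1; i < len(labels); i++ {
		if icannSet[strings.Join(labels[i:], ".")] {
			return strings.Join(labels[i-1:], ".")
		}
	}
	return domain
}
```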
93 | 94 | Examples: 95 | 96 | - **Input**: PSL private domain entry `"*.example.com"` 97 | - **Process**: 98 | - Remove leading `'*.'`: `"example.com"` 99 | - Check `"com"` against the ICANN domains set: Found 100 | - **Output**: `"example.com"` 101 | 102 | - **Input**: PSL private domain entry `"sub.example.co.uk"` 103 | - **Process**: 104 | - Check `"example.co.uk"` against the ICANN domains set: Not found 105 | - Check `"co.uk"` against the ICANN domains set: Found 106 | - **Output**: `"example.co.uk"` 107 | 108 | The output is then used for checking WHOIS data. 109 | 110 | ## License 111 | 112 | This tool is licensed under the MIT License. -------------------------------------------------------------------------------- /tools/internal/githistory/history.go: -------------------------------------------------------------------------------- 1 | // Package githistory provides helpers to look up PSL PR changes in a 2 | // local git repository. 3 | package githistory 4 | 5 | import ( 6 | "bytes" 7 | "fmt" 8 | "os/exec" 9 | "regexp" 10 | "strconv" 11 | "strings" 12 | ) 13 | 14 | // PRInfo lists commit metadata for a given Github PR. 15 | type PRInfo struct { 16 | Num int 17 | // CommitHash is the git hash in which the PSL contains the 18 | // changes of this PR. 19 | CommitHash string 20 | // ParentHash is the git hash immediately before this PR's changes 21 | // were added to the PSL. 22 | ParentHash string 23 | } 24 | 25 | // History is PR metadata extracted from a local PSL git clone. 26 | type History struct { 27 | GitPath string // path to the local git clone 28 | PRs map[int]PRInfo 29 | } 30 | 31 | // gitToplevel finds the top level of the git repository that contains 32 | // path, if any. 33 | func gitToplevel(path string) (string, error) { 34 | bs, err := gitStdout(path, "rev-parse", "--show-toplevel") 35 | if err != nil { 36 | return "", fmt.Errorf("finding top level of git repo %q: %w", path, err) 37 | } 38 | return string(bs), nil 39 | } 40 | 41 | // GetPRInfo extracts PR metadata from the git repository at gitPath. 42 | func GetPRInfo(gitPath string) (*History, error) { 43 | toplevel, err := gitToplevel(gitPath) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | // List all commits that have a description with a '(#1234)' at 49 | // the end of a line of description or "Merge pull request #1234 50 | // from" at the start, and print the matching commits in a form 51 | // that's easy to parse. 52 | prCommits, err := gitStdout(toplevel, "log", 53 | "--perl-regexp", 54 | `--grep=\(#\d+\)$`, 55 | `--grep=^Merge pull request #\d+ from`, 56 | "--pretty=%H@%P@%s", 57 | "master") 58 | if err != nil { return nil, err } 59 | ret := &History{ 60 | GitPath: toplevel, 61 | PRs: map[int]PRInfo{}, 62 | } 63 | for _, line := range strings.Split(string(prCommits), "\n") { 64 | fs := strings.SplitN(line, "@", 3) 65 | if len(fs) != 3 { 66 | return nil, fmt.Errorf("unexpected line format %q", line) 67 | } 68 | commit, parentsStr, desc := fs[0], fs[1], fs[2] 69 | parents := strings.Split(parentsStr, " ") 70 | // For merge commits, we have multiple parents, and we want 71 | // the "main branch" side of the merge, i.e. the state of the 72 | // tree before the PR was merged. Empirically, Github always 73 | // lists that commit as the 1st parent in merge commits. 74 | // 75 | // For squash commits, there is only one parent. 76 | // 77 | // This logic cannot handle rebase-and-merge actions, since 78 | // those by definition erase the PR history from the git 79 | // history.
However, the PSL doesn't use rebase-and-merge by 80 | // convention, so this works out. Worst case, if this logic 81 | // does catch a rebase-and-merge, the result will be false 82 | // positives (suffix flagged for invalid TXT record), if the 83 | // PR contained more than 1 commit. 84 | parent := parents[0] 85 | ms := prNumberRe.FindStringSubmatch(desc) 86 | if len(ms) != 3 { 87 | // The grep on git log returned a false positive where the 88 | // PR number is not on the first line of the commit 89 | // message. This is not a commit in the standard github 90 | // format for PRs. 91 | continue 92 | } 93 | 94 | var prNum int 95 | if ms[1] != "" { 96 | prNum, err = strconv.Atoi(ms[1]) 97 | } else { 98 | prNum, err = strconv.Atoi(ms[2]) 99 | } 100 | if err != nil { 101 | // Shouldn't happen, the regex isolates digits, why can't 102 | // we parse digits? 103 | return nil, fmt.Errorf("unexpected invalid PR number in commit subject %q", desc) 104 | } 105 | 106 | ret.PRs[prNum] = PRInfo{ 107 | Num: prNum, 108 | CommitHash: commit, 109 | ParentHash: parent, 110 | } 111 | } 112 | 113 | return ret, nil 114 | } 115 | 116 | // GetPSL returns the PSL file at the given commit hash in the git 117 | // repository at gitPath. 118 | func GetPSL(gitPath string, hash string) ([]byte, error) { 119 | toplevel, err := gitToplevel(gitPath) 120 | if err != nil { 121 | return nil, err 122 | } 123 | 124 | bs, err := gitStdout(toplevel, "show", fmt.Sprintf("%s:public_suffix_list.dat", hash)) 125 | if err != nil { 126 | return nil, err 127 | } 128 | 129 | return bs, nil 130 | } 131 | 132 | // Matches either "(#1234)" at the end of a line, or "Merge pull 133 | // request #1234 from" at the start of a line. The first is how github 134 | // formats squash-and-merge commits, the second is how github formats 135 | // 2-parent merge commits. 136 | var prNumberRe = regexp.MustCompile(`(?:\(#(\d+)\)$)|(?:^Merge pull request #(\d+) from)`) 137 | 138 | func gitStdout(repoPath string, args ...string) ([]byte, error) { 139 | args = append([]string{"-C", repoPath}, args...) 140 | c := exec.Command("git", args...) 141 | var stderr bytes.Buffer 142 | c.Stderr = &stderr 143 | bs, err := c.Output() 144 | if err != nil { 145 | // Make the error show the git commandline and captured 146 | // stderr, not just the plain "exited with code 45" error. 147 | cmdline := append([]string{"git"}, args...) 148 | var stderrStr string 149 | if stderr.Len() != 0 { 150 | stderrStr = "stderr:\n" + stderr.String() 151 | } 152 | return nil, fmt.Errorf("running %q: %w. %s", strings.Join(cmdline, " "), err, stderrStr) 153 | } 154 | return bytes.TrimSpace(bs), nil 155 | } 156 | -------------------------------------------------------------------------------- /tools/internal/github/pr.go: -------------------------------------------------------------------------------- 1 | // Package github provides a github client with functions tailored to 2 | // the PSL's needs. 3 | package github 4 | 5 | import ( 6 | "context" 7 | "errors" 8 | "fmt" 9 | "os" 10 | "time" 11 | 12 | "github.com/google/go-github/v63/github" 13 | ) 14 | 15 | // Repo is a GitHub API client that performs PSL-specific 16 | // operations. The zero value is a client that interacts with the 17 | // official publicsuffix/list repository. 18 | type Repo struct { 19 | // Owner is the github account of the repository to query. If 20 | // empty, defaults to "publicsuffix". 21 | Owner string 22 | // Repo is the repository to query. If empty, defaults to "list".
23 | Repo string 24 | 25 | client *github.Client 26 | } 27 | 28 | func (c *Repo) owner() string { 29 | if c.Owner != "" { 30 | return c.Owner 31 | } 32 | return "publicsuffix" 33 | } 34 | 35 | func (c *Repo) repo() string { 36 | if c.Repo != "" { 37 | return c.Repo 38 | } 39 | return "list" 40 | } 41 | 42 | func (c *Repo) apiClient() *github.Client { 43 | if c.client == nil { 44 | c.client = github.NewClient(nil) 45 | if token := os.Getenv("GITHUB_TOKEN"); token != "" { 46 | c.client = c.client.WithAuthToken(token) 47 | } 48 | } 49 | return c.client 50 | } 51 | 52 | // PSLForPullRequest fetches the PSL files needed to validate the 53 | // given pull request. Returns the PSL file for the target branch, and 54 | // the same but with the PR's changes applied. 55 | func (c *Repo) PSLForPullRequest(ctx context.Context, prNum int) (withoutPR, withPR []byte, err error) { 56 | // Github sometimes needs a little time to think to update the PR 57 | // state, so we might need to sleep and retry a few times. Usually 58 | // the status updates in <5s, but just for safety, give it a more 59 | // generous timeout. 60 | ctx, cancel := context.WithTimeout(ctx, 30*time.Second) 61 | defer cancel() 62 | 63 | var withoutHash, withHash string 64 | for withoutHash == "" { 65 | withoutHash, withHash, err = c.getPRCommitInfo(ctx, prNum) 66 | if errors.Is(err, errMergeInfoNotReady) { 67 | // PR exists but merge info is stale, need to wait and 68 | // retry. 69 | select { 70 | case <-time.After(2 * time.Second): 71 | continue 72 | case <-ctx.Done(): 73 | return nil, nil, ctx.Err() 74 | } 75 | } else if err != nil { 76 | return nil, nil, err 77 | } 78 | } 79 | 80 | withoutPR, err = c.PSLForHash(ctx, withoutHash) 81 | if err != nil { 82 | return nil, nil, err 83 | } 84 | withPR, err = c.PSLForHash(ctx, withHash) 85 | if err != nil { 86 | return nil, nil, err 87 | } 88 | return withoutPR, withPR, nil 89 | } 90 | 91 | var errMergeInfoNotReady = errors.New("PR mergeability information not available yet, please retry later") 92 | 93 | // getPRCommitInfo returns the "before" and "after" commit hashes for 94 | // prNum. 95 | // 96 | // The exact meaning of "before" and "after" varies, but in general 97 | // before is the state of the master branch right before the PR is 98 | // merged, and "after" is the same state plus the PR's changes, with 99 | // no unrelated changes. 100 | // 101 | // For an unmerged PR, "after" is a "trial merge commit" created 102 | // automatically by Github to run CI and check that the PR is 103 | // mergeable, and "before" is the master branch state from that trial 104 | // merge - usually the latest current state. 105 | // 106 | // For a merged PR, "after" is the commit where the PR's changes first 107 | // appeared in master, and "before" is the state of master immediately 108 | // before that. 109 | // 110 | // getPRCommitInfo returns the sentinel error errMergeInfoNotReady if 111 | // an open PR exists, but github needs a bit more time to update the 112 | // trial merge commit. The caller is expected to retry with 113 | // appropriate backoff. 
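//
// To illustrate with invented hashes: if PR 1234 was squash-merged as
// commit C whose only parent is P, then withPRCommit is C and
// withoutPRCommit is P. For an open PR, withPRCommit is the trial
// merge commit M created by Github, and withoutPRCommit is M's
// master-side parent.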
114 | func (c *Repo) getPRCommitInfo(ctx context.Context, prNum int) (withoutPRCommit, withPRCommit string, err error) { 115 | ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 116 | defer cancel() 117 | 118 | pr, _, err := c.apiClient().PullRequests.Get(ctx, c.owner(), c.repo(), prNum) 119 | if err != nil { 120 | return "", "", err 121 | } 122 | 123 | mergeCommit := pr.GetMergeCommitSHA() 124 | if mergeCommit == "" { 125 | return "", "", fmt.Errorf("no merge commit available for PR %d", prNum) 126 | } 127 | commitInfo, _, err := c.apiClient().Git.GetCommit(ctx, c.owner(), c.repo(), mergeCommit) 128 | if err != nil { 129 | return "", "", fmt.Errorf("getting info for trial merge SHA %q: %w", mergeCommit, err) 130 | } 131 | 132 | var beforeMergeCommit string 133 | if pr.GetMerged() && len(commitInfo.Parents) == 1 { 134 | // PR was merged, PSL policy is to use squash-and-merge, so 135 | // the pre-PR commit is simply the parent of the merge commit. 136 | beforeMergeCommit = commitInfo.Parents[0].GetSHA() 137 | } else if pr.Mergeable == nil { 138 | // PR isn't merged, but github needs time to rebase the PR and 139 | // create a trial merge. Unfortunately the only way to know 140 | // when it's done is to just poll and wait for the mergeable 141 | // bool to be valid. 142 | return "", "", errMergeInfoNotReady 143 | } else if !pr.GetMergeable() { 144 | // PR isn't merged, and there's a merge conflict that prevents 145 | // us from knowing what the pre- and post-merge states are. 146 | return "", "", fmt.Errorf("cannot get PSL for PR %d, needs rebase to resolve conflicts", prNum) 147 | } else { 148 | // PR is either open, or it was merged without squashing. In 149 | // both cases, mergeCommit has 2 parents: one is the PR head 150 | // commit, and the other is the master branch without the PR's 151 | // changes. 152 | if numParents := len(commitInfo.Parents); numParents != 2 { 153 | return "", "", fmt.Errorf("unexpected parent count %d for trial merge commit on PR %d, expected 2 parents", numParents, prNum) 154 | } 155 | 156 | prHeadCommit := pr.GetHead().GetSHA() 157 | if prHeadCommit == "" { 158 | return "", "", fmt.Errorf("no commit SHA available for head of PR %d", prNum) 159 | } 160 | if commitInfo.Parents[0].GetSHA() == prHeadCommit { 161 | beforeMergeCommit = commitInfo.Parents[1].GetSHA() 162 | } else { 163 | beforeMergeCommit = commitInfo.Parents[0].GetSHA() 164 | } 165 | } 166 | 167 | return beforeMergeCommit, mergeCommit, nil 168 | } 169 | 170 | // PSLForHash returns the PSL file at the given git commit hash. 
171 | func (c *Repo) PSLForHash(ctx context.Context, hash string) ([]byte, error) { 172 | ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 173 | defer cancel() 174 | 175 | opts := &github.RepositoryContentGetOptions{ 176 | Ref: hash, 177 | } 178 | content, _, _, err := c.apiClient().Repositories.GetContents(ctx, c.owner(), c.repo(), "public_suffix_list.dat", opts) 179 | if err != nil { 180 | return nil, fmt.Errorf("getting PSL for commit %q: %w", hash, err) 181 | } 182 | ret, err := content.GetContent() 183 | if err != nil { 184 | return nil, err 185 | } 186 | return []byte(ret), nil 187 | } 188 | -------------------------------------------------------------------------------- /tools/internal/parser/text_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "reflect" 7 | "testing" 8 | 9 | "github.com/google/go-cmp/cmp" 10 | "golang.org/x/text/encoding" 11 | "golang.org/x/text/encoding/unicode" 12 | ) 13 | 14 | func TestNormalize(t *testing.T) { 15 | t.Parallel() 16 | 17 | tests := []struct { 18 | name string 19 | in []byte 20 | want []string 21 | wantErrs []error 22 | }{ 23 | { 24 | name: "empty_input", 25 | in: []byte{}, 26 | want: []string{}, 27 | }, 28 | { 29 | name: "no_early_errors", 30 | in: byteLines( 31 | "// This is a small replica", 32 | "// of the PSL", 33 | "com", 34 | "net", 35 | "lol", 36 | "", 37 | "// End of file", 38 | ), 39 | want: []string{ 40 | "// This is a small replica", 41 | "// of the PSL", 42 | "com", 43 | "net", 44 | "lol", 45 | "", 46 | "// End of file", 47 | }, 48 | }, 49 | { 50 | name: "utf16be_input_with_bom", 51 | in: utf16BigWithBOM("utf-16 text"), 52 | want: []string{"utf-16 text"}, 53 | }, 54 | { 55 | name: "utf16le_input_with_bom", 56 | in: utf16LittleWithBOM("utf-16 text"), 57 | want: []string{"utf-16 text"}, 58 | }, 59 | { 60 | name: "utf16be_input", 61 | in: utf16Big("utf-16 text utf-16 text utf-16 text"), 62 | want: []string{"utf-16 text utf-16 text utf-16 text"}, 63 | wantErrs: []error{ErrInvalidEncoding{"UTF-16BE (guessed)"}}, 64 | }, 65 | { 66 | name: "utf16le_input", 67 | in: utf16Little("utf-16 text utf-16 text utf-16 text"), 68 | want: []string{"utf-16 text utf-16 text utf-16 text"}, 69 | wantErrs: []error{ErrInvalidEncoding{"UTF-16LE (guessed)"}}, 70 | }, 71 | { 72 | name: "utf8_with_bom", 73 | in: utf8WithBOM("utf-8 text"), 74 | want: []string{"utf-8 text"}, 75 | }, 76 | { 77 | name: "utf8_with_garbage", 78 | // See https://en.wikipedia.org/wiki/UTF-8 for a 79 | // description of UTF-8 encoding, to help understand why 80 | // these inputs are invalid. 81 | // 82 | // The invalid patterns are immediately followed by more 83 | // valid characters, to verify exactly how normalization 84 | // mangles the bytes around an invalid sequence. 
85 | in: byteLines( 86 | "normal UTF-8", 87 | // Illegal start bitpattern (5 leading bits set to 1) 88 | "bad1: \xF8abc", 89 | // First byte declares 3-byte character, but ends after 2 bytes 90 | "bad2: \xE0\xBFabc", 91 | // Continuation byte outside of a character 92 | "bad3: \xBFabc", 93 | // Ascii space (0x20) encoded non-minimally 94 | "bad4: \xC0\xA0abc", 95 | "this line is ok", 96 | ), 97 | want: []string{ 98 | "normal UTF-8", 99 | "bad1: \uFFFDabc", 100 | "bad2: \uFFFDabc", 101 | "bad3: \uFFFDabc", 102 | "bad4: \uFFFD\uFFFDabc", 103 | "this line is ok", 104 | }, 105 | wantErrs: []error{ 106 | ErrInvalidUnicode{mkSrc(1, 2)}, 107 | ErrInvalidUnicode{mkSrc(2, 3)}, 108 | ErrInvalidUnicode{mkSrc(3, 4)}, 109 | ErrInvalidUnicode{mkSrc(4, 5)}, 110 | }, 111 | }, 112 | { 113 | name: "dos_line_endings", 114 | in: byteLines( 115 | "normal file\r", 116 | "except the lines\r", 117 | "end like it's 1991"), 118 | want: []string{ 119 | "normal file", 120 | "except the lines", 121 | "end like it's 1991", 122 | }, 123 | }, 124 | { 125 | name: "trailing_whitespace", 126 | in: byteLines( 127 | "a file ", 128 | "with all kinds\t\t", 129 | " \r\t", 130 | // Strange "spaces": em space, ideographic space, 131 | // 4/18em medium mathematical space. 132 | "of trailing space\u2003\u3000\u205f", 133 | "and one good line", 134 | ), 135 | want: []string{ 136 | "a file", 137 | "with all kinds", 138 | "", 139 | "of trailing space", 140 | "and one good line", 141 | }, 142 | }, 143 | { 144 | name: "leading_whitespace", 145 | in: byteLines( 146 | " a file", 147 | "\t\twith all kinds", 148 | " \r\t", // ensure this is reported as trailing, not leading 149 | // Strange "spaces": em space, ideographic space, 150 | // 4/18em medium mathematical space. 151 | "\u2003\u3000\u205fof leading space", 152 | "and one good line", 153 | ), 154 | want: []string{ 155 | "a file", 156 | "with all kinds", 157 | "", 158 | "of leading space", 159 | "and one good line", 160 | }, 161 | }, 162 | { 163 | name: "the_most_wrong_line", 164 | in: byteLines("\xef\xbb\xbf \t // Hello\xc3\x28 very broken line\t \r"), 165 | want: []string{"// Hello\uFFFD( very broken line"}, 166 | wantErrs: []error{ 167 | ErrInvalidUnicode{mkSrc(0, 1)}, 168 | }, 169 | }, 170 | } 171 | 172 | for _, tc := range tests { 173 | t.Run(tc.name, func(t *testing.T) { 174 | lines, errs := normalizeToUTF8Lines(tc.in) 175 | checkDiff(t, "newSource error set", errs, tc.wantErrs) 176 | checkDiff(t, "newSource result", lines, tc.want) 177 | }) 178 | } 179 | } 180 | 181 | func byteLines(lines ...any) []byte { 182 | var ret [][]byte 183 | for _, ln := range lines { 184 | switch v := ln.(type) { 185 | case string: 186 | ret = append(ret, []byte(v)) 187 | case []byte: 188 | ret = append(ret, v) 189 | default: 190 | panic(fmt.Sprintf("unhandled type %T for bytes()", ln)) 191 | } 192 | } 193 | return bytes.Join(ret, []byte("\n")) 194 | } 195 | 196 | func encodeFromUTF8(s string, e encoding.Encoding) []byte { 197 | ret, err := e.NewEncoder().Bytes([]byte(s)) 198 | if err != nil { 199 | // Only way this can happen is if the input isn't valid UTF-8, 200 | // and we don't do that in these tests. 
201 | panic(err) 202 | } 203 | return ret 204 | } 205 | 206 | func utf16Big(s string) []byte { 207 | return encodeFromUTF8(s, unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)) 208 | } 209 | 210 | func utf16BigWithBOM(s string) []byte { 211 | return encodeFromUTF8(s, unicode.UTF16(unicode.BigEndian, unicode.UseBOM)) 212 | } 213 | 214 | func utf16Little(s string) []byte { 215 | return encodeFromUTF8(s, unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)) 216 | } 217 | 218 | func utf16LittleWithBOM(s string) []byte { 219 | return encodeFromUTF8(s, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)) 220 | } 221 | 222 | func utf8WithBOM(s string) []byte { 223 | return encodeFromUTF8(s, unicode.UTF8BOM) 224 | } 225 | 226 | func checkDiff(t *testing.T, whatIsBeingDiffed string, got, want any) { 227 | t.Helper() 228 | 229 | // cmp.Diff refuses to examine unexported fields by default. Tell 230 | // it that it's okay to look at unexported fields of blocks and 231 | // blockInfo, since we own those fields and want to include their 232 | // values in comparisons. 233 | exportInfo := cmp.Exporter(func(t reflect.Type) bool { 234 | if t.Kind() != reflect.Pointer { 235 | t = reflect.PointerTo(t) 236 | } 237 | 238 | if t.Elem() == reflect.TypeFor[blockInfo]() { 239 | return true 240 | } 241 | 242 | if t.Implements(reflect.TypeFor[Block]()) { 243 | return true 244 | } 245 | 246 | return false 247 | }) 248 | if diff := cmp.Diff(got, want, exportInfo); diff != "" { 249 | t.Errorf("%s is wrong (-got+want):\n%s", whatIsBeingDiffed, diff) 250 | } 251 | } 252 | -------------------------------------------------------------------------------- /tools/internal/parser/text.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "strings" 7 | "unicode/utf8" 8 | 9 | "golang.org/x/text/encoding" 10 | xunicode "golang.org/x/text/encoding/unicode" 11 | ) 12 | 13 | // SourceRange describes a slice of lines from an unparsed source 14 | // file. FirstLine and LastLine behave like normal slice offsets, 15 | // i.e. they represent the half-open range [FirstLine:LastLine). 16 | type SourceRange struct { 17 | FirstLine int 18 | LastLine int 19 | } 20 | 21 | // NumLines returns the number of source lines described by 22 | // SourceRange. 23 | func (s SourceRange) NumLines() int { 24 | if s.FirstLine >= s.LastLine { 25 | return 0 26 | } 27 | return s.LastLine - s.FirstLine 28 | } 29 | 30 | // LocationString prints a human-readable description of the 31 | // SourceRange. 32 | func (s SourceRange) LocationString() string { 33 | switch { 34 | case s.LastLine <= s.FirstLine: 35 | return "" 36 | case s.LastLine == s.FirstLine+1: 37 | return fmt.Sprintf("line %d", s.FirstLine+1) 38 | default: 39 | return fmt.Sprintf("lines %d-%d", s.FirstLine+1, s.LastLine) 40 | } 41 | } 42 | 43 | // merge returns a SourceRange that contains both s and other. If s 44 | // and other are not contiguous or overlapping, the returned 45 | // SourceRange also spans unrelated lines, but always covers both s 46 | // and other. 
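//
// For example (values invented): merging the half-open ranges [2,5)
// and [7,9) yields [2,9), which covers both inputs plus the unrelated
// lines 5 and 6.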
47 | func (s SourceRange) merge(other SourceRange) SourceRange { 48 | return SourceRange{ 49 | FirstLine: min(s.FirstLine, other.FirstLine), 50 | LastLine: max(s.LastLine, other.LastLine), 51 | } 52 | } 53 | 54 | const ( 55 | bomUTF8 = "\xEF\xBB\xBF" 56 | bomUTF16BE = "\xFE\xFF" 57 | bomUTF16LE = "\xFF\xFE" 58 | ) 59 | 60 | // The transformers that normalizeToUTF8Lines can use to process input 61 | // into valid UTF-8, and that guessUTFVariant can return. 62 | var ( 63 | utf8Transform = xunicode.UTF8BOM 64 | utf16LittleEndianTransform = xunicode.UTF16(xunicode.LittleEndian, xunicode.UseBOM) 65 | utf16BigEndianTransform = xunicode.UTF16(xunicode.BigEndian, xunicode.UseBOM) 66 | ) 67 | 68 | // normalizeToUTF8Lines slices bs into one string per line. 69 | // 70 | // All returned strings contain only valid UTF-8. Invalid byte 71 | // sequences are replaced with the unicode replacement character 72 | // (\uFFFD). 73 | // 74 | // The canonical PSL encoding is a file consisting entirely of valid 75 | // UTF-8, with no leading BOM or unicode replacement characters. In an 76 | // effort to report useful errors for common mangling caused by older 77 | // Windows software, normalizeToUTF8Lines accepts input encoded as 78 | // UTF-8, UTF-16LE or UTF-16BE, with or without a leading BOM. 79 | // 80 | // normalizeToUTF8Lines returns the normalized lines of bs, as well as 81 | // errors that report deviations from the canonical encoding, if any. 82 | func normalizeToUTF8Lines(bs []byte) ([]string, []error) { 83 | var errs []error 84 | 85 | // Figure out the byte encoding to use. We try to detect and 86 | // correctly parse UTF-16 that doesn't have a BOM, but we also 87 | // report an explicit parse error in that case, because we cannot 88 | // be confident the parse is 100% correct, and therefore we can't 89 | // automatically fix it. 90 | enc := utf8Transform 91 | switch { 92 | case bytes.HasPrefix(bs, []byte(bomUTF8)): 93 | case bytes.HasPrefix(bs, []byte(bomUTF16BE)): 94 | enc = utf16BigEndianTransform 95 | case bytes.HasPrefix(bs, []byte(bomUTF16LE)): 96 | enc = utf16LittleEndianTransform 97 | default: 98 | enc = guessUTFVariant(bs) 99 | switch enc { 100 | case utf16BigEndianTransform: 101 | errs = append(errs, ErrInvalidEncoding{"UTF-16BE (guessed)"}) 102 | case utf16LittleEndianTransform: 103 | errs = append(errs, ErrInvalidEncoding{"UTF-16LE (guessed)"}) 104 | } 105 | } 106 | 107 | bs, err := enc.NewDecoder().Bytes(bs) 108 | if err != nil { 109 | // The decoder shouldn't error out, if it does we can't really 110 | // proceed, just return the errors we've found so far. 111 | errs = append(errs, err) 112 | return []string{}, errs 113 | } 114 | 115 | if len(bs) == 0 { 116 | return []string{}, errs 117 | } 118 | 119 | ret := strings.Split(string(bs), "\n") 120 | for i, line := range ret { 121 | // capture source info before we tidy up the line starts/ends, 122 | // so that input normalization errors show the problem being 123 | // described. 124 | // 125 | // However, we still provide post-sanitization UTF-8 bytes, 126 | // not the raw input. The raw input is unlikely to display 127 | // correctly in terminals and logs, and because the unicode 128 | // replacement character is a distinctive shape that stands 129 | // out, it should provide enough hints as to where any invalid 130 | // byte sequences are. 131 | src := SourceRange{i, i + 1} 132 | if strings.ContainsRune(line, utf8.RuneError) { 133 | // We can't fix invalid Unicode, by definition we don't 134 | // know what it's trying to say.
135 | errs = append(errs, ErrInvalidUnicode{src}) 136 | } 137 | ret[i] = strings.TrimSpace(line) 138 | } 139 | 140 | return ret, errs 141 | } 142 | 143 | // guessUTFVariant guesses the encoding of bs. 144 | // 145 | // Returns the transformer to use on bs, one of utf8Transform, 146 | // utf16LittleEndianTransform or utf16BigEndianTransform. 147 | func guessUTFVariant(bs []byte) encoding.Encoding { 148 | // Only scan a few hundred bytes. Assume UTF-8 if we don't see 149 | // anything odd before that. 150 | const checkLimit = 200 // 100 UTF-16 characters 151 | if len(bs) > checkLimit { 152 | bs = bs[:checkLimit] 153 | } 154 | 155 | // This is a crude but effective trick to detect UTF-16: we assume 156 | // that the input contains at least some ascii, and that the 157 | // decoded input does not contain Unicode \u0000 codepoints 158 | // (legacy ascii null). 159 | // 160 | // If this is true, then valid UTF-8 text does not have any zero 161 | // bytes, because UTF-8 never produces a zero byte except when it 162 | // encodes the \u0000 codepoint. 163 | // 164 | // On the other hand, UTF-16 encodes all codepoints as a pair of 165 | // bytes, and that means an ascii string in UTF-16 has a zero byte 166 | // every 2 bytes. We can use the presence of zero bytes to 167 | // identify UTF-16, and the position of the zero (even or odd 168 | // offset) tells us what endianness to use. 169 | evenZeros, oddZeros := 0, 0 170 | for i, b := range bs { 171 | if b != 0 { 172 | continue 173 | } 174 | 175 | if i%2 == 0 { 176 | evenZeros++ 177 | } else { 178 | oddZeros++ 179 | } 180 | 181 | const ( 182 | // Wait for a few zero bytes to accumulate, because if 183 | // this is just UTF-8 with a few \u0000 codepoints, 184 | // decoding as UTF-16 will be complete garbage. So, wait 185 | // until we see a suspicious number of zeros, and require 186 | // a strong bias towards even/odd before we guess 187 | // UTF-16. Otherwise, UTF-8 gives us the best chance of 188 | // producing coherent errors. 189 | decisionThreshold = 20 190 | utf16Threshold = 15 191 | ) 192 | if evenZeros+oddZeros < decisionThreshold { 193 | continue 194 | } 195 | if evenZeros > utf16Threshold { 196 | return utf16BigEndianTransform 197 | } else if oddZeros > utf16Threshold { 198 | return utf16LittleEndianTransform 199 | } 200 | // Lots of zeros, but no strong bias. No idea what's going on, 201 | // UTF-8 is a safe fallback. 202 | return utf8Transform 203 | } 204 | 205 | // Didn't find enough zeros, probably UTF-8.
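// (Worked example, not from real input: "ab" encodes as 0x00 0x61
// 0x00 0x62 in UTF-16BE, zeros at even offsets, and as 0x61 0x00
// 0x62 0x00 in UTF-16LE, zeros at odd offsets. That is exactly the
// even/odd bias the counters above look for.)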
206 | return utf8Transform 207 | } 208 | -------------------------------------------------------------------------------- /tools/internal/parser/metadata_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "net/mail" 5 | "net/url" 6 | "testing" 7 | ) 8 | 9 | func TestMetadata(t *testing.T) { 10 | tests := []struct { 11 | name string 12 | in *Comment 13 | want MaintainerInfo 14 | }{ 15 | { 16 | name: "empty", 17 | in: nil, 18 | want: MaintainerInfo{ 19 | MachineEditable: true, 20 | }, 21 | }, 22 | 23 | { 24 | name: "canonical", 25 | in: comment(0, 26 | "DuckCo : https://example.com", 27 | "Submitted by Duck <duck@example.com>", 28 | ), 29 | want: MaintainerInfo{ 30 | Name: "DuckCo", 31 | URLs: urls("https://example.com"), 32 | Maintainers: emails("Duck", "duck@example.com"), 33 | MachineEditable: true, 34 | }, 35 | }, 36 | 37 | { 38 | name: "canonical_no_space_around_colon", 39 | in: comment(0, 40 | "DuckCo:https://example.com", 41 | "Submitted by Duck <duck@example.com>", 42 | ), 43 | want: MaintainerInfo{ 44 | Name: "DuckCo", 45 | URLs: urls("https://example.com"), 46 | Maintainers: emails("Duck", "duck@example.com"), 47 | MachineEditable: true, 48 | }, 49 | }, 50 | 51 | { 52 | name: "canonical_url_in_parens", 53 | in: comment(0, 54 | "DuckCo (https://example.com)", 55 | "Submitted by Duck <duck@example.com>", 56 | ), 57 | want: MaintainerInfo{ 58 | Name: "DuckCo", 59 | URLs: urls("https://example.com"), 60 | Maintainers: emails("Duck", "duck@example.com"), 61 | MachineEditable: true, 62 | }, 63 | }, 64 | 65 | { 66 | name: "canonical_by_registry", 67 | in: comment(0, 68 | "DuckCo : https://example.com", 69 | "Submitted by registry <duck@example.com>", 70 | ), 71 | want: MaintainerInfo{ 72 | Name: "DuckCo", 73 | URLs: urls("https://example.com"), 74 | Maintainers: emails("", "duck@example.com"), 75 | MachineEditable: true, 76 | }, 77 | }, 78 | 79 | { 80 | name: "name_and_email_first", 81 | in: comment(0, 82 | "DuckCo : Duck <duck@example.com>", 83 | "https://example.com", 84 | ), 85 | want: MaintainerInfo{ 86 | Name: "DuckCo", 87 | URLs: urls("https://example.com"), 88 | Maintainers: emails("Duck", "duck@example.com"), 89 | MachineEditable: true, 90 | }, 91 | }, 92 | 93 | { 94 | name: "name_and_naked_email", 95 | in: comment(0, 96 | "DuckCo : duck@example.com", 97 | "https://example.com", 98 | ), 99 | want: MaintainerInfo{ 100 | Name: "DuckCo", 101 | URLs: urls("https://example.com"), 102 | Maintainers: emails("", "duck@example.com"), 103 | MachineEditable: true, 104 | }, 105 | }, 106 | 107 | { 108 | name: "one_per_line", 109 | in: comment(0, 110 | "DuckCo", 111 | "https://example.com", 112 | "Submitted by Duck <duck@example.com>", 113 | ), 114 | want: MaintainerInfo{ 115 | Name: "DuckCo", 116 | URLs: urls("https://example.com"), 117 | Maintainers: emails("Duck", "duck@example.com"), 118 | MachineEditable: true, 119 | }, 120 | }, 121 | 122 | { 123 | name: "no_name", 124 | in: comment(0, 125 | "https://example.com", 126 | "Submitted by Duck <duck@example.com>", 127 | "Other notes here", 128 | ), 129 | want: MaintainerInfo{ 130 | Name: "", 131 | URLs: urls("https://example.com"), 132 | Maintainers: emails("Duck", "duck@example.com"), 133 | Other: []string{"Other notes here"}, 134 | MachineEditable: true, 135 | }, 136 | }, 137 | 138 | { 139 | name: "http_url_and_bare_email", 140 | in: comment(0, 141 | "http://example.com", 142 | "duck@example.com", 143 | ), 144 | want: MaintainerInfo{ 145 | Name: "", 146 | URLs: urls("http://example.com"), 147 | Maintainers: emails("", "duck@example.com"), 148 | MachineEditable: true, 149 | }, 150 | }, 151
| 152 | { 153 | name: "multiple_urls", 154 | in: comment(0, 155 | "DuckCo : https://example.com", 156 | "https://example.org/details", 157 | "Submitted by Duck <duck@example.com>", 158 | ), 159 | want: MaintainerInfo{ 160 | Name: "DuckCo", 161 | URLs: urls("https://example.com", "https://example.org/details"), 162 | Maintainers: emails("Duck", "duck@example.com"), 163 | MachineEditable: true, 164 | }, 165 | }, 166 | 167 | { 168 | name: "multiple_emails", 169 | in: comment(0, 170 | "DuckCo : https://example.com", 171 | "Submitted by Duck <duck@example.com> and Goat <goat@example.com>", 172 | "llama@example.com", 173 | ), 174 | want: MaintainerInfo{ 175 | Name: "DuckCo", 176 | URLs: urls("https://example.com"), 177 | Maintainers: emails( 178 | "Duck", "duck@example.com", 179 | "Goat", "goat@example.com", 180 | "", "llama@example.com"), 181 | MachineEditable: true, 182 | }, 183 | }, 184 | 185 | { 186 | name: "multiple_everything_and_end_notes", 187 | in: comment(0, 188 | "DuckCo : https://example.com", 189 | "http://example.org", 190 | "https://example.net/more", 191 | "Submitted by Duck <duck@example.com> and Goat <goat@example.com>", 192 | "llama@example.com", 193 | `"Owl" <owl@example.net>`, 194 | "Duck is theoretically in charge, but Owl has influence", 195 | "Goat is not to be trusted, don't know about llama yet", 196 | ), 197 | want: MaintainerInfo{ 198 | Name: "DuckCo", 199 | URLs: urls("https://example.com", "http://example.org", "https://example.net/more"), 200 | Maintainers: emails( 201 | "Duck", "duck@example.com", 202 | "Goat", "goat@example.com", 203 | "", "llama@example.com", 204 | "Owl", "owl@example.net"), 205 | Other: []string{ 206 | "Duck is theoretically in charge, but Owl has influence", 207 | "Goat is not to be trusted, don't know about llama yet", 208 | }, 209 | MachineEditable: true, 210 | }, 211 | }, 212 | 213 | { 214 | name: "info_after_extra_notes", 215 | in: comment(0, 216 | "DuckCo", 217 | "Duck is in charge", 218 | "https://example.com", 219 | "Submitted by Duck <duck@example.com>", 220 | ), 221 | want: MaintainerInfo{ 222 | Name: "DuckCo", 223 | URLs: urls("https://example.com"), 224 | Maintainers: emails("Duck", "duck@example.com"), 225 | Other: []string{ 226 | "Duck is in charge", 227 | }, 228 | MachineEditable: false, 229 | }, 230 | }, 231 | 232 | { 233 | name: "obfuscated_email", 234 | in: comment(0, 235 | "lohmus", 236 | "someone at lohmus dot me", 237 | ), 238 | want: MaintainerInfo{ 239 | Name: "lohmus", 240 | Maintainers: emails("", "someone@lohmus.me"), 241 | MachineEditable: true, 242 | }, 243 | }, 244 | } 245 | 246 | for _, tc := range tests { 247 | got := extractMaintainerInfo(tc.in) 248 | checkDiff(t, "maintainer info", got, tc.want) 249 | } 250 | } 251 | 252 | func urls(us ...string) []*url.URL { 253 | var ret []*url.URL 254 | for _, s := range us { 255 | ret = append(ret, mustURL(s)) 256 | } 257 | return ret 258 | } 259 | 260 | func mustURL(s string) *url.URL { 261 | u, err := url.Parse(s) 262 | if err != nil { 263 | panic(err) 264 | } 265 | return u 266 | } 267 | 268 | func emails(elts ...string) []*mail.Address { 269 | var ret []*mail.Address 270 | for i := 0; i < len(elts); i += 2 { 271 | ret = append(ret, email(elts[i], elts[i+1])) 272 | } 273 | return ret 274 | } 275 | 276 | func email(name, email string) *mail.Address { 277 | return &mail.Address{ 278 | Name: name, 279 | Address: email, 280 | } 281 | } 282 | -------------------------------------------------------------------------------- /tools/internal/parser/errors.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | //
ErrInvalidEncoding reports that the input is encoded with 8 | // something other than UTF-8. 9 | type ErrInvalidEncoding struct { 10 | Encoding string 11 | } 12 | 13 | func (e ErrInvalidEncoding) Error() string { 14 | return fmt.Sprintf("invalid character encoding %s", e.Encoding) 15 | } 16 | 17 | // ErrInvalidUnicode reports that a line contains characters that are 18 | // not valid Unicode. 19 | type ErrInvalidUnicode struct { 20 | SourceRange 21 | } 22 | 23 | func (e ErrInvalidUnicode) Error() string { 24 | return fmt.Sprintf("%s: invalid Unicode character(s)", e.SourceRange.LocationString()) 25 | } 26 | 27 | // ErrSectionInSuffixBlock reports that a comment within a suffix 28 | // block contains a section delimiter. 29 | type ErrSectionInSuffixBlock struct { 30 | SourceRange 31 | } 32 | 33 | func (e ErrSectionInSuffixBlock) Error() string { 34 | return fmt.Sprintf("%s: section delimiter not allowed in suffix block comment", e.SourceRange.LocationString()) 35 | } 36 | 37 | // ErrUnclosedSection reports that a file section was not closed 38 | // properly before EOF. 39 | type ErrUnclosedSection struct { 40 | Section *Section 41 | } 42 | 43 | func (e ErrUnclosedSection) Error() string { 44 | return fmt.Sprintf("%s: section %q is missing its closing marker", e.Section.SourceRange.LocationString(), e.Section.Name) 45 | } 46 | 47 | // ErrNestedSection reports that a file section is being started while 48 | // already within a section. 49 | type ErrNestedSection struct { 50 | SourceRange 51 | Name string 52 | Section *Section 53 | } 54 | 55 | func (e ErrNestedSection) Error() string { 56 | return fmt.Sprintf("%s: section %q is nested inside section %q (%s)", e.SourceRange.LocationString(), e.Name, e.Section.Name, e.Section.SourceRange.LocationString()) 57 | } 58 | 59 | // ErrUnstartedSection reports that a section end marker was found 60 | // without a corresponding start. 61 | type ErrUnstartedSection struct { 62 | SourceRange 63 | Name string 64 | } 65 | 66 | func (e ErrUnstartedSection) Error() string { 67 | return fmt.Sprintf("%s: end marker for non-existent section %q", e.SourceRange.LocationString(), e.Name) 68 | } 69 | 70 | // ErrMismatchedSection reports that a file section was started 71 | // under one name but ended under another. 72 | type ErrMismatchedSection struct { 73 | SourceRange 74 | EndName string 75 | Section *Section 76 | } 77 | 78 | func (e ErrMismatchedSection) Error() string { 79 | return fmt.Sprintf("%s: section %q (%s) closed with wrong name %q", e.SourceRange.LocationString(), e.Section.Name, e.Section.SourceRange.LocationString(), e.EndName) 80 | } 81 | 82 | // ErrUnknownSectionMarker reports that a line looks like a file section 83 | // marker (e.g. "===BEGIN ICANN DOMAINS==="), but is not one of the 84 | // recognized kinds of marker. 85 | type ErrUnknownSectionMarker struct { 86 | SourceRange 87 | } 88 | 89 | func (e ErrUnknownSectionMarker) Error() string { 90 | return fmt.Sprintf("%s: unknown kind of section marker", e.SourceRange.LocationString()) 91 | } 92 | 93 | // ErrMissingEntityName reports that a block of suffixes does not have a 94 | // parseable owner name in its header comment.
95 | type ErrMissingEntityName struct { 96 | Suffixes *Suffixes 97 | } 98 | 99 | func (e ErrMissingEntityName) Error() string { 100 | return fmt.Sprintf("%s: suffix block has no owner name", e.Suffixes.SourceRange.LocationString()) 101 | } 102 | 103 | // ErrMissingEntityEmail reports that a block of suffixes does not have a 104 | // parseable contact email address in its header comment. 105 | type ErrMissingEntityEmail struct { 106 | Suffixes *Suffixes 107 | } 108 | 109 | func (e ErrMissingEntityEmail) Error() string { 110 | return fmt.Sprintf("%s: suffix block has no contact email", e.Suffixes.SourceRange.LocationString()) 111 | } 112 | 113 | // ErrInvalidSuffix reports that a suffix is not a valid PSL 114 | // entry. 115 | type ErrInvalidSuffix struct { 116 | SourceRange 117 | Suffix string 118 | Err error 119 | } 120 | 121 | func (e ErrInvalidSuffix) Error() string { 122 | return fmt.Sprintf("%s: invalid suffix %q: %v", e.SourceRange.LocationString(), e.Suffix, e.Err) 123 | } 124 | 125 | type ErrCommentPreventsSuffixSort struct { 126 | SourceRange 127 | } 128 | 129 | func (e ErrCommentPreventsSuffixSort) Error() string { 130 | return fmt.Sprintf("%s: comment prevents full sorting of suffixes", e.SourceRange.LocationString()) 131 | } 132 | 133 | type ErrCommentPreventsSectionSort struct { 134 | SourceRange 135 | } 136 | 137 | func (e ErrCommentPreventsSectionSort) Error() string { 138 | return fmt.Sprintf("%s: comment prevents full sorting of PSL section", e.SourceRange.LocationString()) 139 | } 140 | 141 | type ErrDuplicateSection struct { 142 | *Section 143 | FirstDefinition *Section 144 | } 145 | 146 | func (e ErrDuplicateSection) Error() string { 147 | return fmt.Sprintf("%s: duplicate section %q, first definition at %s", e.LocationString(), e.Name, e.FirstDefinition.LocationString()) 148 | } 149 | 150 | type ErrUnknownSection struct { 151 | *Section 152 | } 153 | 154 | func (e ErrUnknownSection) Error() string { 155 | return fmt.Sprintf("%s: unknown section %q, allowed sections are 'ICANN DOMAINS' and 'PRIVATE DOMAINS'", e.LocationString(), e.Name) 156 | } 157 | 158 | type ErrMissingSection struct { 159 | Name string 160 | } 161 | 162 | func (e ErrMissingSection) Error() string { 163 | return fmt.Sprintf("missing required section %q", e.Name) 164 | } 165 | 166 | type ErrDuplicateSuffix struct { 167 | Name string 168 | Block // Suffix or Wildcard 169 | FirstDefinition Block // Suffix or Wildcard 170 | } 171 | 172 | func (e ErrDuplicateSuffix) Error() string { 173 | return fmt.Sprintf("%s: duplicate suffix definition for %q, first definition at %s", e.SrcRange().LocationString(), e.Name, e.FirstDefinition.SrcRange().LocationString()) 174 | } 175 | 176 | type ErrConflictingSuffixAndException struct { 177 | *Suffix 178 | Wildcard *Wildcard 179 | } 180 | 181 | func (e ErrConflictingSuffixAndException) Error() string { 182 | return fmt.Sprintf("%s: suffix %s conflicts with exception in wildcard at %s", e.LocationString(), e.Domain, e.Wildcard.LocationString()) 183 | } 184 | 185 | type ErrMissingTXTRecord struct { 186 | Block 187 | } 188 | 189 | func (e ErrMissingTXTRecord) Error() string { 190 | var name string 191 | switch v := e.Block.(type) { 192 | case *Suffix: 193 | name = v.Domain.String() 194 | case *Wildcard: 195 | name = v.Domain.String() 196 | default: 197 | panic(fmt.Sprintf("unexpected block type %T in ErrMissingTXTRecord", e.Block)) 198 | } 199 | return fmt.Sprintf("%s: suffix %s has no TXT record", e.SrcRange().LocationString(), name) 200 | } 201 | 202 | type
ErrTXTRecordMismatch struct {
203 | Block
204 | PR int
205 | }
206 |
207 | func (e ErrTXTRecordMismatch) Error() string {
208 | switch v := e.Block.(type) {
209 | case *Suffix:
210 | return fmt.Sprintf("%s: suffix %s has a TXT record pointing to https://github.com/publicsuffix/list/pull/%d, but that PR does not change this suffix", e.SrcRange().LocationString(), v.Domain, e.PR)
211 | case *Wildcard:
212 | return fmt.Sprintf("%s: wildcard *.%s has a TXT record pointing to https://github.com/publicsuffix/list/pull/%d, but that PR does not change this wildcard", e.SrcRange().LocationString(), v.Domain, e.PR)
213 | default:
214 | panic(fmt.Sprintf("unexpected block type %T in ErrTXTRecordMismatch", e.Block))
215 | }
216 | }
217 |
218 | type ErrTXTCheckFailure struct {
219 | Block
220 | Err error
221 | }
222 |
223 | func (e ErrTXTCheckFailure) Error() string {
224 | var name string
225 | switch v := e.Block.(type) {
226 | case *Suffix:
227 | name = v.Domain.String()
228 | case *Wildcard:
229 | name = v.Domain.String()
230 | default:
231 | panic(fmt.Sprintf("unexpected block type %T in ErrTXTCheckFailure", e.Block))
232 | }
233 | return fmt.Sprintf("%s: error checking suffix %s: %v", e.SrcRange().LocationString(), name, e.Err)
234 | }
235 | -------------------------------------------------------------------------------- /tools/internal/parser/validate_test.go: --------------------------------------------------------------------------------
1 | package parser
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestValidateEntityMetadata(t *testing.T) {
8 | in := list(
9 | section(1, 1, "PRIVATE DOMAINS",
10 | suffixes(1, 1, info("", nil, emails("Example", "example@example.com"), nil, true),
11 | comment(1, "Submitted by Example <example@example.com>"),
12 | suffix(2, "example.com"),
13 | ),
14 |
15 | suffixes(2, 2, info("Example Ltd", nil, nil, nil, true),
16 | comment(1, "Example Ltd"),
17 | suffix(2, "example.org"),
18 | ),
19 |
20 | suffixes(3, 3, noInfo,
21 | suffix(1, "example.net"),
22 | ),
23 |
24 | suffixes(4, 4, info("Foo Ltd", nil, emails("Someone", "example@example.com"), nil, true),
25 | comment(1, "Submitted by Someone <example@example.com>"),
26 | suffix(2, "blah.example.com"),
27 | ),
28 | ),
29 | )
30 | want := []error{
31 | ErrMissingEntityName{
32 | Suffixes: suffixes(1, 1,
33 | info("", nil, emails("Example", "example@example.com"), nil, true),
34 | comment(1, "Submitted by Example <example@example.com>"),
35 | suffix(2, "example.com"),
36 | ),
37 | },
38 | ErrMissingEntityEmail{
39 | Suffixes: suffixes(2, 2, info("Example Ltd", nil, nil, nil, true),
40 | comment(1, "Example Ltd"),
41 | suffix(2, "example.org"),
42 | ),
43 | },
44 | ErrMissingEntityName{
45 | Suffixes: suffixes(3, 3, noInfo,
46 | suffix(1, "example.net"),
47 | ),
48 | },
49 | ErrMissingEntityEmail{
50 | Suffixes: suffixes(3, 3, noInfo,
51 | suffix(1, "example.net"),
52 | ),
53 | },
54 | }
55 |
56 | got := validateEntityMetadata(in)
57 | checkDiff(t, "validateEntityMetadata", got, want)
58 |
59 | // Now turn the input into a diff against a base version, and check the reduced error set.
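// prev below sketches an older revision of the same list: identical
// to in, except that the noInfo "example.net" block is absent, so
// after SetBaseVersion only that block (plus any identity dupes, as
// noted below) should be treated as changed.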
60 | prev := list(
61 | section(1, 1, "PRIVATE DOMAINS",
62 | suffixes(1, 1, info("", nil, emails("Example", "example@example.com"), nil, true),
63 | comment(1, "Submitted by Example <example@example.com>"),
64 | suffix(2, "example.com"),
65 | ),
66 |
67 | suffixes(2, 2, info("Example Ltd", nil, nil, nil, true),
68 | comment(1, "Example Ltd"),
69 | suffix(2, "example.org"),
70 | ),
71 |
72 | suffixes(3, 3, info("Foo Ltd", nil, emails("Someone", "example@example.com"), nil, true),
73 | comment(1, "Submitted by Someone <example@example.com>"),
74 | suffix(2, "blah.example.com"),
75 | ),
76 | ),
77 | )
78 |
79 | in.SetBaseVersion(prev, false)
80 | got = validateEntityMetadata(in)
81 |
82 | // Second suffix block no longer reports any errors. First one
83 | // still does, because its empty name is a dupe of the last block.
84 | want = []error{
85 | ErrMissingEntityName{
86 | Suffixes: suffixes(1, 1,
87 | info("", nil, emails("Example", "example@example.com"), nil, true),
88 | markUnchanged(comment(1, "Submitted by Example <example@example.com>")),
89 | markUnchanged(suffix(2, "example.com")),
90 | ),
91 | },
92 | ErrMissingEntityName{
93 | Suffixes: suffixes(3, 3, noInfo,
94 | suffix(1, "example.net"),
95 | ),
96 | },
97 | ErrMissingEntityEmail{
98 | Suffixes: suffixes(3, 3, noInfo,
99 | suffix(1, "example.net"),
100 | ),
101 | },
102 | }
103 |
104 | checkDiff(t, "validateEntityMetadata (changed blocks only)", got, want)
105 | }
106 |
107 | func TestValidateExpectedSections(t *testing.T) {
108 | tests := []struct {
109 | name string
110 | in *List
111 | want []error
112 | }{
113 | {
114 | name: "ok",
115 | in: list(
116 | section(1, 1, "ICANN DOMAINS"),
117 | section(2, 2, "PRIVATE DOMAINS"),
118 | ),
119 | want: nil,
120 | },
121 | {
122 | name: "all_missing",
123 | in: list(),
124 | want: []error{
125 | ErrMissingSection{"ICANN DOMAINS"},
126 | ErrMissingSection{"PRIVATE DOMAINS"},
127 | },
128 | },
129 | {
130 | name: "one_missing",
131 | in: list(
132 | section(1, 1, "ICANN DOMAINS"),
133 | ),
134 | want: []error{
135 | ErrMissingSection{"PRIVATE DOMAINS"},
136 | },
137 | },
138 | {
139 | name: "unknown",
140 | in: list(
141 | section(1, 1, "ICANN DOMAINS"),
142 | section(2, 2, "PRIVATE DOMAINS"),
143 | section(3, 3, "NON EUCLIDEAN DOMAINS"),
144 | ),
145 | want: []error{
146 | ErrUnknownSection{section(3, 3, "NON EUCLIDEAN DOMAINS")},
147 | },
148 | },
149 | {
150 | name: "duplicate_known",
151 | in: list(
152 | section(1, 1, "ICANN DOMAINS"),
153 | section(2, 2, "PRIVATE DOMAINS"),
154 | section(3, 3, "ICANN DOMAINS"),
155 | ),
156 | want: []error{
157 | ErrDuplicateSection{
158 | section(3, 3, "ICANN DOMAINS"),
159 | section(1, 1, "ICANN DOMAINS"),
160 | },
161 | },
162 | },
163 | {
164 | name: "duplicate_unknown",
165 | in: list(
166 | section(1, 1, "RIDICULOUS DOMAINS"),
167 | section(2, 2, "ICANN DOMAINS"),
168 | section(3, 3, "PRIVATE DOMAINS"),
169 | section(4, 4, "RIDICULOUS DOMAINS"),
170 | ),
171 | want: []error{
172 | ErrUnknownSection{section(1, 1, "RIDICULOUS DOMAINS")},
173 | ErrUnknownSection{section(4, 4, "RIDICULOUS DOMAINS")},
174 | },
175 | },
176 | }
177 |
178 | for _, tc := range tests {
179 | t.Run(tc.name, func(t *testing.T) {
180 | got := validateExpectedSections(tc.in)
181 | checkDiff(t, "validateExpectedSections output", got, tc.want)
182 | })
183 | }
184 | }
185 |
186 | func TestValidateSuffixUniqueness(t *testing.T) {
187 | tests := []struct {
188 | name string
189 | in *List
190 | want []error
191 | }{
192 | {
193 | name: "ok",
194 | in: list(
195 | section(1, 2, "PRIVATE DOMAINS",
196 | suffixes(2, 3, noInfo,
197 |
suffix(3, "foo.com"), 198 | suffix(4, "bar.com"), 199 | ), 200 | ), 201 | ), 202 | want: nil, 203 | }, 204 | 205 | { 206 | name: "dupe_suffixes", 207 | in: list( 208 | section(1, 2, "PRIVATE DOMAINS", 209 | suffixes(2, 3, noInfo, 210 | suffix(3, "foo.com"), 211 | suffix(4, "bar.com"), 212 | suffix(5, "foo.com"), 213 | ), 214 | ), 215 | ), 216 | want: []error{ 217 | ErrDuplicateSuffix{"foo.com", suffix(5, "foo.com"), suffix(3, "foo.com")}, 218 | }, 219 | }, 220 | 221 | { 222 | name: "dupe_wildcards", 223 | in: list( 224 | section(1, 2, "PRIVATE DOMAINS", 225 | suffixes(2, 3, noInfo, 226 | wildcard(3, 4, "foo.com"), 227 | suffix(4, "bar.com"), 228 | wildcard(5, 6, "foo.com"), 229 | ), 230 | ), 231 | ), 232 | want: []error{ 233 | ErrDuplicateSuffix{"*.foo.com", wildcard(5, 6, "foo.com"), wildcard(3, 4, "foo.com")}, 234 | }, 235 | }, 236 | 237 | { 238 | name: "dupe_wildcard_exceptions", 239 | in: list( 240 | section(1, 2, "PRIVATE DOMAINS", 241 | suffixes(2, 3, noInfo, 242 | wildcard(3, 4, "foo.com", "a", "b", "c", "a"), 243 | suffix(4, "bar.com"), 244 | suffix(5, "b.foo.com"), 245 | ), 246 | ), 247 | ), 248 | want: []error{ 249 | ErrConflictingSuffixAndException{ 250 | Suffix: suffix(5, "b.foo.com"), 251 | Wildcard: wildcard(3, 4, "foo.com", "a", "b", "c", "a"), 252 | }, 253 | }, 254 | }, 255 | 256 | { 257 | name: "dupe_spanning_blocks_and_sections", 258 | in: list( 259 | section(1, 2, "PRIVATE DOMAINS", 260 | suffixes(2, 3, noInfo, 261 | suffix(3, "foo.com"), 262 | suffix(4, "bar.com"), 263 | ), 264 | suffixes(5, 6, noInfo, 265 | suffix(6, "foo.com"), 266 | ), 267 | ), 268 | section(7, 8, "ICANN DOMAINS", 269 | suffixes(8, 9, noInfo, 270 | suffix(9, "qux.com"), 271 | suffix(10, "foo.com"), 272 | ), 273 | ), 274 | ), 275 | want: []error{ 276 | ErrDuplicateSuffix{"foo.com", suffix(6, "foo.com"), suffix(3, "foo.com")}, 277 | ErrDuplicateSuffix{"foo.com", suffix(10, "foo.com"), suffix(3, "foo.com")}, 278 | }, 279 | }, 280 | } 281 | 282 | for _, tc := range tests { 283 | t.Run(tc.name, func(t *testing.T) { 284 | got := validateSuffixUniqueness(tc.in) 285 | checkDiff(t, "validateSuffixUniqueness", got, tc.want) 286 | }) 287 | } 288 | } 289 | -------------------------------------------------------------------------------- /tools/internal/parser/metadata.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "net/mail" 5 | "net/url" 6 | "strings" 7 | ) 8 | 9 | // extractMaintainerInfo extracts structured maintainer metadata from 10 | // comment. 11 | func extractMaintainerInfo(comment *Comment) MaintainerInfo { 12 | if comment == nil || len(comment.Text) == 0 { 13 | return MaintainerInfo{MachineEditable: true} 14 | } 15 | 16 | var ( 17 | ret = MaintainerInfo{ 18 | MachineEditable: true, 19 | } 20 | lines = comment.Text 21 | firstUnusableLine = -1 22 | ) 23 | 24 | // The first line of metadata usually follows a standard 25 | // form. Handle that first, then scan through the rest of the 26 | // comment to find any further stuff. 27 | name, siteURL, email, ok := splitNameish(lines[0]) 28 | if ok { 29 | ret.Name = name 30 | if siteURL != nil { 31 | ret.URLs = append(ret.URLs, siteURL) 32 | } 33 | if email != nil { 34 | ret.Maintainers = append(ret.Maintainers, email) 35 | } 36 | lines = lines[1:] 37 | } 38 | 39 | // Aside from the special first line, remaining lines could be 40 | // maintainer emails in a few formats, or URLs, or something 41 | // else. 
We accumulate everything we can parse, but also keep
42 | // track of whether the information is laid out such that we could
43 | // write the information back out without data loss (although not
44 | // necessarily in the exact same format).
45 | for i, line := range lines {
46 | lineUsed := false
47 | if emails := getSubmitters(line); len(emails) > 0 {
48 | ret.Maintainers = append(ret.Maintainers, emails...)
49 | lineUsed = true
50 | } else if email, err := mail.ParseAddress(line); err == nil {
51 | ret.Maintainers = append(ret.Maintainers, email)
52 | lineUsed = true
53 | } else if u := getURL(line); u != nil {
54 | ret.URLs = append(ret.URLs, u)
55 | lineUsed = true
56 | } else if i == 0 && ret.Name == "" {
57 | ret.Name = line
58 | lineUsed = true
59 | } else {
60 | ret.Other = append(ret.Other, line)
61 | if firstUnusableLine < 0 {
62 | firstUnusableLine = i + 1
63 | }
64 | }
65 |
66 | if lineUsed && firstUnusableLine >= 0 {
67 | // A parseable line came after non-parseable lines, so we cannot
68 | // confidently write the data back out without data loss.
69 | ret.MachineEditable = false
70 | }
71 | }
72 |
73 | return ret
74 | }
75 |
76 | // submittedBy is the conventional text that precedes email contact
77 | // information in a PSL file. Most PSL entries say "Submitted by", but
78 | // there are 4 entries that are lowercase, and so we do a
79 | // case-insensitive comparison when looking for this marker.
80 | const submittedBy = "submitted by"
81 |
82 | // splitNameish tries to parse line in the form:
83 | //
84 | // "<name>: <URL or submitter email>"
85 | //
86 | // It returns the information it was able to extract. Returns all zero
87 | // values if line does not conform to the expected form.
88 | //
89 | // As of 2024-06, a few legacy representations are also handled to
90 | // improve compatibility with the existing PSL data:
91 | //
92 | // - "<name> (<URL>)", where the URL is sometimes allowed to
93 | // omit https://.
94 | // - "<name>: Submitted by <contact>", where the second
95 | // part is any variant accepted by getSubmitters.
96 | // - Any amount of whitespace on either side of the colon (or
97 | // fullwidth colon).
98 | func splitNameish(line string) (name string, url *url.URL, submitter *mail.Address, ok bool) {
99 | if strings.HasPrefix(strings.ToLower(line), submittedBy) {
100 | // submitted-by lines are handled separately elsewhere, and
101 | // can be misinterpreted as entity names.
102 | return "", nil, nil, false
103 | }
104 |
105 | // Some older entries are of the form "entity name (url)".
106 | if strings.HasSuffix(line, ")") {
107 | if name, url, ok := splitNameAndURLInParens(line); ok {
108 | return name, url, nil, true
109 | }
110 | }
111 |
112 | name, rest, ok := strings.Cut(line, ":")
113 | if !ok {
114 | return "", nil, nil, false
115 | }
116 |
117 | // Clean up whitespace on either side of the colon.
118 | name = strings.TrimSpace(name)
119 | rest = strings.TrimSpace(rest)
120 |
121 | if u := getURL(rest); u != nil {
122 | return name, u, nil, true
123 | } else if emails := getSubmitters(rest); len(emails) == 1 {
124 | return name, nil, emails[0], true
125 | }
126 | return "", nil, nil, false
127 | }
128 |
129 | // splitNameAndURLInParens tries to parse line in the form:
130 | //
131 | // "<name> (<URL>)"
132 | //
133 | // It returns the information it was able to extract, or ok=false if
134 | // the line is not in the expected form.
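// (Illustrative input: "Example Ltd (https://example.com)" would
// yield name "Example Ltd" and the parsed URL.)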
135 | func splitNameAndURLInParens(line string) (name string, url *url.URL, ok bool) {
136 | idx := strings.LastIndexByte(line, '(')
137 | if idx == -1 {
138 | return "", nil, false
139 | }
140 | name = strings.TrimSpace(line[:idx])
141 | urlStr := strings.TrimSpace(line[idx+1 : len(line)-1])
142 |
143 | if u := getURL(urlStr); u != nil {
144 | return name, u, true
145 | }
146 |
147 | return "", nil, false
148 | }
149 |
150 | // getURL tries to parse line as an HTTP/HTTPS URL.
151 | // Returns the URL if line is a well formed URL and nothing but a URL,
152 | // or nil otherwise.
153 | func getURL(line string) *url.URL {
154 | // One PSL entry says "see <URL>" instead of just a URL.
155 | //
156 | // TODO: fix the source and delete this hack.
157 | if strings.HasPrefix(line, "see https://www.information.aero") {
158 | line = strings.TrimPrefix(line, "see ")
159 | }
160 |
161 | u, err := url.Parse(line)
162 | if err != nil {
163 | return nil
164 | }
165 |
166 | if u.Scheme != "http" && u.Scheme != "https" {
167 | // Caller might have split https://foo.com into [https :
168 | // //foo.com], and the last part is a valid scheme-relative
169 | // URL. Only accept parses that feature an explicit http(s)
170 | // scheme.
171 | return nil
172 | }
173 |
174 | return u
175 | }
176 |
177 | // getSubmitters tries to parse line as a submitter email line, usually:
178 | //
179 | // Submitted by Person Name <person@example.com>
180 | //
181 | // To improve compatibility, a few legacy freeform styles are also
182 | // attempted if the one above fails.
183 | //
184 | // Returns the parsed RFC 5322 addresses, or nil if line does not
185 | // conform to the expected shape.
186 | func getSubmitters(line string) []*mail.Address {
187 | if strings.HasPrefix(strings.ToLower(line), submittedBy) {
188 | line = line[len(submittedBy):]
189 | }
190 | // Some entries read "Submitted by: ..." with an extra colon.
191 | line = strings.TrimLeft(line, ":")
192 | line = strings.TrimSpace(line)
193 | // Some ICANN domains lead with "Submitted by registry".
194 | line = strings.TrimPrefix(line, "registry ")
195 |
196 | var ret []*mail.Address
197 | emailStrs := strings.Split(line, " and ")
198 |
199 | fullyParsed := true
200 | for _, emailStr := range emailStrs {
201 | addr, err := mail.ParseAddress(emailStr)
202 | if err != nil {
203 | fullyParsed = false
204 | continue
205 | }
206 | ret = append(ret, addr)
207 | }
208 |
209 | if fullyParsed {
210 | // Found a way to consume the entire input, we're done.
211 | return ret
212 | }
213 |
214 | // One current entry uses old school email obfuscation to foil
215 | // spam bots, which makes it an invalid address.
216 | //
217 | // TODO: fix the source and delete this hack.
218 | if strings.Contains(line, "lohmus dot me") {
219 | cleaned := strings.Replace(line, " at ", "@", 1)
220 | cleaned = strings.Replace(cleaned, " dot ", ".", 1)
221 | if addr, err := mail.ParseAddress(cleaned); err == nil {
222 | return []*mail.Address{addr}
223 | }
224 | }
225 |
226 | // The normal form failed but there is a "submitted by". If the
227 | // last word is an email address, assume the remainder is a name.
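// (Illustrative input: "Foo Bar foo@example.com" yields the address
// foo@example.com with display name "Foo Bar".)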
228 | fs := strings.Fields(line) 229 | if len(fs) > 0 { 230 | if addr, err := mail.ParseAddress(fs[len(fs)-1]); err == nil { 231 | name := strings.Join(fs[:len(fs)-1], " ") 232 | name = strings.Trim(name, " ,:") 233 | addr.Name = name 234 | return []*mail.Address{addr} 235 | } 236 | } 237 | 238 | return nil 239 | } 240 | -------------------------------------------------------------------------------- /linter/pslint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*-# 3 | # 4 | # PSL linter written in python 5 | # 6 | # Copyright 2016 Tim Rühsen (tim dot ruehsen at gmx dot de). All rights reserved. 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a 9 | # copy of this software and associated documentation files (the "Software"), 10 | # to deal in the Software without restriction, including without limitation 11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | # and/or sell copies of the Software, and to permit persons to whom the 13 | # Software is furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 
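# Sketch of the checks implemented below in lint_psl(): section
# begin/end markers, rules outside of sections, valid UTF-8, NFKC
# normalization, lowercasing, punycode and '--' labels, allowed
# characters, wildcard/exception combinations, and duplicate rules.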
25 |
26 | import sys
27 | import codecs
28 | import unicodedata
29 |
30 | nline = 0
31 | line = ""
32 | orig_line = ""
33 | warnings = 0
34 | errors = 0
35 | skip_order_check = False
36 |
37 | def warning(msg):
38 | global warnings, orig_line, nline
39 | print('%d: warning: %s%s' % (nline, msg, ": \'" + orig_line + "\'" if orig_line else ""))
40 | warnings += 1
41 |
42 | def error(msg):
43 | global errors, orig_line, nline
44 | print('%d: error: %s%s' % (nline, msg, ": \'" + orig_line + "\'" if orig_line else ""))
45 | errors += 1
46 | # skip_order_check = True
47 |
48 | def print_psl(list):
49 | for domain in list:
50 | print(".".join(str(label) for label in reversed(domain)))
51 |
52 | def psl_key(s):
53 | if s[0] == '*':
54 | return 0
55 | if s[0] == '!':
56 | return 1
57 | return 2
58 |
59 | def check_order(group):
60 | """Check the correct order of a domain group"""
61 | global skip_order_check
62 |
63 | try:
64 | if skip_order_check or len(group) < 2:
65 | skip_order_check = False
66 | return
67 |
68 | # check if the TLD is identical within the group
69 | if any(group[0][0] != labels[0] for labels in group):
70 | warning('Domain group TLD is not consistent')
71 |
72 | # sort by # of labels, label-by-label (labels are in reversed order)
73 | sorted_group = sorted(group, key = lambda labels: (len(labels), psl_key(labels[-1][0]), labels))
74 |
75 | if group != sorted_group:
76 | warning('Incorrectly sorted group of domains')
77 | print(" " + str(group))
78 | print(" " + str(sorted_group))
79 | print("Correct sorting would be:")
80 | print_psl(sorted_group)
81 |
82 | finally:
83 | del group[:]
84 |
85 |
86 | def lint_psl(infile):
87 | """Parses PSL file and performs syntax checking"""
88 | global orig_line, nline
89 |
90 | PSL_FLAG_EXCEPTION = (1<<0)
91 | PSL_FLAG_WILDCARD = (1<<1)
92 | PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
93 | PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
94 | PSL_FLAG_PLAIN = (1<<4) # just used for PSL syntax checking
95 |
96 | line2number = {}
97 | line2flag = {}
98 | group = []
99 | section = 0
100 | icann_sections = 0
101 | private_sections = 0
102 |
103 | lines = [line.strip('\n') for line in infile]
104 |
105 | for line in lines:
106 | nline += 1
107 |
108 | # check for leading/trailing whitespace
109 | stripped = line.strip()
110 | if stripped != line:
111 | line = line.replace('\t','\\t')
112 | line = line.replace('\r','^M')
113 | orig_line = line
114 | warning('Leading/Trailing whitespace')
115 | orig_line = line
116 | line = stripped
117 |
118 | # empty line (end of sorted domain group)
119 | if not line:
120 | # check_order(group)
121 | continue
122 |
123 | # check for section begin/end
124 | if line[0:2] == "//":
125 | # check_order(group)
126 |
127 | if section == 0:
128 | if line == "// ===BEGIN ICANN DOMAINS===":
129 | section = PSL_FLAG_ICANN
130 | icann_sections += 1
131 | elif line == "// ===BEGIN PRIVATE DOMAINS===":
132 | section = PSL_FLAG_PRIVATE
133 | private_sections += 1
134 | elif line[3:11] == "===BEGIN":
135 | error('Unexpected begin of unknown section')
136 | elif line[3:9] == "===END":
137 | error('End of section without previous begin')
138 | elif section == PSL_FLAG_ICANN:
139 | if line == "// ===END ICANN DOMAINS===":
140 | section = 0
141 | elif line[3:11] == "===BEGIN":
142 | error('Unexpected begin of section')
143 | elif line[3:9] == "===END":
144 | error('Unexpected end of section')
145 | elif section == PSL_FLAG_PRIVATE:
146 | if line == "// ===END PRIVATE DOMAINS===":
147 | section = 0
148 | elif line[3:11]
== "===BEGIN": 149 | error('Unexpected begin of section') 150 | elif line[3:9] == "===END": 151 | error('Unexpected end of section') 152 | 153 | continue # processing of comments ends here 154 | 155 | # No rule must be outside of a section 156 | if section == 0: 157 | error('Rule outside of section') 158 | 159 | group.append(list(reversed(line.split('.')))) 160 | 161 | # decode UTF-8 input into unicode, needed only for python 2.x 162 | try: 163 | if sys.version_info[0] < 3: 164 | line = line.decode('utf-8') 165 | else: 166 | line.encode('utf-8') 167 | except (UnicodeDecodeError, UnicodeEncodeError): 168 | orig_line = None 169 | error('Invalid UTF-8 character') 170 | continue 171 | 172 | # rules must be NFC coded (Unicode's Normal Form Kanonical Composition) 173 | if unicodedata.normalize("NFKC", line) != line: 174 | error('Rule must be NFKC') 175 | 176 | # each rule must be lowercase (or more exactly: not uppercase and not titlecase) 177 | if line != line.lower(): 178 | error('Rule must be lowercase') 179 | 180 | # strip leading wildcards 181 | flags = section 182 | # while line[0:2] == '*.': 183 | if line[0:2] == '*.': 184 | flags |= PSL_FLAG_WILDCARD 185 | line = line[2:] 186 | 187 | if line[0] == '!': 188 | flags |= PSL_FLAG_EXCEPTION 189 | line = line[1:] 190 | else: 191 | flags |= PSL_FLAG_PLAIN 192 | 193 | # wildcard and exception must not combine 194 | if flags & PSL_FLAG_WILDCARD and flags & PSL_FLAG_EXCEPTION: 195 | error('Combination of wildcard and exception') 196 | continue 197 | 198 | labels = line.split('.') 199 | 200 | if flags & PSL_FLAG_EXCEPTION and len(labels) > 1: 201 | domain = ".".join(str(label) for label in labels[1:]) 202 | if not domain in line2flag: 203 | error('Exception without previous wildcard') 204 | elif not line2flag[domain] & PSL_FLAG_WILDCARD: 205 | error('Exception without previous wildcard') 206 | 207 | for label in labels: 208 | if not label: 209 | error('Leading/trailing or multiple dot') 210 | continue 211 | 212 | if label[0:4] == 'xn--': 213 | error('Punycode found') 214 | continue 215 | 216 | if '--' in label: 217 | error('Double minus found') 218 | continue 219 | 220 | # allowed are a-z,0-9,- and unicode >= 128 (maybe that can be finetuned a bit !?) 
221 | for c in label:
222 | if not c.isalnum() and c != '-' and ord(c) < 128:
223 | error('Illegal character')
224 | break
225 |
226 | if line in line2flag:
227 | '''Found existing entry:
228 | Combination of exception and plain rule is contradictory
229 | !foo.bar + foo.bar
230 | Doublette, since *.foo.bar implies foo.bar:
231 | foo.bar + *.foo.bar
232 | Allowed:
233 | !foo.bar + *.foo.bar
234 | '''
235 | error('Found doublette/ambiguity (previous line was %d)' % line2number[line])
236 |
237 | line2number[line] = nline
238 | line2flag[line] = flags
239 |
240 | orig_line = None
241 |
242 | if section == PSL_FLAG_ICANN:
243 | error('ICANN section not closed')
244 | elif section == PSL_FLAG_PRIVATE:
245 | error('PRIVATE section not closed')
246 |
247 | if icann_sections < 1:
248 | warning('No ICANN section found')
249 | elif icann_sections > 1:
250 | warning('%d ICANN sections found' % icann_sections)
251 |
252 | if private_sections < 1:
253 | warning('No PRIVATE section found')
254 | elif private_sections > 1:
255 | warning('%d PRIVATE sections found' % private_sections)
256 |
257 | def usage():
258 | """Prints the usage"""
259 | print('usage: %s PSLfile' % sys.argv[0])
260 | print('or %s - # To read PSL from STDIN' % sys.argv[0])
261 | exit(1)
262 |
263 |
264 | def main():
265 | """Check syntax of a PSL file"""
266 | if len(sys.argv) < 2:
267 | usage()
268 |
269 | with sys.stdin if sys.argv[-1] == '-' else open(sys.argv[-1], 'r', encoding='utf-8', errors="surrogateescape") as infile:
270 | lint_psl(infile)
271 |
272 | return errors != 0
273 |
274 |
275 | if __name__ == '__main__':
276 | sys.exit(main())
277 | -------------------------------------------------------------------------------- /tools/psltool/psltool.go: --------------------------------------------------------------------------------
1 | // psltool is a CLI tool to manipulate and validate PSL files.
2 | package main
3 |
4 | import (
5 | "bytes"
6 | "context"
7 | "errors"
8 | "fmt"
9 | "io"
10 | "log"
11 | "os"
12 | "os/signal"
13 | "path/filepath"
14 | "strconv"
15 | "strings"
16 | "syscall"
17 | "time"
18 | "unicode"
19 |
20 | "github.com/creachadair/command"
21 | "github.com/creachadair/flax"
22 | "github.com/creachadair/mds/mdiff"
23 | "github.com/natefinch/atomic"
24 | "github.com/publicsuffix/list/tools/internal/githistory"
25 | "github.com/publicsuffix/list/tools/internal/github"
26 | "github.com/publicsuffix/list/tools/internal/parser"
27 | )
28 |
29 | func main() {
30 | log.SetFlags(0)
31 |
32 | root := &command.C{
33 | Name: filepath.Base(os.Args[0]),
34 | Usage: "command [flags] ...\nhelp [command]",
35 | Help: "A command-line tool to edit and validate PSL files.",
36 | Commands: []*command.C{
37 | {
38 | Name: "fmt",
39 | Usage: "<path>",
40 | Help: `Format a PSL file.
41 |
42 | By default, the given file is updated in place.`,
43 | SetFlags: command.Flags(flax.MustBind, &fmtArgs),
44 | Run: command.Adapt(runFmt),
45 | },
46 | {
47 | Name: "validate",
48 | Usage: "<path or git hash>",
49 | Help: `Check that a file is a valid PSL file.
50 |
51 | Validation includes basic issues like parse errors, as well as
52 | conformance with the PSL project's style rules and policies.
53 |
54 | The argument can be either a local file, or a git commit hash to fetch
55 | from https://github.com/publicsuffix/list.`,
56 | SetFlags: command.Flags(flax.MustBind, &validateArgs),
57 | Run: command.Adapt(runValidate),
58 | },
59 | {
60 | Name: "check-pr",
61 | Usage: "<pr number>",
62 | Help: `Validate an open PR on GitHub.
63 |
64 | Validation includes basic issues like parse errors, as well as
65 | conformance with the PSL project's style rules and policies.`,
66 | SetFlags: command.Flags(flax.MustBind, &checkPRArgs),
67 | Run: command.Adapt(runCheckPR),
68 | },
69 | {
70 | Name: "debug",
71 | Commands: []*command.C{
72 | {
73 | Name: "dump",
74 | Usage: "<path>",
75 | Help: "Print a debug dump of a PSL file.",
76 | SetFlags: command.Flags(flax.MustBind, &debugDumpArgs),
77 | Run: command.Adapt(runDebugDump),
78 | },
79 | },
80 | },
81 |
82 | command.HelpCommand(nil),
83 | command.VersionCommand(),
84 | },
85 | }
86 |
87 | ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
88 | defer cancel()
89 | env := root.NewEnv(nil).SetContext(ctx).MergeFlags(true)
90 | command.RunOrFail(env, os.Args[1:])
91 | }
92 |
93 | var fmtArgs struct {
94 | Diff bool `flag:"d,Output a diff of changes instead of rewriting the file"`
95 | }
96 |
97 | func runFmt(env *command.Env, path string) error {
98 | bs, err := os.ReadFile(path)
99 | if err != nil {
100 | return fmt.Errorf("Failed to read PSL file: %w", err)
101 | }
102 |
103 | psl, parseErrs := parser.Parse(bs)
104 | fmtErrs := psl.Clean()
105 |
106 | for _, err := range parseErrs {
107 | fmt.Fprintln(env, err)
108 | }
109 | for _, err := range fmtErrs {
110 | fmt.Fprintln(env, err)
111 | }
112 |
113 | clean := psl.MarshalPSL()
114 | changed := !bytes.Equal(bs, clean)
115 |
116 | if changed {
117 | if fmtArgs.Diff {
118 | lhs, rhs := strings.Split(string(bs), "\n"), strings.Split(string(clean), "\n")
119 | diff := mdiff.New(lhs, rhs).AddContext(3)
120 | mdiff.FormatUnified(os.Stdout, diff, &mdiff.FileInfo{
121 | Left: "a/" + path,
122 | Right: "b/" + path,
123 | })
124 | return errors.New("File needs reformatting, rerun without -d to fix")
125 | }
126 | if len(parseErrs) > 0 {
127 | return errors.New("Cannot reformat file due to parse errors")
128 | }
129 | if err := atomic.WriteFile(path, bytes.NewReader(clean)); err != nil {
130 | return fmt.Errorf("Failed to reformat: %w", err)
131 | }
132 | }
133 |
134 | return nil
135 | }
136 |
137 | var validateArgs struct {
138 | Owner string `flag:"gh-owner,default=publicsuffix,Owner of the github repository to check"`
139 | Repo string `flag:"gh-repo,default=list,Github repository to check"`
140 | Clone string `flag:"gh-local-clone,Path to a local clone of the repository specified by gh-owner/gh-repo"`
141 | Online bool `flag:"online-checks,Run validations that require querying third-party servers"`
142 | }
143 |
144 | func isHex(s string) bool {
145 | for _, r := range s {
146 | if !unicode.In(r, unicode.ASCII_Hex_Digit) {
147 | return false
148 | }
149 | }
150 | return true
151 | }
152 |
153 | func runValidate(env *command.Env, pathOrHash string) error {
154 | var bs []byte
155 | var err error
156 |
157 | client := github.Repo{
158 | Owner: validateArgs.Owner,
159 | Repo: validateArgs.Repo,
160 | }
161 |
162 | isPath := false
163 | if _, err = os.Stat(pathOrHash); err == nil {
164 | // input is a local file
165 | isPath = true
166 | bs, err = os.ReadFile(pathOrHash)
167 | } else if isHex(pathOrHash) {
168 | // input looks like a git hash
169 | bs, err = client.PSLForHash(context.Background(), pathOrHash)
170 | } else {
171 | return fmt.Errorf("Failed to read PSL file %q, not a local file or a git commit hash", pathOrHash)
172 | }
173 | if err != nil {
174 | return fmt.Errorf("Failed to read PSL file %q: %w", pathOrHash, err)
175 | }
176 |
177 | psl, errs := parser.Parse(bs)
178 | errs = append(errs,
psl.Clean()...)
179 | errs = append(errs, parser.ValidateOffline(psl)...)
180 | if validateArgs.Online {
181 | if validateArgs.Clone == "" && isPath {
182 | // Assume the PSL file being validated might be in a git
183 | // clone, and try to use that as the reference for history.
184 | validateArgs.Clone = filepath.Dir(pathOrHash)
185 | }
186 | if validateArgs.Clone == "" {
187 | return errors.New("--gh-local-clone is required for full validation")
188 | }
189 | prHistory, err := githistory.GetPRInfo(validateArgs.Clone)
190 | if err != nil {
191 | return fmt.Errorf("failed to get local PR history, refusing to run full validation to avoid Github DoS: %w", err)
192 | }
193 |
194 | ctx, cancel := context.WithTimeout(env.Context(), 1200*time.Second)
195 | defer cancel()
196 | errs = append(errs, parser.ValidateOnline(ctx, psl, &client, prHistory)...)
197 | }
198 |
199 | clean := psl.MarshalPSL()
200 | if !bytes.Equal(bs, clean) {
201 | errs = append(errs, errors.New("file needs reformatting, run 'psltool fmt' to fix"))
202 | }
203 |
204 | for _, err := range errs {
205 | fmt.Fprintln(env, err)
206 | }
207 |
208 | if l := len(errs); l == 0 {
209 | fmt.Fprintln(env, "PSL file is valid")
210 | return nil
211 | } else if l == 1 {
212 | return errors.New("file has 1 error")
213 | } else {
214 | return fmt.Errorf("file has %d errors", l)
215 | }
216 | }
217 |
218 | var checkPRArgs struct {
219 | Owner string `flag:"gh-owner,default=publicsuffix,Owner of the github repository to check"`
220 | Repo string `flag:"gh-repo,default=list,Github repository to check"`
221 | Clone string `flag:"gh-local-clone,Path to a local clone of the repository specified by gh-owner/gh-repo"`
222 | Online bool `flag:"online-checks,Run validations that require querying third-party servers"`
223 | }
224 |
225 | func runCheckPR(env *command.Env, prStr string) error {
226 | pr, err := strconv.Atoi(prStr)
227 | if err != nil {
228 | return fmt.Errorf("invalid PR number %q: %w", prStr, err)
229 | }
230 |
231 | client := github.Repo{
232 | Owner: checkPRArgs.Owner,
233 | Repo: checkPRArgs.Repo,
234 | }
235 | withoutPR, withPR, err := client.PSLForPullRequest(env.Context(), pr)
236 | if err != nil {
237 | return err
238 | }
239 |
240 | before, _ := parser.Parse(withoutPR)
241 | after, errs := parser.Parse(withPR)
242 | after.SetBaseVersion(before, true)
243 | errs = append(errs, after.Clean()...)
244 | errs = append(errs, parser.ValidateOffline(after)...)
245 | if checkPRArgs.Online {
246 | var prHistory *githistory.History
247 | if checkPRArgs.Clone != "" {
248 | prHistory, err = githistory.GetPRInfo(checkPRArgs.Clone)
249 | if err != nil {
250 | return fmt.Errorf("failed to get local PR history: %w", err)
251 | }
252 | }
253 |
254 | ctx, cancel := context.WithTimeout(env.Context(), 300*time.Second)
255 | defer cancel()
256 | errs = append(errs, parser.ValidateOnline(ctx, after, &client, prHistory)...)
257 | }
258 |
259 | clean := after.MarshalPSL()
260 | if !bytes.Equal(withPR, clean) {
261 | errs = append(errs, errors.New("file needs reformatting, run 'psltool fmt' to fix"))
262 | }
263 |
264 | // Print the blocks marked changed, so a human can check that
265 | // something was actually checked by validations.
266 | var changed []*parser.Suffixes
267 | for _, block := range parser.BlocksOfType[*parser.Suffixes](after) {
268 | if block.Changed() {
269 | changed = append(changed, block)
270 | }
271 | }
272 | if len(changed) == 0 {
273 | fmt.Fprintln(env, "No suffix blocks changed.
This can happen if only top-level comments have been edited.")
274 | } else {
275 | fmt.Fprintln(env, "Checked the following changed suffix blocks:")
276 | for _, block := range changed {
277 | fmt.Fprintf(env, " %q (%s)\n", block.Info.Name, block.LocationString())
278 | }
279 | }
280 | io.WriteString(env, "\n")
281 |
282 | if len(errs) > 0 {
283 | for _, err := range errs {
284 | fmt.Fprintln(env, err)
285 | }
286 | io.WriteString(env, "\n")
287 | }
288 |
289 | if l := len(errs); l == 0 {
290 | fmt.Fprintln(env, "PSL change is valid")
291 | return nil
292 | } else if l == 1 {
293 | return errors.New("change has 1 error")
294 | } else {
295 | return fmt.Errorf("change has %d errors", l)
296 | }
297 | }
298 |
299 | var debugDumpArgs struct {
300 | Clean bool `flag:"c,Clean AST before dumping"`
301 | Format string `flag:"f,default=ast,Format to dump in, one of 'ast' or 'psl'"`
302 | }
303 |
304 | func runDebugDump(env *command.Env, path string) error {
305 | var dumpFn func(*parser.List) []byte
306 | switch debugDumpArgs.Format {
307 | case "ast":
308 | dumpFn = (*parser.List).MarshalDebug
309 | case "psl":
310 | dumpFn = (*parser.List).MarshalPSL
311 | default:
312 | return fmt.Errorf("unknown dump format %q", debugDumpArgs.Format)
313 | }
314 |
315 | bs, err := os.ReadFile(path)
316 | if err != nil {
317 | return fmt.Errorf("failed to read PSL file: %w", err)
318 | }
319 |
320 | psl, errs := parser.Parse(bs)
321 |
322 | if debugDumpArgs.Clean {
323 | errs = append(errs, psl.Clean()...)
324 | }
325 |
326 | for _, err := range errs {
327 | fmt.Fprintln(env, err)
328 | }
329 |
330 | bs = dumpFn(psl)
331 | os.Stdout.Write(bs)
332 | return nil
333 | }
334 | -------------------------------------------------------------------------------- /tools/internal/domain/domain_test.go: --------------------------------------------------------------------------------
1 | package domain_test
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "os"
7 | "slices"
8 | "strconv"
9 | "strings"
10 | "testing"
11 | "unicode/utf8"
12 |
13 | "github.com/publicsuffix/list/tools/internal/domain"
14 | "golang.org/x/net/idna"
15 | )
16 |
17 | //go:generate go run update_idna_testdata.go
18 |
19 | func TestParse(t *testing.T) {
20 | // This test is using the official Unicode IDNA test vectors, to
21 | // verify that domain.Parse is processing inputs exactly as
22 | // Unicode TR46 specifies. This is mostly a test of the behavior
23 | // of the underlying x/net/idna, but given the importance of
24 | // correctly validating public suffixes, we explicitly verify that
25 | // x/net/idna behaves correctly, and that our wrapper code
26 | // doesn't do anything surprising.
27 |
28 | numVectors := forEachIDNATestVector(t, func(input, want string, wantErr bool) {
29 | // PSL style deviates slightly from pure IDNA style, by
30 | // removing trailing dots if present. The removal is silent
31 | // because it doesn't affect the meaning of suffixes, but that
32 | // means the following tests have to allow for missing dots.
33 | //
34 | // Fortunately this adjustment does not break any of the IDNA
35 | // test vectors.
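// For example, a vector whose expected output is "example.com."
// is checked against domain.Parse output "example.com".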
36 | wantNoTrailingDot := strings.TrimSuffix(want, ".")
37 |
38 | got, err := domain.Parse(input)
39 | gotErr := err != nil
40 | if gotErr != wantErr {
41 | t.Errorf("domain.Parse(%q) gotErr=%v, want %v", input, gotErr, wantErr)
42 | if err != nil {
43 | t.Logf("parse error was: %v", err)
44 | }
45 | }
46 |
47 | if err == nil && got.String() != wantNoTrailingDot {
48 | t.Errorf("domain.Parse(%q) = %q, want %q", input, got.String(), wantNoTrailingDot)
49 | }
50 |
51 | // Further tests only make sense on successful parses.
52 | if wantErr {
53 | return
54 | }
55 |
56 | // Domain parse succeeded, domain.ParseLabel of each label
57 | // must also succeed.
58 | //
59 | // We only do this for test vectors that don't return an
60 | // error, which means 'want' is in canonical form and '.' is
61 | // the only label separator character.
62 | var gotLabels []domain.Label
63 | for _, labelStr := range strings.Split(wantNoTrailingDot, ".") {
64 | label, err := domain.ParseLabel(labelStr)
65 | if err != nil {
66 | t.Errorf("domain.ParseLabel(%q) got err: %v", labelStr, err)
67 | } else {
68 | gotLabels = append(gotLabels, label)
69 | }
70 | }
71 |
72 | if wantLabels := got.Labels(); !slices.EqualFunc(gotLabels, wantLabels, domain.Label.Equal) {
73 | t.Error("domain.ParseLabel() of each label is not equivalent to ParseDomain().Labels()")
74 | t.Logf("domain.ParseLabel() : %#v", gotLabels)
75 | t.Logf("domain.Parse().Labels(): %#v", wantLabels)
76 | }
77 |
78 | // ParseLabel must refuse to parse entire domains
79 | if got.NumLabels() > 1 {
80 | if gotLabel, err := domain.ParseLabel(input); err == nil {
81 | t.Errorf("domain.ParseLabel(%q) got %q, want parse error", input, gotLabel)
82 | }
83 | }
84 |
85 | // Domain and label comparisons are reflexive.
86 | if gotCmp := got.Compare(got); gotCmp != 0 {
87 | t.Errorf("Name.Compare(%q, %q) = %d, want 0", got, got, gotCmp)
88 | }
89 | for _, label := range gotLabels {
90 | if gotCmp := label.Compare(label); gotCmp != 0 {
91 | t.Errorf("Label.Compare(%q, %q) = %d, want 0", label, label, gotCmp)
92 | }
93 | }
94 | })
95 | t.Logf("checked %d test vectors", numVectors)
96 |
97 | // Sanity check to make sure the parser didn't just silently skip
98 | // all test inputs. Manual inspection of the Unicode 15.0 test
99 | // file shows 6235 tests. We allow a small amount of reduction
100 | // because tests occasionally get removed (e.g. Unicode 15.1
101 | // removes some vectors relating to deprecated special handling
102 | // of "ß" in case mapping).
103 | const minVectors = 6200
104 | if numVectors < minVectors {
105 | t.Errorf("found %d test vectors, want at least %d", numVectors, minVectors)
106 | }
107 | }
108 |
109 | // forEachIDNATestVector parses testdata/idna_test_vectors.txt and
110 | // calls fn in a subtest for each test vector. It returns the number
111 | // of test vectors found in the file.
112 | func forEachIDNATestVector(t *testing.T, fn func(input, want string, wantErr bool)) (numVectorsFound int) {
113 | t.Helper()
114 |
115 | const testfile = "testdata/idna_test_vectors.txt"
116 |
117 | // Process the file in 2 passes. This is less efficient; it's
118 | // possible to stream the test file and do all this in one pass,
119 | // but the result is less readable.
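// Pass 1 (below): find the Unicode version header and collect
// candidate vector lines. Pass 2: unescape each vector and run it
// as a subtest.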
120 | bs, err := os.ReadFile(testfile)
121 | if err != nil {
122 | t.Fatalf("reading IDNA test vectors: %v", err)
123 | }
124 | lines := strings.Split(string(bs), "\n")
125 |
126 | type testCase struct {
127 | line int
128 | raw string
129 | fields []string
130 | }
131 | var tests []testCase
132 | foundUnicodeVersion := false
133 | for i, ln := range lines {
134 | if ln == "" {
135 | continue
136 | }
137 |
138 | if unicodeVersion, ok := strings.CutPrefix(ln, "# Version: "); ok {
139 | if unicodeVersion != idna.UnicodeVersion {
140 | t.Fatalf("IDNA test file %q is for Unicode version %s, but x/net/idna uses version %s. Run 'go generate' to update the test file.", testfile, unicodeVersion, idna.UnicodeVersion)
141 | }
142 | foundUnicodeVersion = true
143 | continue
144 | }
145 |
146 | if strings.HasPrefix(ln, "#") {
147 | continue
148 | }
149 |
150 | fs := strings.Split(ln, "; ")
151 | if len(fs) != 7 {
152 | t.Fatalf("line %d: unrecognized test vector format: %s", i+1, ln)
153 | }
154 | tests = append(tests, testCase{i + 1, ln, fs})
155 | }
156 | if !foundUnicodeVersion {
157 | t.Fatalf("failed to determine Unicode version of test file, cannot proceed")
158 | }
159 |
160 | // Now we've collected all the test cases, prepare the inputs and
161 | // run the tests.
162 | for _, tc := range tests {
163 | input := tc.fields[0]
164 | want := tc.fields[1]
165 | wantErr := tc.fields[2] != ""
166 |
167 | // The input and want strings contain Unicode escape
168 | // sequences, so that the test can express precise invalid
169 | // inputs without risking accidental canonicalization by
170 | // editors and file readers. We have to carefully undo that
171 | // here, without making unwanted changes to the strings.
172 | input = unquoteVector(t, input)
173 | want = unquoteVector(t, want)
174 |
175 | // The test file format specifies that if the expected output
176 | // is the same as the input, they don't repeat it.
177 | if want == "" {
178 | want = input
179 | }
180 |
181 | t.Run(fmt.Sprintf("line_%d", tc.line), func(t *testing.T) {
182 | fn(input, want, wantErr)
183 | if t.Failed() {
184 | t.Logf("failing test vector: %s", tc.raw)
185 | }
186 | })
187 | }
188 |
189 | return len(tests)
190 | }
191 |
192 | // unquoteVector returns its input, with \uXXXX Unicode escape
193 | // sequences converted to the corresponding UTF-8 bytes.
194 | //
195 | // In theory we could use strconv.Unquote, but that function handles
196 | // more escape sequences than the IDNA test format specifies. Unquote
197 | // may also mangle strings that are not valid UTF-8 in surprising
198 | // ways, which could silently make tests check the wrong thing. To be
199 | // safe, we do the unquoting ourselves, so that we are in full control
200 | // of all mutations.
201 | func unquoteVector(t *testing.T, s string) string {
202 | t.Helper()
203 |
204 | bs := []byte(s)
205 | var out []byte
206 |
207 | for {
208 | start, rest, found := bytes.Cut(bs, []byte(`\u`))
209 | out = append(out, start...)
210 | if !found {
211 | // No more escapes, we're done.
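// (bytes.Cut reported found=false, so start held the entire
// remainder of the input and was appended above.)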
212 | break
213 | }
214 |
215 | // next 4 bytes are hex digits
216 | if len(rest) < 4 {
217 | t.Fatalf("malformed unicode escape sequence in %q", s)
218 | }
219 | hexStr := string(rest[:4])
220 | runeVal, err := strconv.ParseUint(hexStr, 16, 64)
221 | if err != nil {
222 | t.Fatalf("malformed unicode escape sequence in %q", s)
223 | }
224 | out = utf8.AppendRune(out, rune(runeVal))
225 |
226 | bs = rest[4:]
227 | }
228 |
229 | if !utf8.Valid(out) {
230 | t.Fatalf("string %q is invalid UTF-8 after unquote: %q", s, string(out))
231 | }
232 | return string(out)
233 | }
234 |
235 | func TestLabelCompare(t *testing.T) {
236 | tests := []struct {
237 | a, b string
238 | want int
239 | }{
240 | {"com", "com", 0},
241 | {"com", "org", -1},
242 | {"com", "aaa", +1},
243 | // Equivalent strings in NFC and NFD, ParseLabel should
244 | // canonicalize to equal.
245 | {"Québécois", "Que\u0301be\u0301cois", 0},
246 | // From the xn--o3cw4h block of the PSL.
247 | {"ทหาร", "ธุรกิจ", -1},
248 | {"ทหาร", "com", +1},
249 | }
250 |
251 | for _, tc := range tests {
252 | la, err := domain.ParseLabel(tc.a)
253 | if err != nil {
254 | t.Fatalf("ParseLabel(%q) failed: %v", tc.a, err)
255 | }
256 | lb, err := domain.ParseLabel(tc.b)
257 | if err != nil {
258 | t.Fatalf("ParseLabel(%q) failed: %v", tc.b, err)
259 | }
260 |
261 | gotCmp := domain.Label.Compare(la, lb)
262 | if gotCmp != tc.want {
263 | t.Errorf("Label.Compare(%q, %q) = %d, want %d", la, lb, gotCmp, tc.want)
264 | }
265 | wantEq := tc.want == 0
266 | if gotEq := domain.Label.Equal(la, lb); gotEq != wantEq {
267 | t.Errorf("Label.Equal(%q, %q) = %v, want %v", la, lb, gotEq, wantEq)
268 | }
269 |
270 | // Same again, but backwards.
271 | gotCmp = domain.Label.Compare(lb, la)
272 | if want := -tc.want; gotCmp != want {
273 | t.Errorf("Label.Compare(%q, %q) = %d, want %d", lb, la, gotCmp, want)
274 | }
275 | if gotEq := domain.Label.Equal(lb, la); gotEq != wantEq {
276 | t.Errorf("Label.Equal(%q, %q) = %v, want %v", lb, la, gotEq, wantEq)
277 | }
278 | }
279 | }
280 |
281 | func TestNameCompare(t *testing.T) {
282 | tests := []struct {
283 | a, b string
284 | want int
285 | }{
286 | {"foo.com", "foo.com.", 0},
287 | {"com", "org", -1},
288 | {"com", "aaa", +1},
289 | // Equivalent strings in NFC and NFD, Parse should
290 | // canonicalize to equal.
291 | {"Québécois", "Que\u0301be\u0301cois", 0},
292 | // From the xn--o3cw4h block of the PSL.
293 | {"ทหาร", "ธุรกิจ", -1},
294 | {"ทหาร", "com", +1},
295 | }
296 |
297 | for _, tc := range tests {
298 | da, err := domain.Parse(tc.a)
299 | if err != nil {
300 | t.Fatalf("Parse(%q) failed: %v", tc.a, err)
301 | }
302 | db, err := domain.Parse(tc.b)
303 | if err != nil {
304 | t.Fatalf("Parse(%q) failed: %v", tc.b, err)
305 | }
306 |
307 | gotCmp := domain.Name.Compare(da, db)
308 | if gotCmp != tc.want {
309 | t.Errorf("Name.Compare(%q, %q) = %d, want %d", da, db, gotCmp, tc.want)
310 | }
311 | wantEq := tc.want == 0
312 | if gotEq := domain.Name.Equal(da, db); gotEq != wantEq {
313 | t.Errorf("Name.Equal(%q, %q) = %v, want %v", da, db, gotEq, wantEq)
314 | }
315 |
316 | // Same again, but backwards.
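// Compare should be antisymmetric: swapping the arguments is
// expected to negate the result.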
317 | gotCmp = domain.Name.Compare(db, da)
318 | if want := -tc.want; gotCmp != want {
319 | t.Errorf("Name.Compare(%q, %q) = %d, want %d", db, da, gotCmp, want)
320 | }
321 | if gotEq := domain.Name.Equal(db, da); gotEq != wantEq {
322 | t.Errorf("Name.Equal(%q, %q) = %v, want %v", db, da, gotEq, wantEq)
323 | }
324 | }
325 | }
326 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: --------------------------------------------------------------------------------
1 | # Public Suffix List (PSL) Submission
2 |
3 |
6 |
7 |
27 |
28 | ### Checklist of required steps
29 |
30 | * [ ] Description of Organization
31 | * [ ] Robust Reason for PSL Inclusion
32 | * [ ] DNS verification via dig
33 |
34 | * [ ] Each domain listed in the PRIVATE section has and shall maintain at least two years remaining on registration, and we shall keep the `_psl` TXT record in place in the respective zone(s).
35 |
36 | __Submitter affirms the following:__
37 |
48 |
49 | * [ ] We are listing *any* third-party limits that we seek to work around in our rationale such as those between iOS 14.5+ and Facebook (see [Issue #1245](https://github.com/publicsuffix/list/issues/1245) as a well-documented example)
50 | - [Cloudflare](https://developers.cloudflare.com/learning-paths/get-started/add-domain-to-cf/add-site/)
51 | - [Let's Encrypt](https://letsencrypt.org/docs/rate-limits/)
52 | - MAKE SURE TO UPDATE THE FOLLOWING LIST WITH YOUR LIMITATIONS! REMOVE ENTRIES WHICH DO NOT APPLY, AS WELL AS THIS LINE!
53 |
54 |
69 |
70 | * [ ] This request was _not_ submitted with the objective of working around other third-party limits.
71 |
72 |
77 |
78 | * [ ] The submitter acknowledges that it is their responsibility to maintain the domains within their section. This includes removing names which are no longer used, retaining the _psl DNS entry, and responding to e-mails to the supplied address. Failure to maintain entries may result in removal of individual entries or the entire section.
79 |
80 |
88 |
89 | * [ ] The [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines) were carefully _read_ and _understood_, and this request conforms to them.
90 | * [ ] The submission follows the [guidelines](https://github.com/publicsuffix/list/wiki/Format) on formatting and sorting.
91 |
92 |
98 |
99 |
103 |
104 | * [ ] A role-based email address has been used and this inbox is actively monitored with a response time of no more than 30 days.
105 |
106 | **Abuse Contact:**
107 |
108 |
113 |
114 | * [ ] Abuse contact information (email or web form) is available and easily accessible.
115 |
116 | URL where abuse contact or abuse reporting form can be found:
117 |
118 |
119 | ---
120 |
121 | For PRIVATE section requests that are submitting entries for domains that match their organization website's primary domain, please understand that this can have impacts that may not match the desired outcome and take a long time to roll back, if at all.
122 |
123 | To ensure that requested changes are entirely intentional, make sure that you read the affectation and propagation expectations, that you understand them, and confirm this understanding.
124 |
125 | PR Rollbacks have lower priority, and the volunteers are unable to control when or if browsers or other parties using the PSL will refresh or update.
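For the "DNS verification via dig" checklist item and the DNS Verification section below, a hedged illustration (hypothetical domain and PR number; the `_psl` TXT record should point at the submission PR):

    dig +short TXT _psl.example.com
    "https://github.com/publicsuffix/list/pull/1234"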
126 |
127 |
141 |
142 | (Link: [about propagation/expectations](https://github.com/publicsuffix/list/wiki/Guidelines#appropriate-expectations-on-derivative-propagation-use-or-inclusion))
143 |
144 | * [ ] *Yes, I understand*. I could break my organization's website cookies and cause other issues, and the rollback timing is acceptable. *Proceed anyways*.
145 | ---
146 |
147 |
153 |
154 | ## Description of Organization
155 |
173 |
174 | **Organization Website:**
175 |
176 |
177 | ## Reason for PSL Inclusion
178 |
197 |
198 | **Number of users this request is being made to serve:**
199 |
200 |
201 | ## DNS Verification
202 |
225 | -------------------------------------------------------------------------------- /tools/internal/parser/diff.go: --------------------------------------------------------------------------------
1 | package parser
2 |
3 | import (
4 | "fmt"
5 | )
6 |
7 | // SetBaseVersion sets the list's base of comparison to old, and
8 | // updates the changed/unchanged annotations on all Blocks to match.
9 | //
10 | // If wholeSuffixBlocks is true, any changed Suffix or Wildcard within
11 | // a Suffixes block marks all suffixes and wildcards in that block as
12 | // changed.
13 | //
14 | // Precise marking (wholeSuffixBlocks=false) is intended for
15 | // maintainer and machine edits, where change-aware validators should
16 | // examine only the specific changed items.
17 | //
18 | // Expansive marking (wholeSuffixBlocks=true) is intended for external
19 | // PRs from suffix block owners, to opportunistically point out more
20 | // issues that they have the knowledge and authority to fix.
21 | func (l *List) SetBaseVersion(old *List, wholeSuffixBlocks bool) {
22 | diff := differ{
23 | oldCnt: map[string]int{},
24 | inCurrent: map[string][][]Block{},
25 | keys: map[Block]string{},
26 |
27 | wholeSuffixBlocks: wholeSuffixBlocks,
28 | }
29 |
30 | // Tree diff is an open area of research, and it's possible to use
31 | // extremely fancy (and slow) algorithms. Thankfully, the PSL has
32 | // some additional domain-specific properties that let us take
33 | // shortcuts and implement something O(n).
34 | //
35 | // First, academic tree_diff(OLD,NEW) produces an "edit script" as
36 | // the output, which describes how to add, delete, move and mutate
37 | // tree nodes to transform the OLD tree into the NEW tree. For the
38 | // PSL, we don't care about the exact structural changes, we just
39 | // need to know if we can skip validation checks. So we have to
40 | // answer a simple question: is a given block in NEW also present
41 | // in OLD?
42 | //
43 | // Second, all nodes in a well-formed list have a stable unique
44 | // identity. We can use this to answer the previous question in
45 | // constant time, instead of having to do complex tree analysis to
46 | // locate equivalent nodes.
47 | //
48 | // Node identities may be duplicated in an ill-formed List, for
49 | // example a suffix block that lists the same suffix twice. We
50 | // deal with this using brute force, and mark all duplicate
51 | // identities as changed. This means that a malformed PSL file
52 | // might report more changes than the strict minimum, but in
53 | // practice it's not much more, and in exchange we don't have to
54 | // do anything complex to decide what to revalidate.
55 | //
56 | // Third, how do we propagate child changes to parents? This is
57 | // where academic algorithms quickly go into O(n^3)
58 | // territory.
Once again, we avoid this with brute force: a
59 | // changed tree node marks all its parents as changed as
60 | // well. That means that if you fix a typo in one Suffix, we say
61 | // that the Suffix changed, but also its parent Suffixes, Section,
62 | // and List nodes.
63 | //
64 | // We could theoretically dirty fewer nodes in some cases, but
65 | // that introduces a risk of false negatives (we forget to re-run
66 | // a necessary validation), and it makes the diff harder to reason
67 | // about when writing validators. In practice, this slightly
68 | // pessimistic dirtying is cheap for the currently-planned
69 | // validators, so we stick with the behavior that is easy to
70 | // reason about and simple to implement.
71 | //
72 | // Finally, we need to do something about deleted nodes. We can
73 | // handle that with a single additional pass through the OLD list,
74 | // thanks to the node identity property. Again for simplicity, we
75 | // treat deletions similar to edits: all the parents of a deleted
76 | // node are marked dirty. Again we could be more precise here, but
77 | // in practice it's currently cheap to be pessimistic, and makes
78 | // the code and mental model simpler.
79 | //
80 | // There are various optimizations possible for this code. The
81 | // biggest would be doing something more efficient to track block
82 | // identities, which are currently expressed as big strings
83 | // because that makes them convenient to compare and use as map
84 | // keys. However, this algorithm as currently implemented takes
85 | // <100ms to diff a full PSL file, so for now we err on the side
86 | // of simplicity.
87 |
88 | // Compile the identities of all the blocks in old.
89 | diff.scanOld(old, "")
90 | // Mark unchanged blocks. Thanks to the previous step, each tree
91 | // node can be checked in O(1) time.
92 | diff.scanCurrent(l, "", nil)
93 | // Dirty the parents of deleted blocks.
94 | diff.markDeletions(old, "")
95 | }
96 |
97 | type differ struct {
98 | // wholeSuffixBlocks is whether Suffix/Wildcard changes propagate
99 | // to all children of the parent Suffixes block.
100 | wholeSuffixBlocks bool
101 |
102 | // oldCnt counts the number of blocks in the old list with a given
103 | // identity key.
104 | oldCnt map[string]int
105 |
106 | // inCurrent maps block identity keys to the tree paths of the
107 | // current list with that identity. Given a block with identity K,
108 | // inCurrent[K] is a list of paths. In each path, path[0] is a
109 | // block with identity K, and path[1..n] are its parents going
110 | // back to the root of the tree.
111 | //
112 | // In a well-formed List, each cache entry has a single path, but
113 | // we track duplicates in order to function correctly on malformed
114 | // lists as well.
115 | inCurrent map[string][][]Block
116 |
117 | // keys caches identity keys by block pointer. There are several
118 | // passes of traversal through trees, and when old and current are
119 | // nearly identical (the common case) this can save significant
120 | // CPU time.
121 | keys map[Block]string
122 | }
123 |
124 | // scanOld records b and its children in d.oldCnt.
125 | func (d *differ) scanOld(b Block, parentKey string) {
126 | k := d.getKey(b, parentKey)
127 | d.oldCnt[k]++
128 | for _, child := range b.Children() {
129 | d.scanOld(child, k)
130 | }
131 | }
132 |
133 | // scanCurrent adds curBlock and all its children to d.inCurrent, and
134 | // updates their isUnchanged annotation based on the information in d.oldCnt.
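// Each block is visited exactly once by this scan, which is what
// keeps the overall diff roughly O(n) in the number of blocks.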
135 | func (d *differ) scanCurrent(curBlock Block, parentKey string, parents []Block) { 136 | k := d.getKey(curBlock, parentKey) 137 | 138 | path := make([]Block, 0, len(parents)+1) 139 | path = append(path, curBlock) 140 | path = append(path, parents...) 141 | 142 | // Assume we're unchanged to start with. The job of the remaining 143 | // diff code is to falsify this claim and mark the node as changed 144 | // if needed. 145 | // 146 | // Setting this early and unconditionally lets us optimize the 147 | // logic in markChanged, by ensuring that each node transitions 148 | // false->true only once, before any possible true->false 149 | // transitions that affect it. 150 | curBlock.info().isUnchanged = true 151 | 152 | // Record the path to the current block, and if it's a 153 | // doppelganger of some other Block, mark changed. Tracking diffs 154 | // of duplicates requires solving some hard theoretical problems 155 | // of tree diff, so we don't bother. 156 | // 157 | // Duplicate identities only happen on a malformed PSL, and we 158 | // can save a lot of pain by just slightly over-rechecking such 159 | // PSLs. 160 | d.inCurrent[k] = append(d.inCurrent[k], path) 161 | if l := len(d.inCurrent[k]); l == 2 { 162 | // This is the first duplicate, the previous path didn't know 163 | // it wasn't unique. Mark both the current and earlier path as 164 | // changed. 165 | d.markChanged(d.inCurrent[k]...) 166 | } else if l > 2 { 167 | // Previous paths already marked, only curBlock's one needs 168 | // updating. 169 | d.markChanged(path) 170 | } 171 | 172 | // This covers both the case where a block is new (oldCnt of 0), 173 | // and the case where this block isn't a dupe in current, but was 174 | // a dupe in old. In that case, like above we avoid algorithmic 175 | // headaches by just dirtying the block instead of trying to 176 | // resolve which version of the old dupes we're looking at. 177 | if d.oldCnt[k] != 1 { 178 | d.markChanged(path) 179 | } 180 | 181 | // Scan through child subtrees. These subtrees may call 182 | // markChanged and set Unchanged=false on us. 183 | for _, child := range curBlock.Children() { 184 | d.scanCurrent(child, k, path) 185 | } 186 | 187 | // If the caller requested, and we're changed anyway, see if we 188 | // should propagate the change back downwards again. 189 | if !curBlock.info().isUnchanged { 190 | d.maybeMarkWholeSuffixBlock(path) 191 | } 192 | } 193 | 194 | // markDeletions marks parents of deleted nodes as changed in current. 195 | // 196 | // For example, if the diff contains a suffix deletion, this will mark 197 | // the enclosing Suffixes block as changed. 198 | func (d *differ) markDeletions(oldBlock Block, parentKey string) bool { 199 | k := d.getKey(oldBlock, parentKey) 200 | 201 | pathsInCurrent, ok := d.inCurrent[k] 202 | if !ok { 203 | // oldBlock was deleted, report to caller. 204 | return true 205 | } 206 | 207 | childDeleted := false 208 | for _, child := range oldBlock.Children() { 209 | if d.markDeletions(child, k) { 210 | // Note, we can't short-circuit here because there may be 211 | // other paths under this block that also need to be 212 | // updated. We're not only trying to update oldBlock, but 213 | // also all of its children. 214 | childDeleted = true 215 | } 216 | } 217 | 218 | // Children were deleted, so mark ourselves changed. markChanged 219 | // also dirties our parents, so there is no need to report the 220 | // change to the caller; it would only do redundant no-op work. 221 | if childDeleted { 222 | d.markChanged(pathsInCurrent...
223 | } 224 | 225 | return false 226 | } 227 | 228 | // maybeMarkWholeSuffixBlock calls markSuffixAndWildcardChanged on all 229 | // Suffixes in path, if the caller of SetBaseVersion requested 230 | // expansive marking. 231 | func (d *differ) maybeMarkWholeSuffixBlock(path []Block) { 232 | if !d.wholeSuffixBlocks { 233 | return 234 | } 235 | 236 | switch path[0].(type) { 237 | case *Suffixes, *Suffix, *Wildcard: 238 | for i, parent := range path { 239 | if _, ok := parent.(*Suffixes); ok { 240 | d.markSuffixAndWildcardChanged(parent, path[i+1:]) 241 | } 242 | } 243 | } 244 | } 245 | 246 | // markSuffixAndWildcardChanged marks as changed all Suffix and 247 | // Wildcard blocks in the tree rooted at curBlock. 248 | func (d *differ) markSuffixAndWildcardChanged(curBlock Block, parents []Block) { 249 | path := append([]Block{curBlock}, parents...) 250 | 251 | switch curBlock.(type) { 252 | case *Suffix, *Wildcard: 253 | d.markChanged(path) 254 | default: 255 | for _, child := range curBlock.Children() { 256 | d.markSuffixAndWildcardChanged(child, path) 257 | } 258 | } 259 | } 260 | 261 | // markChanged marks as changed all the blocks in paths. 262 | func (d *differ) markChanged(paths ...[]Block) { 263 | pathLoop: 264 | for _, path := range paths { 265 | for _, b := range path { 266 | if !b.info().isUnchanged { 267 | // We never mark a node as changed in isolation, we 268 | // always propagate the change to all its 269 | // parents. Therefore, we can stop the upwards 270 | // traversal in this path as soon as we find any node 271 | // that's already in the correct state. 272 | continue pathLoop 273 | } 274 | b.info().isUnchanged = false 275 | } 276 | } 277 | } 278 | 279 | // getKey returns the identity key for blk, which must be a direct 280 | // child of the block whose identity key is parentKey. getKey keeps a 281 | // cache of all keys built in the lifetime of this differ, to make 282 | // future calls more efficient. 282 | func (d *differ) getKey(blk Block, parentKey string) string { 283 | ret, ok := d.keys[blk] 284 | if !ok { 285 | ret = d.makeKey(blk, parentKey) 286 | d.keys[blk] = ret 287 | } 288 | return ret 289 | } 290 | 291 | // makeKey builds the identity key of b, which must be a child node 292 | // of the block whose identity key is parentKey. 293 | func (d *differ) makeKey(b Block, parentKey string) string { 294 | switch v := b.(type) { 295 | case *List: 296 | return fmt.Sprintf("%s;List", parentKey) 297 | case *Section: 298 | return fmt.Sprintf("%s;Section,%q", parentKey, v.Name) 299 | case *Suffixes: 300 | // Note parsed suffix metadata isn't included in the identity, 301 | // to avoid marking all suffixes in a block changed when 302 | // someone adjusts their URL or email. Such edits will still 303 | // indirectly dirty the block, because the metadata comment 304 | // includes the entire comment text in its identity, and will 305 | // dirty the parent Suffixes.
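		// As an illustrative sketch (not the output of a real run),
		// the key chain for a suffix block named "DuckCorp Inc" in
		// the private section would look like:
		//
		//	;List;Section,"PRIVATE DOMAINS";Suffixes,"DuckCorp Inc"
		//
		// Each key embeds its parent's key, so two keys compare equal
		// only when the blocks' entire ancestries match.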
306 | ret := fmt.Sprintf("%s;Suffixes,%q", parentKey, v.Info.Name) 307 | return ret 308 | case *Suffix: 309 | return fmt.Sprintf("%s;Suffix,%q", parentKey, v.Domain) 310 | case *Wildcard: 311 | return fmt.Sprintf("%s;Wildcard,%q,%#v", parentKey, v.Domain, v.Exceptions) 312 | case *Comment: 313 | return fmt.Sprintf("%s;Comment,%#v", parentKey, v.Text) 314 | default: 315 | panic("unknown ast node") 316 | } 317 | } 318 | -------------------------------------------------------------------------------- /tools/internal/parser/parser_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "net/mail" 5 | "net/url" 6 | "os" 7 | "testing" 8 | 9 | "github.com/publicsuffix/list/tools/internal/domain" 10 | ) 11 | 12 | // TestParser runs a battery of synthetic parse and validation tests. 13 | func TestParser(t *testing.T) { 14 | // These test cases have a fair amount of repetition in them, 15 | // since both errors and suffix blocks contain repeated nestings 16 | // of blocks and Source objects. While it's tempting to try and 17 | // reduce duplication through clever code, you are encouraged to 18 | // resist the urge. 19 | // 20 | // Each test case is quite verbose, but being laid out with 21 | // minimal indirection makes it easier to inspect and debug when a 22 | // failure happens. 23 | 24 | tests := []struct { 25 | name string 26 | psl []byte 27 | downgradeToWarning func(error) bool 28 | want *List 29 | wantErrs []error 30 | }{ 31 | { 32 | name: "empty", 33 | psl: byteLines(""), 34 | want: list(), 35 | }, 36 | 37 | { 38 | name: "just_comments", 39 | psl: byteLines( 40 | "// This is an empty PSL file.", 41 | "", 42 | "// Here is a second comment.", 43 | ), 44 | want: list( 45 | comment(0, "This is an empty PSL file."), 46 | comment(2, "Here is a second comment."), 47 | ), 48 | }, 49 | 50 | { 51 | name: "just_suffixes_in_block", 52 | psl: byteLines( 53 | "// ===BEGIN PRIVATE DOMAINS===", 54 | "", 55 | "example.com", 56 | "other.example.com", 57 | "*.example.org", 58 | "", 59 | "// ===END PRIVATE DOMAINS===", 60 | ), 61 | want: list( 62 | section(0, 7, "PRIVATE DOMAINS", 63 | suffixes(2, 5, noInfo, 64 | suffix(2, "example.com"), 65 | suffix(3, "other.example.com"), 66 | wildcard(4, 5, "example.org"), 67 | ), 68 | ), 69 | ), 70 | }, 71 | 72 | { 73 | name: "empty_sections", 74 | psl: byteLines( 75 | "// ===BEGIN IMAGINARY DOMAINS===", 76 | "// ===END IMAGINARY DOMAINS===", 77 | "// ===BEGIN FAKE DOMAINS===", 78 | "// ===END FAKE DOMAINS===", 79 | ), 80 | want: list( 81 | section(0, 2, "IMAGINARY DOMAINS"), 82 | section(2, 4, "FAKE DOMAINS"), 83 | ), 84 | }, 85 | 86 | { 87 | name: "missing_section_end", 88 | psl: byteLines( 89 | "// ===BEGIN ICANN DOMAINS===", 90 | ), 91 | want: list( 92 | section(0, 1, "ICANN DOMAINS"), 93 | ), 94 | wantErrs: []error{ 95 | ErrUnclosedSection{section(0, 1, "ICANN DOMAINS")}, 96 | }, 97 | }, 98 | 99 | { 100 | name: "nested_sections", 101 | psl: byteLines( 102 | "// ===BEGIN ICANN DOMAINS===", 103 | "// ===BEGIN SECRET DOMAINS===", 104 | "// ===END SECRET DOMAINS===", 105 | "// ===END ICANN DOMAINS===", 106 | ), 107 | want: list( 108 | section(0, 4, "ICANN DOMAINS"), 109 | ), 110 | 111 | wantErrs: []error{ 112 | ErrNestedSection{ 113 | SourceRange: mkSrc(1, 3), 114 | Name: "SECRET DOMAINS", 115 | Section: section(0, 4, "ICANN DOMAINS"), 116 | }, 117 | }, 118 | }, 119 | 120 | { 121 | name: "unknown_section_header", 122 | psl: byteLines( 123 | "// ===TRANSFORM DOMAINS===", 124 | ), 125 | want: list(), 
126 | wantErrs: []error{ 127 | ErrUnknownSectionMarker{mkSrc(0, 1)}, 128 | }, 129 | }, 130 | 131 | { 132 | name: "suffixes_with_section_marker_in_header", 133 | psl: byteLines( 134 | "// Just some suffixes", 135 | "// ===BEGIN ICANN DOMAINS===", 136 | "com", 137 | "org", 138 | "", 139 | "// ===END ICANN DOMAINS===", 140 | ), 141 | want: list( 142 | comment(0, "Just some suffixes"), 143 | section(1, 6, "ICANN DOMAINS", 144 | suffixes(2, 4, noInfo, 145 | suffix(2, "com"), 146 | suffix(3, "org"), 147 | ), 148 | ), 149 | ), 150 | }, 151 | 152 | { 153 | name: "suffixes_with_section_markers_inline", 154 | psl: byteLines( 155 | "// ===BEGIN ICANN DOMAINS===", 156 | "// Just some suffixes", 157 | "com", 158 | "// ===BEGIN OTHER DOMAINS===", 159 | "org", 160 | "// ===END OTHER DOMAINS===", 161 | "net", 162 | "", 163 | "// ===END ICANN DOMAINS===", 164 | ), 165 | want: list( 166 | section(0, 9, "ICANN DOMAINS", 167 | suffixes(1, 7, 168 | info("Just some suffixes", nil, nil, nil, true), 169 | comment(1, "Just some suffixes"), 170 | suffix(2, "com"), 171 | suffix(4, "org"), 172 | suffix(6, "net"), 173 | ), 174 | ), 175 | ), 176 | wantErrs: []error{ 177 | ErrSectionInSuffixBlock{mkSrc(3, 4)}, 178 | ErrSectionInSuffixBlock{mkSrc(5, 6)}, 179 | }, 180 | }, 181 | 182 | { 183 | name: "suffixes_with_unstructured_header", 184 | psl: byteLines( 185 | "// Unstructured header.", 186 | "// I'm just going on about random things.", 187 | "example.com", 188 | "example.org", 189 | ), 190 | want: list( 191 | suffixes(0, 4, 192 | info( 193 | "Unstructured header.", 194 | nil, 195 | nil, 196 | []string{"I'm just going on about random things."}, 197 | true, 198 | ), 199 | comment(0, "Unstructured header.", "I'm just going on about random things."), 200 | suffix(2, "example.com"), 201 | suffix(3, "example.org"), 202 | ), 203 | ), 204 | }, 205 | 206 | { 207 | name: "suffixes_with_canonical_private_header", 208 | psl: byteLines( 209 | "// DuckCorp Inc: https://example.com", 210 | "// Submitted by Not A Duck <duck@example.com>", 211 | "// Seriously, not a duck", 212 | "example.com", 213 | "example.org", 214 | ), 215 | want: list( 216 | suffixes(0, 5, 217 | info( 218 | "DuckCorp Inc", 219 | urls("https://example.com"), 220 | emails("Not A Duck", "duck@example.com"), 221 | []string{"Seriously, not a duck"}, 222 | true), 223 | comment(0, "DuckCorp Inc: https://example.com", "Submitted by Not A Duck <duck@example.com>", 224 | "Seriously, not a duck"), 225 | suffix(3, "example.com"), 226 | suffix(4, "example.org"), 227 | ), 228 | ), 229 | }, 230 | 231 | { 232 | name: "suffixes_with_entity_and_submitter", 233 | psl: byteLines( 234 | "// DuckCorp Inc: submitted by Not A Duck <duck@example.com>", 235 | "example.com", 236 | ), 237 | want: list( 238 | suffixes(0, 2, 239 | info( 240 | "DuckCorp Inc", 241 | nil, 242 | emails("Not A Duck", "duck@example.com"), 243 | nil, 244 | true), 245 | comment(0, "DuckCorp Inc: submitted by Not A Duck <duck@example.com>"), 246 | suffix(1, "example.com"), 247 | ), 248 | ), 249 | }, 250 | 251 | { 252 | name: "suffixes_with_all_separate_lines", 253 | psl: byteLines( 254 | "// DuckCorp Inc", 255 | "// https://example.com", 256 | "// Submitted by Not A Duck <duck@example.com>", 257 | "example.com", 258 | ), 259 | want: list( 260 | suffixes(0, 4, 261 | info( 262 | "DuckCorp Inc", 263 | urls("https://example.com"), 264 | emails("Not A Duck", "duck@example.com"), 265 | nil, 266 | true), 267 | comment(0, "DuckCorp Inc", "https://example.com", `Submitted by Not A Duck <duck@example.com>`), 268 | suffix(3, "example.com"), 269 | ), 270 | ), 271 | }, 272 | 273 | { 274 | // Regression test for a few blocks that start
with "name 275 | // (url)" instead of the more common "name: url". 276 | name: "url_in_parens", 277 | psl: byteLines( 278 | "// Parens Appreciation Society (https://example.org)", 279 | "example.com", 280 | ), 281 | want: list( 282 | suffixes(0, 2, 283 | info( 284 | "Parens Appreciation Society", 285 | urls("https://example.org"), 286 | nil, 287 | nil, 288 | true), 289 | comment(0, "Parens Appreciation Society (https://example.org)"), 290 | suffix(1, "example.com"), 291 | ), 292 | ), 293 | }, 294 | 295 | { 296 | // Regression test for a sneaky bug during development: 297 | // when an entity name is found when parsing Suffixes 298 | // headers, don't keep trying to find it in subsequent 299 | // lines, or you might overwrite the correct answer with 300 | // someething else that happens to have the right shape. 301 | name: "accept_first_valid_entity", 302 | psl: byteLines( 303 | "// cd : https://en.wikipedia.org/wiki/.cd", 304 | "// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1", 305 | "cd", 306 | ), 307 | want: list( 308 | suffixes(0, 3, 309 | info( 310 | "cd", 311 | urls("https://en.wikipedia.org/wiki/.cd"), 312 | nil, 313 | []string{"see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1"}, 314 | true), 315 | comment(0, "cd : https://en.wikipedia.org/wiki/.cd", 316 | "see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1"), 317 | suffix(2, "cd"), 318 | ), 319 | ), 320 | }, 321 | } 322 | 323 | for _, test := range tests { 324 | t.Run(test.name, func(t *testing.T) { 325 | got, errs := Parse(test.psl) 326 | checkDiff(t, "parse result", got, test.want) 327 | checkDiff(t, "parse errors", errs, test.wantErrs) 328 | }) 329 | } 330 | } 331 | 332 | // mkSrc returns a SourceRange with the given start and end. 333 | func mkSrc(start, end int) SourceRange { 334 | return SourceRange{start, end} 335 | } 336 | 337 | // TestParseRealList checks that the real public suffix list can parse 338 | // without errors. 
339 | func TestParseRealList(t *testing.T) { 340 | bs, err := os.ReadFile("../../../public_suffix_list.dat") 341 | if err != nil { 342 | t.Fatal(err) 343 | } 344 | 345 | _, errs := Parse(bs) 346 | 347 | for _, err := range errs { 348 | t.Errorf("Parse error: %v", err) 349 | } 350 | } 351 | 352 | func list(blocks ...Block) *List { 353 | return &List{ 354 | Blocks: blocks, 355 | } 356 | } 357 | 358 | func comment(start int, lines ...string) *Comment { 359 | return &Comment{ 360 | blockInfo: blockInfo{ 361 | SourceRange: mkSrc(start, start+len(lines)), 362 | }, 363 | Text: lines, 364 | } 365 | } 366 | 367 | func section(start, end int, name string, blocks ...Block) *Section { 368 | return &Section{ 369 | blockInfo: blockInfo{ 370 | SourceRange: mkSrc(start, end), 371 | }, 372 | Name: name, 373 | Blocks: blocks, 374 | } 375 | } 376 | 377 | func suffixes(start, end int, info MaintainerInfo, blocks ...Block) *Suffixes { 378 | return &Suffixes{ 379 | blockInfo: blockInfo{ 380 | SourceRange: mkSrc(start, end), 381 | }, 382 | Info: info, 383 | Blocks: blocks, 384 | } 385 | } 386 | 387 | func info(name string, urls []*url.URL, emails []*mail.Address, other []string, editable bool) MaintainerInfo { 388 | return MaintainerInfo{ 389 | Name: name, 390 | URLs: urls, 391 | Maintainers: emails, 392 | Other: other, 393 | MachineEditable: editable, 394 | } 395 | } 396 | 397 | var noInfo = info("", nil, nil, nil, true) 398 | 399 | func suffix(line int, domainStr string) *Suffix { 400 | domain, err := domain.Parse(domainStr) 401 | if err != nil { 402 | panic(err) 403 | } 404 | return &Suffix{ 405 | blockInfo: blockInfo{ 406 | SourceRange: mkSrc(line, line+1), 407 | }, 408 | Domain: domain, 409 | } 410 | } 411 | 412 | func wildcard(start, end int, base string, exceptions ...string) *Wildcard { 413 | dom, err := domain.Parse(base) 414 | if err != nil { 415 | panic(err) 416 | } 417 | 418 | ret := &Wildcard{ 419 | blockInfo: blockInfo{ 420 | SourceRange: mkSrc(start, end), 421 | }, 422 | Domain: dom, 423 | } 424 | for _, s := range exceptions { 425 | exc, err := domain.ParseLabel(s) 426 | if err != nil { 427 | panic(err) 428 | } 429 | ret.Exceptions = append(ret.Exceptions, exc) 430 | } 431 | return ret 432 | } 433 | 434 | // zeroSourceRange destructively zeroes the SourceRange of the given 435 | // block and its children. We use a zero SourceRange to communicate 436 | // "this block did not exist in the original input", when adding 437 | // machine-generated blocks. 438 | func zeroSourceRange(b Block) Block { 439 | switch v := b.(type) { 440 | case *List: 441 | v.SourceRange = SourceRange{} 442 | case *Section: 443 | v.SourceRange = SourceRange{} 444 | case *Suffixes: 445 | v.SourceRange = SourceRange{} 446 | case *Suffix: 447 | v.SourceRange = SourceRange{} 448 | case *Wildcard: 449 | v.SourceRange = SourceRange{} 450 | case *Comment: 451 | v.SourceRange = SourceRange{} 452 | default: 453 | panic("unknown ast node") 454 | } 455 | for _, child := range b.Children() { 456 | zeroSourceRange(child) 457 | } 458 | return b 459 | } 460 | 461 | // markUnchanged makes .Changed() return false for b. It does not 462 | // touch parent or child blocks. 463 | // 464 | // It's generic so that it works in places that require a specific 465 | // instance type, not just places that accept a Block interface. 
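//
// For example (illustrative only), a test fixture can wrap a helper
// constructor in place and still get the concrete type back:
//
//	s := markUnchanged(suffix(3, "example.com")) // s is *Suffix, not Block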
466 | func markUnchanged[T Block](b T) T { 467 | b.info().isUnchanged = true 468 | return b 469 | } 470 | -------------------------------------------------------------------------------- /tools/internal/parser/file.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "cmp" 5 | "fmt" 6 | "net/mail" 7 | "net/url" 8 | "slices" 9 | 10 | "github.com/publicsuffix/list/tools/internal/domain" 11 | ) 12 | 13 | // A Block is a parsed chunk of a PSL file. Each block is one of the 14 | // concrete types Comment, Section, Suffixes, Suffix, or Wildcard. 15 | type Block interface { 16 | // SrcRange returns the block's SourceRange. 17 | SrcRange() SourceRange 18 | // Children returns the block's direct children, if any. 19 | Children() []Block 20 | // Changed reports whether the tree rooted at block has changed 21 | // since the base of comparison (see List.SetBaseVersion). 22 | Changed() bool 23 | 24 | info() *blockInfo 25 | } 26 | 27 | // BlocksOfType recursively collects and returns all blocks of 28 | // concrete type T in the given parse tree. 29 | // 30 | // For example, BlocksOfType[*parser.Comment](ast) returns all comment 31 | // nodes in ast. 32 | func BlocksOfType[T Block](tree Block) []T { 33 | var ret []T 34 | blocksOfTypeRec(tree, &ret) 35 | return ret 36 | } 37 | 38 | func blocksOfTypeRec[T Block](tree Block, out *[]T) { 39 | if v, ok := tree.(T); ok { 40 | *out = append(*out, v) 41 | } 42 | for _, child := range tree.Children() { 43 | blocksOfTypeRec(child, out) 44 | } 45 | } 46 | 47 | // blockInfo is common information shared by all Block types. 48 | type blockInfo struct { 49 | SourceRange 50 | 51 | // isUnchanged records that a Block (including any children) is 52 | // semantically unchanged from a past base point. The default base 53 | // of comparison is a null List, meaning that all blocks are 54 | // considered changed. A different base of comparison can be set 55 | // with List.SetBaseVersion. 56 | isUnchanged bool 57 | } 58 | 59 | func (b blockInfo) SrcRange() SourceRange { 60 | return b.SourceRange 61 | } 62 | 63 | func (b blockInfo) Changed() bool { 64 | return !b.isUnchanged 65 | } 66 | 67 | func (b *blockInfo) info() *blockInfo { 68 | return b 69 | } 70 | 71 | // List is a parsed public suffix list. 72 | type List struct { 73 | blockInfo 74 | 75 | // Blocks are the top-level elements of the list, in the order 76 | // they appear. 77 | Blocks []Block 78 | } 79 | 80 | func (l *List) Children() []Block { return l.Blocks } 81 | 82 | // PublicSuffix returns the public suffix of d. 83 | // 84 | // This follows the PSL algorithm to the letter. Notably: a rule 85 | // "*.foo.com" does not implicitly create a "foo.com" rule, and there 86 | // is a hardcoded implicit "*" rule so that unknown TLDs are all 87 | // public suffixes. 88 | func (l *List) PublicSuffix(d domain.Name) domain.Name { 89 | if d.NumLabels() == 0 { 90 | // Edge case: zero domain.Name value 91 | return d 92 | } 93 | 94 | // Look at wildcards first, because the PSL algorithm says that 95 | // exceptions to wildcards take priority over all other rules. So, 96 | // if we find a wildcard exception, we can halt early. 97 | var ( 98 | ret domain.Name 99 | matchLen int 100 | gotException bool 101 | ) 102 | for _, w := range BlocksOfType[*Wildcard](l) { 103 | suf, isException, ok := w.PublicSuffix(d) 104 | switch { 105 | case !ok: 106 | continue 107 | case isException && !gotException: 108 | // First matching exception encountered.
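			// (Illustrative example: with a rule "*.ck" and
			// exception "!www.ck", querying "www.ck" lands here and
			// the public suffix is just "ck", making "www.ck" itself
			// registrable.)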
109 | gotException = true 110 | matchLen = suf.NumLabels() 111 | ret = suf 112 | case isException: 113 | // Second or later exception match. According to the 114 | // format, only 0 or 1 exceptions can match; 115 | // multi-exception matches are undefined and unused. But 116 | // just to be safe, handle the N exception case by 117 | // accepting the longest matching exception. 118 | if nl := suf.NumLabels(); nl > matchLen { 119 | matchLen = nl 120 | ret = suf 121 | } 122 | case !gotException: 123 | // Non-exception match. 124 | if nl := suf.NumLabels(); nl > matchLen { 125 | matchLen = nl 126 | ret = suf 127 | } 128 | } 129 | } 130 | if gotException { 131 | return ret 132 | } 133 | 134 | // Otherwise, keep scanning through the regular suffixes. 135 | for _, s := range BlocksOfType[*Suffix](l) { 136 | if suf, ok := s.PublicSuffix(d); ok && suf.NumLabels() > matchLen { 137 | matchLen = suf.NumLabels() 138 | ret = suf 139 | } 140 | } 141 | 142 | if matchLen == 0 { 143 | // The PSL algorithm includes an implicit "*" to match every 144 | // TLD, in the absence of any matching explicit rule. 145 | labels := d.Labels() 146 | tld := labels[len(labels)-1].AsTLD() 147 | return tld 148 | } 149 | 150 | return ret 151 | } 152 | 153 | // RegisteredDomain returns the registered/registerable domain of 154 | // d. Returns (domain, true) when the input is a child of a public 155 | // suffix, and (zero, false) when the input is itself a public suffix. 156 | // 157 | // RegisteredDomain follows the PSL algorithm to the letter. Notably: 158 | // a rule "*.foo.com" does not implicitly create a "foo.com" rule, and 159 | // there is a hardcoded implicit "*" rule so that unknown TLDs are all 160 | // public suffixes. 161 | func (l *List) RegisteredDomain(d domain.Name) (domain.Name, bool) { 162 | suf := l.PublicSuffix(d) 163 | if suf.Equal(d) { 164 | return domain.Name{}, false 165 | } 166 | 167 | next, ok := d.CutSuffix(suf) 168 | if !ok { 169 | panic(fmt.Sprintf("public suffix %q is not a suffix of domain %q", suf, d)) 170 | } 171 | return suf.MustAddPrefix(next[len(next)-1]), true 172 | } 173 | 174 | // Comment is a comment block, consisting of one or more contiguous 175 | // lines of commented text. 176 | type Comment struct { 177 | blockInfo 178 | // Text is the unprocessed content of the comment lines, with the 179 | // leading comment syntax removed. 180 | Text []string 181 | } 182 | 183 | func (c *Comment) Children() []Block { return nil } 184 | 185 | // Section is a named part of a PSL file, containing suffixes which 186 | // behave similarly. 187 | type Section struct { 188 | blockInfo 189 | 190 | // Name is the section name. In a normal well-formed PSL file, the 191 | // names are "ICANN DOMAINS" and "PRIVATE DOMAINS". 192 | Name string 193 | // Blocks are the child blocks contained within the section. 194 | Blocks []Block 195 | } 196 | 197 | func (s *Section) Children() []Block { return s.Blocks } 198 | 199 | // Suffixes is a list of PSL domain suffixes with optional additional 200 | // metadata. 201 | // 202 | // Suffix sections consist of a header comment that contains a mix of 203 | // structured and unstructured information, followed by a list of 204 | // domain suffixes. The suffix list may contain additional 205 | // unstructured inline comments. 206 | type Suffixes struct { 207 | blockInfo 208 | 209 | // Info is information about the authoritative maintainers for 210 | // this set of suffixes. 211 | Info MaintainerInfo 212 | 213 | // Blocks are the child blocks contained within the suffix block.
214 | Blocks []Block 215 | } 216 | 217 | func (s *Suffixes) Children() []Block { return s.Blocks } 218 | 219 | type MaintainerInfo struct { 220 | // Name is the name of the entity responsible for maintaining a 221 | // set of suffixes. 222 | // 223 | // For ICANN suffixes, this is typically the TLD name, or the name 224 | // of the NIC that controls the TLD. 225 | // 226 | // For private domains this is the name of the legal entity 227 | // (usually a company, sometimes an individual) that owns all 228 | // domains in the block. 229 | // 230 | // In a well-formed PSL file, Name is non-empty for all suffix 231 | // blocks. 232 | Name string 233 | 234 | // URLs are links to further information about the suffix block's 235 | // domains and its maintainer. 236 | // 237 | // For ICANN domains this is typically the NIC's information page 238 | // for the TLD, or failing that a general information page such as 239 | // a Wikipedia entry. 240 | // 241 | // For private domains this is usually the website for the owner 242 | // of the domains. 243 | // 244 | // May be empty when the block header doesn't have 245 | // machine-readable URLs. 246 | URLs []*url.URL 247 | 248 | // Maintainers is the contact name and email address of the person 249 | // or persons responsible for maintaining a block. 250 | // 251 | // This field may be empty if there is no machine-readable contact 252 | // information. 253 | Maintainers []*mail.Address 254 | 255 | // Other holds unstructured additional notes. They may contain 256 | // anything, including some of the above information that wasn't 257 | // in a known parseable form. 258 | Other []string 259 | 260 | // MachineEditable is whether this information can be 261 | // machine-edited and written back out without loss of 262 | // information. The exact formatting of the information may 263 | // change, but no information will be lost. 264 | MachineEditable bool 265 | } 266 | 267 | func (m *MaintainerInfo) Compare(n *MaintainerInfo) int { 268 | if r := compareCommentText(m.Name, n.Name); r != 0 { 269 | return r 270 | } 271 | 272 | if r := cmp.Compare(len(m.URLs), len(n.URLs)); r != 0 { 273 | return r 274 | } 275 | for i := range m.URLs { 276 | if r := cmp.Compare(m.URLs[i].String(), n.URLs[i].String()); r != 0 { 277 | return r 278 | } 279 | } 280 | 281 | if r := cmp.Compare(len(m.Maintainers), len(n.Maintainers)); r != 0 { 282 | return r 283 | } 284 | for i := range m.Maintainers { 285 | if r := cmp.Compare(m.Maintainers[i].String(), n.Maintainers[i].String()); r != 0 { 286 | return r 287 | } 288 | } 289 | 290 | if r := slices.Compare(m.Other, n.Other); r != 0 { 291 | return r 292 | } 293 | 294 | if m.MachineEditable == n.MachineEditable { 295 | return 0 296 | } else if !m.MachineEditable { 297 | return -1 298 | } else { 299 | return 1 300 | } 301 | } 302 | 303 | // HasInfo reports whether m has any maintainer information at all. 304 | func (m MaintainerInfo) HasInfo() bool { 305 | return m.Name != "" || len(m.URLs) > 0 || len(m.Maintainers) > 0 || len(m.Other) > 0 306 | } 307 | 308 | // Suffix is one public suffix, represented in the standard domain 309 | // name format. 310 | type Suffix struct { 311 | blockInfo 312 | 313 | // Domain is the public suffix's domain name. 314 | Domain domain.Name 315 | } 316 | 317 | func (s *Suffix) Children() []Block { return nil } 318 | 319 | // PublicSuffix returns the public suffix of n according to this 320 | // Suffix rule taken in isolation. If n is not a child domain of s, 321 | // PublicSuffix returns (zeroValue, false).
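//
// A minimal usage sketch (the values are illustrative; domain.Parse
// is this repository's domain name parser):
//
//	n, _ := domain.Parse("foo.example.com")
//	suf, ok := s.PublicSuffix(n)
//	// With s holding the rule "example.com": suf is "example.com", ok is true.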
322 | func (s *Suffix) PublicSuffix(n domain.Name) (suffix domain.Name, ok bool) { 323 | if n.Equal(s.Domain) { 324 | return s.Domain, true 325 | } 326 | if _, ok := n.CutSuffix(s.Domain); ok { 327 | return s.Domain, true 328 | } 329 | return domain.Name{}, false 330 | } 331 | 332 | // RegisteredDomain returns the registered/registerable domain of n 333 | // according to this Suffix rule taken in isolation. The registered 334 | // domain is defined as n's public suffix plus one more child 335 | // label. If n is not a child domain of s, RegisteredDomain returns 336 | // (zeroValue, false). 337 | func (s *Suffix) RegisteredDomain(n domain.Name) (regDomain domain.Name, ok bool) { 338 | if prefix, ok := n.CutSuffix(s.Domain); ok { 339 | return s.Domain.MustAddPrefix(prefix[len(prefix)-1]), true 340 | } 341 | return domain.Name{}, false 342 | } 343 | 344 | // Wildcard is a wildcard public suffix, along with any exceptions to 345 | // that wildcard. 346 | type Wildcard struct { 347 | blockInfo 348 | 349 | // Domain is the base of the wildcard public suffix, without the 350 | // leading "*" label. 351 | Domain domain.Name 352 | // Exceptions are the domain.Labels that, when they appear in the 353 | // wildcard position of Domain, cause a FQDN to _not_ match this 354 | // wildcard. For example, if Domain="foo.com" and Exceptions=[bar, 355 | // qux], zot.foo.com is a public suffix, but bar.foo.com and 356 | // qux.foo.com are not. 357 | Exceptions []domain.Label 358 | } 359 | 360 | func (w *Wildcard) Children() []Block { return nil } 361 | 362 | // PublicSuffix returns the public suffix of n according to this 363 | // Wildcard rule taken in isolation. If n is not a child domain of w, 364 | // PublicSuffix returns (zeroValue, false, false). 365 | func (w *Wildcard) PublicSuffix(n domain.Name) (suffix domain.Name, isException, ok bool) { 366 | if prefix, ok := n.CutSuffix(w.Domain); ok { 367 | next := prefix[len(prefix)-1] 368 | if slices.Contains(w.Exceptions, next) { 369 | return w.Domain, true, true 370 | } 371 | 372 | return w.Domain.MustAddPrefix(next), false, true 373 | } 374 | return domain.Name{}, false, false 375 | } 376 | 377 | // RegisteredDomain returns the registered/registerable domain of n 378 | // according to this Wildcard rule taken in isolation. The registered 379 | // domain is defined as n's public suffix plus one more child 380 | // label. If n is not a child domain of w, RegisteredDomain returns 381 | // (zeroValue, false, false). 382 | func (w *Wildcard) RegisteredDomain(n domain.Name) (regDomain domain.Name, isException, ok bool) { 383 | if prefix, ok := n.CutSuffix(w.Domain); ok && len(prefix) >= 2 { 384 | next := prefix[len(prefix)-1] 385 | if slices.Contains(w.Exceptions, next) { 386 | return w.Domain.MustAddPrefix(next), true, true 387 | } 388 | 389 | return w.Domain.MustAddPrefix(prefix[len(prefix)-2:]...), false, true 390 | } 391 | return domain.Name{}, false, false 392 | } 393 | -------------------------------------------------------------------------------- /tools/internal/parser/parser.go: -------------------------------------------------------------------------------- 1 | // Package parser implements a validating parser for PSL files. 2 | package parser 3 | 4 | import ( 5 | "fmt" 6 | "strings" 7 | 8 | "github.com/publicsuffix/list/tools/internal/domain" 9 | ) 10 | 11 | // Parse parses bs as a PSL file and returns the parse result. 12 | // 13 | // The parser tries to keep going when it encounters errors.
Parse and 14 | // validation errors are accumulated and returned alongside the 15 | // parsed List. 16 | // 17 | // If the returned error list is non-empty, the parsed file 18 | // does not comply with the PSL format (documented at 19 | // https://github.com/publicsuffix/list/wiki/Format), or with PSL 20 | // submission guidelines 21 | // (https://github.com/publicsuffix/list/wiki/Guidelines). A List parsed 22 | // with errors should not be used to calculate public suffixes for FQDNs. 23 | func Parse(bs []byte) (*List, []error) { 24 | lines, errs := normalizeToUTF8Lines(bs) 25 | p := &parser{ 26 | input: lines, 27 | inputLine: 0, 28 | } 29 | for _, err := range errs { 30 | p.addError(err) 31 | } 32 | ret := p.parseTopLevel() 33 | return ret, p.errs 34 | } 35 | 36 | // parser is the state for a single PSL file parse. 37 | type parser struct { 38 | // input is the remaining unparsed and untokenized source text. 39 | input []string 40 | // inputLine is the offset for input[0]. That is, input[0] is line 41 | // number inputLine of the source text. 42 | inputLine int 43 | // peekBuf is a buffer holding at most one lookahead token. 44 | peekBuf any 45 | // errs are the accumulated parse errors so far. 46 | errs []error 47 | } 48 | 49 | // addError records err as a parse/validation error. 50 | // 51 | // (All errors are currently recorded uniformly; no legacy 52 | // exemptions are applied here.) 53 | func (p *parser) addError(err error) { 54 | p.errs = append(p.errs, err) 55 | } 56 | 57 | // The following types and functions are the lexer portion of the 58 | // parsing logic. This is a very simplistic lexer, since 59 | // normalizeToUTF8Lines has already done a lot of heavy lifting to 60 | // clean up the input. Each line of input is converted to a token for 61 | // that line's content. The parser then assembles that stream of 62 | // tokens into multiline blocks, and eventually into a parse tree. 63 | 64 | const ( 65 | sectionStartPrefix = "// ===BEGIN " 66 | sectionEndPrefix = "// ===END " 67 | sectionPrefix = "// ===" 68 | commentPrefix = "// " 69 | wildcardPrefix = "*." 70 | exceptionPrefix = "!" 71 | ) 72 | 73 | type line struct { 74 | SourceRange 75 | Text string 76 | } 77 | type tokenEOF struct{} 78 | type tokenBlank struct{ line } 79 | type tokenComment struct{ line } 80 | type tokenSectionUnknown struct{ line } 81 | type tokenSectionStart struct { 82 | line 83 | Name string 84 | } 85 | type tokenSectionEnd struct { 86 | line 87 | Name string 88 | } 89 | type tokenSuffix struct{ line } 90 | type tokenWildcard struct { 91 | line 92 | Suffix string 93 | } 94 | type tokenException struct { 95 | line 96 | Suffix string 97 | } 98 | 99 | // next lexes the next token of input and returns it. 100 | func (p *parser) next() (ret any) { 101 | if p.peekBuf != nil { 102 | ret := p.peekBuf 103 | p.peekBuf = nil 104 | return ret 105 | } 106 | 107 | if len(p.input) == 0 { 108 | return tokenEOF{} 109 | } 110 | 111 | // No matter what, next is going to emit the next line of p.input; 112 | // the rest of the function is just to determine what kind of 113 | // token to return.
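	// For instance (illustrative): "" lexes to tokenBlank, "// foo"
	// to tokenComment, "*.ck" to tokenWildcard, "!www.ck" to
	// tokenException, and a bare "com" to tokenSuffix.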
114 | src := line{ 115 | SourceRange: SourceRange{p.inputLine, p.inputLine + 1}, 116 | Text: p.input[0], 117 | } 118 | p.input = p.input[1:] 119 | p.inputLine++ 120 | 121 | switch { 122 | case src.Text == "": 123 | return tokenBlank{src} 124 | 125 | case strings.HasPrefix(src.Text, sectionStartPrefix): 126 | // To avoid repeated string processing in different portions 127 | // of the parser code, the lexer tears apart section markers 128 | // here to extract the section name. 129 | name := strings.TrimPrefix(src.Text, sectionStartPrefix) 130 | name, ok := strings.CutSuffix(name, "===") 131 | if !ok { 132 | return tokenSectionUnknown{src} 133 | } 134 | return tokenSectionStart{src, name} 135 | case strings.HasPrefix(src.Text, sectionEndPrefix): 136 | name := strings.TrimPrefix(src.Text, sectionEndPrefix) 137 | name, ok := strings.CutSuffix(name, "===") 138 | if !ok { 139 | return tokenSectionUnknown{src} 140 | } 141 | return tokenSectionEnd{src, name} 142 | case strings.HasPrefix(src.Text, sectionPrefix): 143 | return tokenSectionUnknown{src} 144 | 145 | case strings.HasPrefix(src.Text, commentPrefix): 146 | // Similarly, the following do some light processing of the 147 | // input so that this doesn't need to be repeated in several 148 | // portions of the parser. 149 | src.Text = strings.TrimPrefix(src.Text, commentPrefix) 150 | return tokenComment{src} 151 | case strings.HasPrefix(src.Text, wildcardPrefix): 152 | return tokenWildcard{src, strings.TrimPrefix(src.Text, wildcardPrefix)} 153 | case strings.HasPrefix(src.Text, exceptionPrefix): 154 | return tokenException{src, strings.TrimPrefix(src.Text, exceptionPrefix)} 155 | 156 | default: 157 | return tokenSuffix{src} 158 | } 159 | } 160 | 161 | // peek returns the next token of input, without consuming it. 162 | func (p *parser) peek() any { 163 | if p.peekBuf == nil { 164 | p.peekBuf = p.next() 165 | } 166 | return p.peekBuf 167 | } 168 | 169 | // The rest of this file is the parser itself. It follows the common 170 | // recursive descent structure. 171 | 172 | // blockEmitter returns a function that appends blocks to a given 173 | // output list, and also updates an output SourceRange to cover the 174 | // superset of all emitted blocks. 175 | // 176 | // This is a helper to make the functions that parse intermediate AST 177 | // nodes (which have to accumulate a list of children) more readable. 178 | func blockEmitter(out *[]Block, srcRange *SourceRange) func(...Block) { 179 | 180 | return func(bs ...Block) { 181 | for _, b := range bs { 182 | if b == nil { 183 | // Sub-parsers sometimes return nil to indicate the 184 | // thing they tried to parse was bad and they have 185 | // nothing to contribute to the output. 186 | continue 187 | } 188 | 189 | *out = append(*out, b) 190 | 191 | if srcRange == nil { 192 | continue 193 | } else if *srcRange == (SourceRange{}) { 194 | // Zero value, this is the first emitted block. 195 | *srcRange = b.SrcRange() 196 | } else { 197 | *srcRange = (*srcRange).merge(b.SrcRange()) 198 | } 199 | } 200 | } 201 | } 202 | 203 | // parseTopLevel parses the top level of a PSL file.
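//
// At this level the parser accepts, in any order: blank lines
// (skipped), comments (which may begin a suffix block), sections, and
// bare suffix entries. A stray section end or an unknown section
// marker is recorded as an error and skipped.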
204 | func (p *parser) parseTopLevel() *List { 205 | ret := &List{} 206 | emit := blockEmitter(&ret.Blocks, nil) 207 | 208 | for { 209 | switch tok := p.peek().(type) { 210 | case tokenEOF: 211 | return ret 212 | case tokenBlank: 213 | p.next() 214 | case tokenComment: 215 | emit(p.parseCommentOrSuffixBlock()) 216 | case tokenSectionStart: 217 | emit(p.parseSection()) 218 | case tokenSectionEnd: 219 | p.addError(ErrUnstartedSection{tok.SourceRange, tok.Name}) 220 | p.next() 221 | case tokenSectionUnknown: 222 | p.addError(ErrUnknownSectionMarker{tok.SourceRange}) 223 | p.next() 224 | case tokenSuffix, tokenWildcard, tokenException: 225 | emit(p.parseSuffixBlock(nil)) 226 | default: 227 | panic("unhandled token") 228 | } 229 | } 230 | } 231 | 232 | // parseSection parses the contents of a PSL file section. 233 | func (p *parser) parseSection() *Section { 234 | // Initialize with the start-of-section marker's data. 235 | start := p.next().(tokenSectionStart) 236 | ret := &Section{ 237 | blockInfo: blockInfo{ 238 | SourceRange: start.SourceRange, 239 | }, 240 | Name: start.Name, 241 | } 242 | emit := blockEmitter(&ret.Blocks, &ret.SourceRange) 243 | 244 | for { 245 | switch tok := p.peek().(type) { 246 | case tokenEOF: 247 | p.addError(ErrUnclosedSection{ret}) 248 | return ret 249 | case tokenBlank: 250 | p.next() 251 | case tokenComment: 252 | emit(p.parseCommentOrSuffixBlock()) 253 | case tokenSectionStart: 254 | // The PSL doesn't allow nested sections, so we pretend 255 | // like the inner section never existed and grab all its 256 | // blocks for ourselves. Still record an error for the 257 | // nested section though. 258 | inner := p.parseSection() 259 | emit(inner.Blocks...) 260 | p.addError(ErrNestedSection{inner.SourceRange, inner.Name, ret}) 261 | case tokenSectionEnd: 262 | p.next() 263 | if tok.Name != ret.Name { 264 | p.addError(ErrMismatchedSection{tok.SourceRange, tok.Name, ret}) 265 | } 266 | ret.SourceRange.LastLine = tok.SourceRange.LastLine 267 | return ret 268 | case tokenSectionUnknown: 269 | p.next() 270 | p.addError(ErrUnknownSectionMarker{tok.SourceRange}) 271 | case tokenSuffix, tokenWildcard, tokenException: 272 | emit(p.parseSuffixBlock(nil)) 273 | default: 274 | panic("unhandled token") 275 | } 276 | } 277 | } 278 | 279 | // parseCommentOrSuffixBlock parses a comment, then either returns it 280 | // as a lone comment or chains into suffix block parsing, depending on 281 | // what follows the comment. 282 | // 283 | // This is used to resolve an ambiguity in the PSL format when parsing 284 | // linearly: if we see a comment, that could be a standalone comment, 285 | // or it could be the beginning of a suffix block. In the latter case, 286 | // it's very important to attach the comment to the suffix block, 287 | // since it contains metadata about those suffixes. 288 | func (p *parser) parseCommentOrSuffixBlock() Block { 289 | comment := p.parseComment() 290 | switch p.peek().(type) { 291 | case tokenSuffix, tokenWildcard, tokenException: 292 | return p.parseSuffixBlock(comment) 293 | default: 294 | return comment 295 | } 296 | } 297 | 298 | // parseSuffixBlock parses a suffix block, starting with the provided 299 | // optional initial comment. 
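//
// A suffix block ends at the first blank line or at end of input;
// section markers inside the block are recorded as errors but do not
// terminate it.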
300 | func (p *parser) parseSuffixBlock(initialComment *Comment) *Suffixes { 301 | ret := &Suffixes{ 302 | Info: extractMaintainerInfo(initialComment), 303 | } 304 | emit := blockEmitter(&ret.Blocks, &ret.SourceRange) 305 | 306 | if initialComment != nil { 307 | emit(initialComment) 308 | } 309 | 310 | for { 311 | switch tok := p.peek().(type) { 312 | case tokenBlank: 313 | return ret 314 | case tokenComment: 315 | emit(p.parseComment()) 316 | case tokenSectionUnknown: 317 | p.next() 318 | p.addError(ErrUnknownSectionMarker{tok.SourceRange}) 319 | case tokenSectionStart: 320 | p.next() 321 | p.addError(ErrSectionInSuffixBlock{tok.SourceRange}) 322 | case tokenSectionEnd: 323 | p.next() 324 | p.addError(ErrSectionInSuffixBlock{tok.SourceRange}) 325 | case tokenSuffix: 326 | emit(p.parseSuffix()) 327 | case tokenWildcard: 328 | emit(p.parseWildcard()) 329 | case tokenException: 330 | // Note we don't emit here; parseException receives the 331 | // list of existing blocks and attaches the exception to 332 | // the corresponding wildcard entry. 333 | p.parseException(ret.Blocks) 334 | case tokenEOF: 335 | return ret 336 | default: 337 | panic("unhandled token") 338 | } 339 | } 340 | } 341 | 342 | // parseSuffix parses a basic public suffix entry (i.e. not a wildcard 343 | // or an exception). 344 | func (p *parser) parseSuffix() Block { 345 | tok := p.next().(tokenSuffix) 346 | 347 | domain, err := domain.Parse(tok.Text) 348 | if err != nil { 349 | p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Text, err}) 350 | return nil 351 | } 352 | 353 | return &Suffix{ 354 | blockInfo: blockInfo{ 355 | SourceRange: tok.SourceRange, 356 | }, 357 | Domain: domain, 358 | } 359 | } 360 | 361 | // parseWildcard parses a public suffix wildcard entry, of the form 362 | // "*.example.com". 363 | func (p *parser) parseWildcard() Block { 364 | tok := p.next().(tokenWildcard) 365 | 366 | domain, err := domain.Parse(tok.Suffix) 367 | if err != nil { 368 | p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, err}) 369 | return nil 370 | } 371 | 372 | return &Wildcard{ 373 | blockInfo: blockInfo{ 374 | SourceRange: tok.SourceRange, 375 | }, 376 | Domain: domain, 377 | } 378 | } 379 | 380 | // parseException parses a public suffix wildcard exception, of the 381 | // form "!foo.example.com". The parsed exception is attached to the 382 | // related Wildcard block in previous. If no such block exists, the 383 | // exception is dropped and a parse error recorded. 384 | func (p *parser) parseException(previous []Block) { 385 | tok := p.next().(tokenException) 386 | 387 | domain, err := domain.Parse(tok.Suffix) 388 | if err != nil { 389 | p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, err}) 390 | return 391 | } 392 | 393 | for _, block := range previous { 394 | w, ok := block.(*Wildcard) 395 | if !ok { 396 | continue 397 | } 398 | 399 | if rest, ok := domain.CutSuffix(w.Domain); ok && len(rest) == 1 { 400 | w.Exceptions = append(w.Exceptions, domain.Labels()[0]) 401 | return 402 | } 403 | } 404 | p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, fmt.Errorf("exception %q does not match any wildcard", tok.Suffix)}) 405 | } 406 | 407 | // parseComment parses a multiline comment block.
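//
// Consecutive comment lines are merged into a single Comment whose
// SourceRange spans all of them; the first non-comment token is left
// unconsumed for the caller.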
408 | func (p *parser) parseComment() *Comment { 409 | tok := p.next().(tokenComment) 410 | ret := &Comment{ 411 | blockInfo: blockInfo{ 412 | SourceRange: tok.SourceRange, 413 | }, 414 | Text: []string{tok.Text}, 415 | } 416 | for { 417 | if tok, ok := p.peek().(tokenComment); ok { 418 | p.next() 419 | ret.SourceRange = ret.SourceRange.merge(tok.SourceRange) 420 | ret.Text = append(ret.Text, tok.Text) 421 | } else { 422 | return ret 423 | } 424 | } 425 | } 426 | --------------------------------------------------------------------------------