├── .gitignore ├── tools ├── private_domains_checker │ ├── .gitignore │ ├── requirements.txt │ ├── TestPSLPrivateDomainsProcessor.py │ └── README.md ├── convert_tests ├── patchnewgtlds ├── go.mod ├── internal │ ├── domain │ │ ├── update_idna_testdata.go │ │ └── domain_test.go │ ├── parser │ │ ├── file_test.go │ │ ├── write.go │ │ ├── unicode.go │ │ ├── write_test.go │ │ ├── text_test.go │ │ ├── text.go │ │ ├── metadata_test.go │ │ ├── errors.go │ │ ├── validate_test.go │ │ ├── metadata.go │ │ ├── diff.go │ │ ├── parser_test.go │ │ ├── file.go │ │ └── parser.go │ ├── githistory │ │ └── history.go │ └── github │ │ └── pr.go ├── go.sum └── psltool │ └── psltool.go ├── linter ├── test_section1.input ├── test_allowedchars.input ├── test_section2.expected ├── test_section3.expected ├── test_NFKC.expected ├── test_section1.expected ├── test_punycode.expected ├── test_section4.input ├── test_section4.expected ├── test_allowedchars.expected ├── test_punycode.input ├── test_section2.input ├── test_wildcard.expected ├── test_section3.input ├── test_dots.expected ├── test_wildcard.input ├── test_NFKC.input ├── test_dots.input ├── test_spaces.expected ├── test_exception.expected ├── test_duplicate.expected ├── test_duplicate.input ├── test_exception.input ├── test_spaces.input ├── README.md ├── pslint_selftest.sh └── pslint.py ├── tests ├── README ├── test_psl.js ├── test_bug414122.js ├── tests.txt ├── prepare_tlds.py └── test_psl.txt ├── .github ├── workflows │ ├── deploy-site.yml │ ├── validate.yml │ ├── psltool_pr_check.yml │ ├── test.yml │ ├── psltool-fmt.yml │ └── tld-update.yml └── pull_request_template.md ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── SECURITY.md ├── Makefile ├── CONTRIBUTING.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | linter/log 2 | libpsl 3 | coverage.out 4 | -------------------------------------------------------------------------------- /tools/private_domains_checker/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | __pycache__ 3 | data/*.csv -------------------------------------------------------------------------------- /tools/private_domains_checker/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | requests 3 | whoisdomain -------------------------------------------------------------------------------- /linter/test_section1.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - no section at all 3 | 4 | example.com 5 | -------------------------------------------------------------------------------- /linter/test_allowedchars.input: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lovablelabs/list/main/linter/test_allowedchars.input -------------------------------------------------------------------------------- /linter/test_section2.expected: -------------------------------------------------------------------------------- 1 | 11: warning: 2 ICANN sections found 2 | 11: warning: No PRIVATE section found 3 | -------------------------------------------------------------------------------- /linter/test_section3.expected: -------------------------------------------------------------------------------- 1 | 11: warning: No ICANN section found 2 | 11: warning: 2 PRIVATE sections found 3 | -------------------------------------------------------------------------------- 
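The `test_*.input` / `test_*.expected` pairs above follow one convention: each `.input` file is run through the linter, and the diagnostics it prints (lines of the form `<line>: <severity>: <message>`) are diffed against the matching `.expected` file. Below is a minimal Python sketch of that harness logic; it mirrors what `linter/pslint_selftest.sh` does in shell, and assumes only that `pslint.py` prints diagnostics to stdout/stderr and that the fixtures sit in the current directory. (The real shell script also temporarily adds a CR to `test_spaces.input` to exercise CRLF handling, which this sketch omits.)

```
#!/usr/bin/env python3
# Sketch of the linter selftest harness: lint each test_*.input and
# compare the output against the matching test_*.expected file.
import glob
import subprocess
import sys

def run_selftest(linter="./pslint.py"):
    failures = 0
    for inp in sorted(glob.glob("test_*.input")):
        name = inp.rsplit(".", 1)[0]
        proc = subprocess.run([linter, inp], capture_output=True, text=True)
        got = proc.stdout + proc.stderr
        with open(name + ".expected", encoding="utf-8") as f:
            want = f.read()
        ok = got == want
        print(f"{name}: {'OK' if ok else 'FAILED'}")
        failures += not ok
    return 1 if failures else 0

if __name__ == "__main__":
    sys.exit(run_selftest())
```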
/linter/test_NFKC.expected: -------------------------------------------------------------------------------- 1 | 9: error: Rule must be NFKC: 'südtirol.it' 2 | 11: warning: No PRIVATE section found 3 | -------------------------------------------------------------------------------- /linter/test_section1.expected: -------------------------------------------------------------------------------- 1 | 4: error: Rule outside of section: 'example.com' 2 | 4: warning: No ICANN section found 3 | 4: warning: No PRIVATE section found 4 | -------------------------------------------------------------------------------- /linter/test_punycode.expected: -------------------------------------------------------------------------------- 1 | 7: error: Punycode found: 'a.xn--0zwm56d' 2 | 8: error: Double minus found: 'a.ex--ample.com' 3 | 10: warning: No PRIVATE section found 4 | -------------------------------------------------------------------------------- /linter/test_section4.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - ICANN section improperly closed 3 | 4 | // ===BEGIN ICANN DOMAINS=== 5 | 6 | example.com 7 | 8 | // ===END PRIVATE DOMAINS=== 9 | -------------------------------------------------------------------------------- /linter/test_section4.expected: -------------------------------------------------------------------------------- 1 | 8: error: Unexpected end of section: '// ===END PRIVATE DOMAINS===' 2 | 8: error: ICANN section not closed 3 | 8: warning: No PRIVATE section found 4 | -------------------------------------------------------------------------------- /linter/test_allowedchars.expected: -------------------------------------------------------------------------------- 1 | 10: error: Illegal character: 'a.exam#ple.com' 2 | 11: error: Illegal character: 'b.exam ple.com' 3 | 13: error: Invalid UTF-8 character 4 | 15: warning: No PRIVATE section found 5 | -------------------------------------------------------------------------------- /linter/test_punycode.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - label is punycode 3 | // - label has double minus 4 | 5 | // ===BEGIN ICANN DOMAINS=== 6 | 7 | a.xn--0zwm56d 8 | a.ex--ample.com 9 | 10 | // ===END ICANN DOMAINS=== 11 | -------------------------------------------------------------------------------- /linter/test_section2.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - two ICANN sections 3 | 4 | // ===BEGIN ICANN DOMAINS=== 5 | 6 | example.com 7 | 8 | // ===END ICANN DOMAINS=== 9 | 10 | // ===BEGIN ICANN DOMAINS=== 11 | // ===END ICANN DOMAINS=== 12 | -------------------------------------------------------------------------------- /linter/test_wildcard.expected: -------------------------------------------------------------------------------- 1 | 11: error: Illegal character: '**.com' 2 | 12: error: Illegal character: 'a*.com' 3 | 13: error: Illegal character: 'b.*.com' 4 | 14: error: Illegal character: 'a.b.*' 5 | 16: warning: No PRIVATE section found 6 | -------------------------------------------------------------------------------- /linter/test_section3.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - two PRIVATE sections 3 | 4 | // ===BEGIN PRIVATE DOMAINS=== 5 | 6 | example.com 7 | 8 | // ===END PRIVATE DOMAINS=== 9 | 10 | // ===BEGIN PRIVATE DOMAINS=== 11 | // ===END PRIVATE DOMAINS=== 12 | 
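The `test_section1` through `test_section4` fixtures all exercise the same bookkeeping: rules must appear between `// ===BEGIN ... DOMAINS===` and `// ===END ... DOMAINS===` markers, each of the ICANN and PRIVATE sections must appear exactly once, and an END marker must match the currently open section. A compact sketch of that state machine follows, under the assumption that this is roughly what `pslint.py` checks; the real linter's exact messages and line-number reporting differ in detail.

```
# Sketch of the section checks the test_section* fixtures exercise
# (not the real pslint.py implementation).
import re

BEGIN = re.compile(r"^// ===BEGIN (ICANN|PRIVATE) DOMAINS===$")
END = re.compile(r"^// ===END (ICANN|PRIVATE) DOMAINS===$")

def check_sections(lines):
    problems = []
    counts = {"ICANN": 0, "PRIVATE": 0}
    current = None  # name of the currently open section, if any
    for n, line in enumerate(lines, 1):
        if m := BEGIN.match(line):
            counts[m.group(1)] += 1
            current = m.group(1)
        elif m := END.match(line):
            if current == m.group(1):
                current = None  # properly closed
            else:
                problems.append((n, f"Unexpected end of section: {line!r}"))
        elif line and not line.startswith("//") and current is None:
            problems.append((n, f"Rule outside of section: {line!r}"))
    if current is not None:
        problems.append((len(lines), f"{current} section not closed"))
    for name, seen in counts.items():
        if seen == 0:
            problems.append((len(lines), f"No {name} section found"))
        elif seen > 1:
            problems.append((len(lines), f"{seen} {name} sections found"))
    return problems
```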
-------------------------------------------------------------------------------- /tools/convert_tests: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Written 2016 by Tim Ruehsen (tim dot ruehsen at gmx dot de) 4 | # 5 | # Convert test_psl.txt to tests.txt (sed hack) 6 | 7 | sed -e "s/checkPublicSuffix('*\([^']*\)'*, '*\([^']*\)'*);/\1 \2/g" test_psl.txt >tests.txt 8 | -------------------------------------------------------------------------------- /linter/test_dots.expected: -------------------------------------------------------------------------------- 1 | 9: error: Leading/trailing or multiple dot: '.a.example.com' 2 | 10: error: Leading/trailing or multiple dot: 'b.example.com.' 3 | 11: error: Leading/trailing or multiple dot: 'c..example.com' 4 | 13: warning: No PRIVATE section found 5 | -------------------------------------------------------------------------------- /linter/test_wildcard.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - valid wildcard usage 3 | // - invalid wildcard usage 4 | 5 | // ===BEGIN ICANN DOMAINS=== 6 | 7 | // valid 8 | *.com 9 | 10 | // invalid 11 | **.com 12 | a*.com 13 | b.*.com 14 | a.b.* 15 | 16 | // ===END ICANN DOMAINS=== 17 | -------------------------------------------------------------------------------- /tests/README: -------------------------------------------------------------------------------- 1 | prepare_tlds.py: 2 | 3 | This is a copy of a file mastered in Mozilla's Hg repo at: 4 | https://hg.mozilla.org/mozilla-central/file/default/netwerk/dns/prepare_tlds.py 5 | We include it here so we can check that it still produces valid output when 6 | the PSL changes. 7 | 8 | -------------------------------------------------------------------------------- /linter/test_NFKC.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - label contains non-NFKC character(s) 3 | // 4 | // best viewed with 'LC_ALL=C.UTF-8 vi ' (or any other UTF-8 locale) 5 | 6 | // ===BEGIN ICANN DOMAINS=== 7 | 8 | südtirol.it 9 | südtirol.it 10 | 11 | // ===END ICANN DOMAINS=== 12 | -------------------------------------------------------------------------------- /linter/test_dots.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - leading dot 3 | // - trailing dot 4 | // - consecutive dots 5 | 6 | // ===BEGIN ICANN DOMAINS=== 7 | 8 | // example.com: https://www.iana.org/domains/reserved 9 | .a.example.com 10 | b.example.com. 
11 | c..example.com 12 | 13 | // ===END ICANN DOMAINS=== 14 | -------------------------------------------------------------------------------- /linter/test_spaces.expected: -------------------------------------------------------------------------------- 1 | 12: warning: Leading/Trailing whitespace: ' a.example.com' 2 | 13: warning: Leading/Trailing whitespace: 'b.example.com ' 3 | 14: warning: Leading/Trailing whitespace: '\tc.example.com' 4 | 15: warning: Leading/Trailing whitespace: 'd.example.com\t' 5 | 17: warning: Leading/Trailing whitespace: ' ' 6 | 19: warning: No PRIVATE section found 7 | -------------------------------------------------------------------------------- /.github/workflows/deploy-site.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: deploy website 3 | on: 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | 9 | permissions: {} 10 | 11 | jobs: 12 | upload-website: 13 | uses: publicsuffix/publicsuffix.org/.github/workflows/deploy-site.yaml@main 14 | permissions: 15 | contents: read 16 | id-token: write 17 | secrets: inherit 18 | -------------------------------------------------------------------------------- /linter/test_exception.expected: -------------------------------------------------------------------------------- 1 | 17: error: Leading/trailing or multiple dot: '!.example.com' 2 | 18: error: Illegal character: 'w!w.example.com' 3 | 19: error: Found doublette/ambiguity (previous line was 12): '!www.example.com' 4 | 20: error: Exception without previous wildcard: '!a.b.example.com' 5 | 21: error: Exception without previous wildcard: '!a.c.example.com' 6 | 23: warning: No PRIVATE section found 7 | -------------------------------------------------------------------------------- /tools/patchnewgtlds: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -o pipefail 5 | set -x 6 | 7 | if ! [ -x "$(command -v go)" ] 8 | then 9 | echo "error: a go binary in \$PATH is required to run newgtlds.go" >&2 10 | exit 1 11 | fi 12 | 13 | SCRIPT=$(realpath "$0") 14 | BASEDIR=$(dirname "$SCRIPT") 15 | 16 | go run -C "$BASEDIR/" . 
\ 17 | -overwrite \ 18 | -psl-dat-file="$BASEDIR/../public_suffix_list.dat" 19 | -------------------------------------------------------------------------------- /linter/test_duplicate.expected: -------------------------------------------------------------------------------- 1 | 9: error: Found doublette/ambiguity (previous line was 8): '*.com' 2 | 13: error: Found doublette/ambiguity (previous line was 12): '!www.com' 3 | 17: error: Found doublette/ambiguity (previous line was 16): '*.example.com' 4 | 21: error: Found doublette/ambiguity (previous line was 20): 'example1.com' 5 | 24: error: Found doublette/ambiguity (previous line was 17): 'example.com' 6 | 26: warning: No PRIVATE section found 7 | -------------------------------------------------------------------------------- /linter/test_duplicate.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - duplicate rules (plain, wildcard, exception) 3 | // - redundant/overlapping rules 4 | 5 | // ===BEGIN ICANN DOMAINS=== 6 | 7 | // *.com implicitly includes .com 8 | com 9 | *.com 10 | 11 | // double exception 12 | !www.com 13 | !www.com 14 | 15 | // double wildcard 16 | *.example.com 17 | *.example.com 18 | 19 | // double plain rule 20 | example1.com 21 | example1.com 22 | 23 | // redundant/overlapping rule 24 | example.com 25 | 26 | // ===END ICANN DOMAINS=== 27 | -------------------------------------------------------------------------------- /linter/test_exception.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - valid exception 3 | // - invalid exceptions 4 | // - same exception twice 5 | // - exception without wildcard 6 | // - exception with prevailing '*' rule (!localhost) 7 | 8 | // ===BEGIN ICANN DOMAINS=== 9 | 10 | // valid 11 | *.example.com 12 | !www.example.com 13 | !localhost 14 | c.example.com 15 | 16 | // invalid 17 | !.example.com 18 | w!w.example.com 19 | !www.example.com 20 | !a.b.example.com 21 | !a.c.example.com 22 | 23 | // ===END ICANN DOMAINS=== 24 | -------------------------------------------------------------------------------- /linter/test_spaces.input: -------------------------------------------------------------------------------- 1 | // test: 2 | // - leading space 3 | // - trailing space, empty line with spaces 4 | // - leading tab 5 | // - trailing tab 6 | // - line ends with CRLF (pslint_selftest will add one to e.example.com and remove it after testing) 7 | // - empty line with spaces 8 | 9 | // ===BEGIN ICANN DOMAINS=== 10 | 11 | // example.com: https://www.iana.org/domains/reserved 12 | a.example.com 13 | b.example.com 14 | c.example.com 15 | d.example.com 16 | e.example.com 17 | 18 | 19 | // ===END ICANN DOMAINS=== 20 | -------------------------------------------------------------------------------- /.github/workflows/validate.yml: -------------------------------------------------------------------------------- 1 | name: validate 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | validate: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 12 | - uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5.4.0 13 | with: 14 | go-version: "stable" 15 | - name: run validations 16 | run: | 17 | cd tools 18 | go run ./psltool validate ../public_suffix_list.dat 19 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1
| language: c 2 | compiler: gcc 3 | 4 | script: 5 | - make 6 | - go test -v -coverprofile=coverage.out tools/*.go 7 | 8 | go: 9 | - "1.15.x" 10 | 11 | addons: 12 | apt: 13 | packages: 14 | - python3 15 | - autoconf 16 | - automake 17 | - autopoint 18 | - libtool 19 | - gettext 20 | - libidn11-dev 21 | - libidn2-0 22 | - libidn2-0-dev 23 | - libicu-dev 24 | - libunistring0 25 | - libunistring-dev 26 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ### Community Participation Guidelines 2 | Your participation in the Public Suffix List project should follow the [Mozilla Community Participation Guidelines](https://www.mozilla.org/en-US/about/governance/policies/participation/ "Mozilla Community Participation Guidelines") as well as the [GitHub Community Participation Guidelines](https://help.github.com/en/github/site-policy/github-community-guidelines "GitHub Community Participation Guidelines"). Behavior that falls into the areas forbidden by either document is unwelcome and will result in further escalation. 3 | -------------------------------------------------------------------------------- /tools/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/publicsuffix/list/tools 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.24.2 6 | 7 | require ( 8 | github.com/google/go-cmp v0.6.0 9 | golang.org/x/net v0.38.0 10 | golang.org/x/text v0.23.0 11 | ) 12 | 13 | require ( 14 | github.com/creachadair/command v0.1.13 15 | github.com/creachadair/flax v0.0.0-20240525192034-44db93b3a8ad 16 | github.com/creachadair/mds v0.15.2 17 | github.com/creachadair/taskgroup v0.9.0 18 | github.com/google/go-github/v63 v63.0.0 19 | github.com/natefinch/atomic v1.0.1 20 | ) 21 | 22 | require github.com/google/go-querystring v1.1.0 // indirect 23 | -------------------------------------------------------------------------------- /.github/workflows/psltool_pr_check.yml: -------------------------------------------------------------------------------- 1 | name: psltool PR check 2 | 3 | on: 4 | pull_request: 5 | 6 | permissions: {} 7 | 8 | jobs: 9 | validate: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 14 | - uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5.4.0 15 | with: 16 | go-version: "stable" 17 | - name: run validations 18 | run: | 19 | cd tools 20 | go run ./psltool fmt -d ../public_suffix_list.dat && go run ./psltool check-pr --gh-owner ${{ github.event.repository.owner.login }} --gh-repo ${{ github.event.repository.name }} --online-checks ${{ github.event.pull_request.number }} 21 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: [push, pull_request] 3 | 4 | permissions: 5 | contents: read 6 | 7 | jobs: 8 | make-test: 9 | name: Unit tests 10 | runs-on: ubuntu-22.04 11 | steps: 12 | - name: Check out code 13 | uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 14 | 15 | - name: Set up Go 16 | uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0 17 | with: 18 | go-version: 'stable' 19 | 20 | - name: Run Go unit tests 21 | run: go test -C ./tools -v . 
22 | 23 | - name: Install dependencies 24 | run: sudo apt install -y autopoint 25 | 26 | - name: Run makefile tests 27 | run: make test 28 | -------------------------------------------------------------------------------- /linter/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a linter for the Public Suffix List. 2 | 3 | Before you commit any changes to the PSL, please use the 4 | linter to check the syntax. 5 | 6 | Usage 7 | ===== 8 | 9 | (from the repo's main directory) 10 | 11 | $ linter/pslint.py public_suffix_list.dat 12 | 13 | $? is set to 0 on success, else it is set to 1. 14 | 15 | 16 | Selftest 17 | ======== 18 | 19 | Every change to pslint.py should be followed by a self-test. 20 | 21 | ``` 22 | $ cd linter 23 | $ ./pslint_selftest.sh 24 | test_allowedchars: OK 25 | test_dots: OK 26 | test_duplicate: OK 27 | test_exception: OK 28 | test_punycode: OK 29 | test_section1: OK 30 | test_section2: OK 31 | test_section3: OK 32 | test_section4: OK 33 | test_spaces: OK 34 | test_wildcard: OK 35 | ``` 36 | -------------------------------------------------------------------------------- /linter/pslint_selftest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rc=0 4 | rm -rf log 5 | mkdir -p log 6 | 7 | # add CR if missing, since a literal CR may not survive git 8 | sed -i -e 's/^e.example.com$/e.example.com\r/g' test_spaces.input 9 | 10 | for file in `ls *.input|cut -d'.' -f1`; do 11 | echo -n "${file}: " 12 | ./pslint.py ${file}.input >log/${file}.log 2>&1 13 | diff -u ${file}.expected log/${file}.log >log/${file}.diff 14 | if [ $? -eq 0 ]; then 15 | echo OK 16 | rm log/${file}.diff log/${file}.log 17 | else 18 | echo FAILED 19 | cat log/${file}.diff 20 | rc=1 21 | fi 22 | done 23 | 24 | # remove CR, to not appear as changed to git 25 | sed -i -e 's/^e.example.com\r$/e.example.com/g' test_spaces.input 26 | 27 | if [ $rc -eq 0 ]; then 28 | rmdir log 29 | fi 30 | 31 | exit $rc 32 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | Security updates are applied only to the repository itself. 4 | 5 | ## Reporting a Vulnerability 6 | 7 | Reports are limited to repo matters. Any vulnerability reports related to the addition or removal of PSL entries in the .dat file shall be rejected and referred to filing pull requests, which should mention the alleged urgency. 8 | 9 | If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. 10 | 11 | Please disclose it at [security advisory](https://github.com/publicsuffix/list/security/advisories/new) and send an email with the link to the newly filed issue to [security@mozilla.org](mailto:security@mozilla.org) to expedite the review on our end. 12 | 13 | This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
14 | -------------------------------------------------------------------------------- /.github/workflows/psltool-fmt.yml: -------------------------------------------------------------------------------- 1 | name: psltool-fmt 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: 7 | pull-requests: write 8 | contents: write # This should be okay since you cannot easily run this on 9 | # something like an untrusted PR. PRs are not offered in the GUI. 10 | 11 | jobs: 12 | validate: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | - uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5.4.0 18 | with: 19 | go-version: "stable" 20 | - name: run validations 21 | run: | 22 | cd tools 23 | go run ./psltool fmt ../public_suffix_list.dat 24 | - name: create PR 25 | uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8 26 | with: 27 | commit-message: Apply formatting using `psltool fmt` 28 | branch: psltool-fmt 29 | title: 'Automatic PR for workflow `psltool-fmt`' 30 | body: 'Automatic PR for formatting workflow using `psltool fmt`' 31 | -------------------------------------------------------------------------------- /tools/internal/domain/update_idna_testdata.go: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | 3 | // This script is run by `go generate` (see domains_test.go) to 4 | // download a new copy of the IDNA test inputs. They are stored 5 | // verbatim as provided by the Unicode Consortium to make it easy to 6 | // verify that it's an unaltered file, and gets parsed for the 7 | // information relevant to this package in domains_test.go. 8 | package main 9 | 10 | import ( 11 | "fmt" 12 | "log" 13 | "net/http" 14 | 15 | "github.com/natefinch/atomic" 16 | "golang.org/x/net/idna" 17 | ) 18 | 19 | const ( 20 | idnaTestVectorsURLPattern = "https://www.unicode.org/Public/idna/%s/IdnaTestV2.txt" 21 | idnaTestVectorsPath = "testdata/idna_test_vectors.txt" 22 | ) 23 | 24 | func main() { 25 | // New releases of Unicode can alter the outcome of existing 26 | // tests, so it's very important to use the test vectors for the 27 | // specific version of Unicode that x/net/idna uses. 
28 | url := fmt.Sprintf(idnaTestVectorsURLPattern, idna.UnicodeVersion) 29 | 30 | resp, err := http.Get(url) 31 | if err != nil { 32 | log.Fatal(err) 33 | } else if resp.StatusCode != http.StatusOK { 34 | log.Fatalf("Fetching %q: %s", url, resp.Status) 35 | } 36 | defer resp.Body.Close() 37 | 38 | if err := atomic.WriteFile(idnaTestVectorsPath, resp.Body); err != nil { 39 | log.Fatalf("Writing %q: %v", idnaTestVectorsPath, err) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/test_psl.js: -------------------------------------------------------------------------------- 1 | var etld = Cc["@mozilla.org/network/effective-tld-service;1"] 2 | .getService(Ci.nsIEffectiveTLDService); 3 | 4 | var idna = Cc["@mozilla.org/network/idn-service;1"] 5 | .getService(Ci.nsIIDNService); 6 | 7 | var Cr = Components.results; 8 | 9 | function run_test() 10 | { 11 | var file = do_get_file("data/test_psl.txt"); 12 | var ios = Cc["@mozilla.org/network/io-service;1"] 13 | .getService(Ci.nsIIOService); 14 | var uri = ios.newFileURI(file); 15 | var scriptLoader = Cc["@mozilla.org/moz/jssubscript-loader;1"] 16 | .getService(Ci.mozIJSSubScriptLoader); 17 | var srvScope = {}; 18 | scriptLoader.loadSubScript(uri.spec, srvScope, "utf-8"); 19 | } 20 | 21 | function checkPublicSuffix(host, expectedSuffix) 22 | { 23 | var actualSuffix = null; 24 | try { 25 | actualSuffix = etld.getBaseDomainFromHost(host); 26 | } catch (e if e.result == Cr.NS_ERROR_INSUFFICIENT_DOMAIN_LEVELS || 27 | e.result == Cr.NS_ERROR_ILLEGAL_VALUE) { 28 | } 29 | // The EffectiveTLDService always gives back punycoded labels. 30 | // The test suite wants to get back what it put in. 31 | if (actualSuffix !== null && expectedSuffix !== null && 32 | /(^|\.)xn--/.test(actualSuffix) && !/(^|\.)xn--/.test(expectedSuffix)) { 33 | actualSuffix = idna.convertACEtoUTF8(actualSuffix); 34 | } 35 | do_check_eq(actualSuffix, expectedSuffix); 36 | } 37 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | Dir = $(PWD) 2 | Options = --with-psl-file=$(Dir)/public_suffix_list.dat --with-psl-testfile=$(Dir)/tests/tests.txt 3 | 4 | all: test 5 | 6 | test: test-syntax test-rules 7 | 8 | test-rules: libpsl-libicu 9 | 10 | test-syntax: 11 | @ \ 12 | cd linter; \ 13 | ./pslint_selftest.sh; \ 14 | ./pslint.py ../public_suffix_list.dat; 15 | 16 | libpsl-config: 17 | @ \ 18 | test -d libpsl || git clone --depth=1 https://github.com/rockdaboot/libpsl; \ 19 | cd libpsl; \ 20 | git pull; \ 21 | echo "EXTRA_DIST =" > gtk-doc.make; \ 22 | echo "CLEANFILES =" >> gtk-doc.make; \ 23 | autoreconf --install --force --symlink; 24 | 25 | # Test PSL data with libicu (IDNA2008 UTS#46) 26 | libpsl-libicu: libpsl-config 27 | cd libpsl && ./configure -q -C --enable-runtime=libicu --enable-builtin=libicu $(Options) && make -s clean && make -s check -j4 28 | 29 | # Test PSL data with libidn2 (IDNA2008) 30 | libpsl-libidn2: libpsl-config 31 | cd libpsl && ./configure -q -C --enable-runtime=libidn2 --enable-builtin=libidn2 $(Options) && make -s clean && make -s check -j4 32 | 33 | # Test PSL data with libidn (IDNA2003) 34 | libpsl-libidn: libpsl-config 35 | cd libpsl && ./configure -q -C --enable-runtime=libidn --enable-builtin=libidn $(Options) && make -s clean && make -s check -j4 -------------------------------------------------------------------------------- /.github/workflows/tld-update.yml:
-------------------------------------------------------------------------------- 1 | name: tld-update 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | # Run once a day at 15:00 UTC 6 | - cron: '0 15 * * *' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | psl-gtld-update: 13 | name: Check for TLD data updates 14 | runs-on: ubuntu-latest 15 | permissions: 16 | contents: write 17 | pull-requests: write 18 | steps: 19 | 20 | - name: Check out code 21 | uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0 25 | with: 26 | go-version: 'stable' 27 | 28 | - name: Run unit tests 29 | run: go test -C ./tools -v . 30 | 31 | - name: Set current date 32 | id: get-date 33 | run: echo "NOW=$(date +'%Y-%m-%dT%H:%M:%S %Z')" >> $GITHUB_OUTPUT 34 | 35 | - name: Run patchnewgtlds 36 | run: tools/patchnewgtlds 37 | 38 | - name: Create pull-request 39 | id: cpr 40 | uses: peter-evans/create-pull-request@38e0b6e68b4c852a5500a94740f0e535e0d7ba54 # v4.2.4 41 | with: 42 | commit-message: "util: gTLD data autopull updates for ${{ steps.get-date.outputs.NOW }}" 43 | title: "util: gTLD autopull updates for ${{ steps.get-date.outputs.now }}" 44 | body: "Public suffix list gTLD data updates from `tools/patchnewgtlds` for ${{ steps.get-date.outputs.now }}." 45 | committer: "GitHub " 46 | author: "GitHub " 47 | branch: psl-gtld-update 48 | labels: | 49 | ✅ autopull 50 | 🚩ICANN (IANA/ICP-3) Section 51 | delete-branch: true 52 | 53 | - name: Check outputs 54 | run: | 55 | echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" 56 | echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" 57 | -------------------------------------------------------------------------------- /tests/test_bug414122.js: -------------------------------------------------------------------------------- 1 | const PR_RDONLY = 0x1; 2 | 3 | var etld = Cc["@mozilla.org/network/effective-tld-service;1"] 4 | .getService(Ci.nsIEffectiveTLDService); 5 | var idn = Cc["@mozilla.org/network/idn-service;1"] 6 | .getService(Ci.nsIIDNService); 7 | 8 | function run_test() 9 | { 10 | var fis = Cc["@mozilla.org/network/file-input-stream;1"] 11 | .createInstance(Ci.nsIFileInputStream); 12 | fis.init(do_get_file("effective_tld_names.dat"), 13 | PR_RDONLY, 0444, Ci.nsIFileInputStream.CLOSE_ON_EOF); 14 | 15 | var lis = Cc["@mozilla.org/intl/converter-input-stream;1"] 16 | .createInstance(Ci.nsIConverterInputStream); 17 | lis.init(fis, "UTF-8", 1024, 0); 18 | lis.QueryInterface(Ci.nsIUnicharLineInputStream); 19 | 20 | var out = { value: "" }; 21 | do 22 | { 23 | var more = lis.readLine(out); 24 | var line = out.value; 25 | 26 | line = line.replace(/^\s+/, ""); 27 | var firstTwo = line.substring(0, 2); // a misnomer, but whatever 28 | if (firstTwo == "" || firstTwo == "//") 29 | continue; 30 | 31 | var space = line.search(/[ \t]/); 32 | line = line.substring(0, space == -1 ? line.length : space); 33 | 34 | if ("*." == firstTwo) 35 | { 36 | let rest = line.substring(2); 37 | checkPublicSuffix("foo.SUPER-SPECIAL-AWESOME-PREFIX." + rest, 38 | "SUPER-SPECIAL-AWESOME-PREFIX." + rest); 39 | } 40 | else if ("!" == line.charAt(0)) 41 | { 42 | checkPublicSuffix(line.substring(1), 43 | line.substring(line.indexOf(".") + 1)); 44 | } 45 | else 46 | { 47 | checkPublicSuffix("SUPER-SPECIAL-AWESOME-PREFIX." 
+ line, line); 48 | } 49 | } 50 | while (more); 51 | } 52 | 53 | function checkPublicSuffix(host, expectedSuffix) 54 | { 55 | expectedSuffix = idn.convertUTF8toACE(expectedSuffix).toLowerCase(); 56 | var actualSuffix = etld.getPublicSuffixFromHost(host); 57 | do_check_eq(actualSuffix, expectedSuffix); 58 | } 59 | -------------------------------------------------------------------------------- /tools/go.sum: -------------------------------------------------------------------------------- 1 | github.com/creachadair/command v0.1.13 h1:UDKPF3QYPRS/quZPVYZ7sW1JLxLLOgiyVSLQ+7wwI2o= 2 | github.com/creachadair/command v0.1.13/go.mod h1:YKwUE49nAi8qxLl8jCQ0GMPvwdxmIBkJW3LqxgZ7ljk= 3 | github.com/creachadair/flax v0.0.0-20240525192034-44db93b3a8ad h1:Fv6FRWgCJTHsslL0qRhhO7Jj7cL78YW8s1c8UxFGIIo= 4 | github.com/creachadair/flax v0.0.0-20240525192034-44db93b3a8ad/go.mod h1:K8bFvn8hMdAljQkaKNc7I3os5Wk36JxkyCkfdZ7S8d4= 5 | github.com/creachadair/mds v0.15.2 h1:es1qGKgRGSaztpvrSQcZ0B9I6NsHYJ1Sa9naD/3OfCM= 6 | github.com/creachadair/mds v0.15.2/go.mod h1:4vrFYUzTXMJpMBU+OA292I6IUxKWCCfZkgXg+/kBZMo= 7 | github.com/creachadair/taskgroup v0.9.0 h1:kzXSea5C7R5DtnKFBOTEW3hvmCkiVnRkODMVDMgSS6k= 8 | github.com/creachadair/taskgroup v0.9.0/go.mod h1:+1hJc8zL1rQkxcMVqEYJ0UPGtwl6Iz1+fd4zcOLtt+A= 9 | github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= 10 | github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= 11 | github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 12 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 13 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 14 | github.com/google/go-github/v63 v63.0.0 h1:13xwK/wk9alSokujB9lJkuzdmQuVn2QCPeck76wR3nE= 15 | github.com/google/go-github/v63 v63.0.0/go.mod h1:IqbcrgUmIcEaioWrGYei/09o+ge5vhffGOcxrO0AfmA= 16 | github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= 17 | github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= 18 | github.com/natefinch/atomic v1.0.1 h1:ZPYKxkqQOx3KZ+RsbnP/YsgvxWQPGxjC0oBt2AhwV0A= 19 | github.com/natefinch/atomic v1.0.1/go.mod h1:N/D/ELrljoqDyT3rZrsUmtsuzvHkeB/wWjHV22AZRbM= 20 | golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= 21 | golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= 22 | golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= 23 | golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= 24 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 25 | -------------------------------------------------------------------------------- /tests/tests.txt: -------------------------------------------------------------------------------- 1 | // Any copyright is dedicated to the Public Domain. 2 | // https://creativecommons.org/publicdomain/zero/1.0/ 3 | 4 | // null input. 5 | null null 6 | // Mixed case. 7 | COM null 8 | example.COM example.com 9 | WwW.example.COM example.com 10 | // Leading dot. 11 | .com null 12 | .example null 13 | .example.com null 14 | .example.example null 15 | // Unlisted TLD. 16 | example null 17 | example.example example.example 18 | b.example.example example.example 19 | a.b.example.example example.example 20 | // Listed, but non-Internet, TLD. 
21 | //local null 22 | //example.local null 23 | //b.example.local null 24 | //a.b.example.local null 25 | // TLD with only 1 rule. 26 | biz null 27 | domain.biz domain.biz 28 | b.domain.biz domain.biz 29 | a.b.domain.biz domain.biz 30 | // TLD with some 2-level rules. 31 | com null 32 | example.com example.com 33 | b.example.com example.com 34 | a.b.example.com example.com 35 | uk.com null 36 | example.uk.com example.uk.com 37 | b.example.uk.com example.uk.com 38 | a.b.example.uk.com example.uk.com 39 | test.ac test.ac 40 | // TLD with only 1 (wildcard) rule. 41 | mm null 42 | c.mm null 43 | b.c.mm b.c.mm 44 | a.b.c.mm b.c.mm 45 | // More complex TLD. 46 | jp null 47 | test.jp test.jp 48 | www.test.jp test.jp 49 | ac.jp null 50 | test.ac.jp test.ac.jp 51 | www.test.ac.jp test.ac.jp 52 | kyoto.jp null 53 | test.kyoto.jp test.kyoto.jp 54 | ide.kyoto.jp null 55 | b.ide.kyoto.jp b.ide.kyoto.jp 56 | a.b.ide.kyoto.jp b.ide.kyoto.jp 57 | c.kobe.jp null 58 | b.c.kobe.jp b.c.kobe.jp 59 | a.b.c.kobe.jp b.c.kobe.jp 60 | city.kobe.jp city.kobe.jp 61 | www.city.kobe.jp city.kobe.jp 62 | // TLD with a wildcard rule and exceptions. 63 | ck null 64 | test.ck null 65 | b.test.ck b.test.ck 66 | a.b.test.ck b.test.ck 67 | www.ck www.ck 68 | www.www.ck www.ck 69 | // US K12. 70 | us null 71 | test.us test.us 72 | www.test.us test.us 73 | ak.us null 74 | test.ak.us test.ak.us 75 | www.test.ak.us test.ak.us 76 | k12.ak.us null 77 | test.k12.ak.us test.k12.ak.us 78 | www.test.k12.ak.us test.k12.ak.us 79 | // IDN labels. 80 | 食狮.com.cn 食狮.com.cn 81 | 食狮.公司.cn 食狮.公司.cn 82 | www.食狮.公司.cn 食狮.公司.cn 83 | shishi.公司.cn shishi.公司.cn 84 | 公司.cn null 85 | 食狮.中国 食狮.中国 86 | www.食狮.中国 食狮.中国 87 | shishi.中国 shishi.中国 88 | 中国 null 89 | // Same as above, but punycoded. 90 | xn--85x722f.com.cn xn--85x722f.com.cn 91 | xn--85x722f.xn--55qx5d.cn xn--85x722f.xn--55qx5d.cn 92 | www.xn--85x722f.xn--55qx5d.cn xn--85x722f.xn--55qx5d.cn 93 | shishi.xn--55qx5d.cn shishi.xn--55qx5d.cn 94 | xn--55qx5d.cn null 95 | xn--85x722f.xn--fiqs8s xn--85x722f.xn--fiqs8s 96 | www.xn--85x722f.xn--fiqs8s xn--85x722f.xn--fiqs8s 97 | shishi.xn--fiqs8s shishi.xn--fiqs8s 98 | xn--fiqs8s null 99 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Submitting Amendments 2 | 3 | Before submitting any change to the list, please make sure to read the [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines). 4 | 5 | A properly formatted and validated patch will decrease the review time, and increase the chances your request will be reviewed and perhaps accepted. Any patch that doesn't follow the Guidelines will be rejected or, in the best scenario, left pending for follow-up. 
6 | 7 | The most common time loss comes from not following the sorting guidelines: 8 | - Sorting / Placement needs to comply with the [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines) 9 | - *PLEASE* order within the existing entries in the PRIVATE DOMAINS section so that the organization listed on your first comment line is alphabetically sorted 10 | - Do NOT append your PRIVATE DOMAINS entry to the end of the file 11 | - If there is more than one domain within your PR, order your entries alphabetically, ascending by TLD, then SLD, then 3LD and deeper (if present) 12 | 13 | Other common mistakes that may cause the request to be rejected include: 14 | 15 | - Invalid patch formatting, rule sorting or changeset position (see [Wiki:Formatting](https://github.com/publicsuffix/list/wiki/Format)) 16 | - Missing validation records 17 | - Lack of proper domain ownership, or expiry dates less than 2 years away 18 | - Attempts to work around vendor limits (see [#1245](https://github.com/publicsuffix/list/issues/1245) as an example) 19 | - Submissions with TLDs non-compliant with [ICP-3](https://www.icann.org/resources/pages/unique-authoritative-root-2012-02-25-en) or on the [ICANN PSL](https://github.com/publicsuffix/list/wiki/Security-Considerations#icann-public-suffix-list) 20 | - Insufficient or incomplete rationale (be verbose!) 21 | - Smaller, private projects with <2000 stakeholders 22 | 23 | Frequently, PR submissions overlook the sort ordering guidelines, which delays the processing of all open requests. 24 | 25 | Make sure to review the [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines) before you open a new pull request. 26 | 27 | Please also note that there is no guarantee of inclusion, nor are we able to provide an ETA for any inclusion request. This is also true of projects that incorporate the PSL downstream. This is described, outlined and diagrammed [here]( 28 | https://github.com/publicsuffix/list/wiki/Guidelines#appropriate-expectations-on-derivative-propagation-use-or-inclusion). 29 | 30 | Before you attempt to make a contribution or comment, please read the [Mozilla Community Participation Guidelines](https://www.mozilla.org/en-US/about/governance/policies/participation/). 31 | 32 | ## PSL Mailing List 33 | 34 | We suggest that submitters and users/integrators of the PSL join the (low-traffic) mailing list to stay aware of changes to structure, processes or formatting. 35 | 36 | Some future changes may include automated DNS tests for the presence of `_PSL` records on `#PRIVATE` section entries (to confirm ongoing inclusion, or to remove entries that do not have them), as well as possible file structure or other changes.
The "list list" is located [HERE](https://groups.google.com/g/publicsuffix-discuss) 37 | -------------------------------------------------------------------------------- /tools/private_domains_checker/TestPSLPrivateDomainsProcessor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import uuid 3 | 4 | from PSLPrivateDomainsProcessor import PSLPrivateDomainsProcessor, check_dns_status, get_whois_data, check_psl_txt_record 5 | 6 | 7 | class TestPSLPrivateDomainsProcessor(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.processor = PSLPrivateDomainsProcessor() 11 | # Populate icann_domains for testing 12 | self.processor.icann_domains = { 13 | "com", "co.uk", "ac.uk", "net", "org" 14 | } 15 | 16 | def test_parse_domain_icann_domain(self): 17 | # Test cases where domains should be parsed correctly 18 | test_cases = [ 19 | ("*.example.com", "example.com"), 20 | ("sub.example.com", "example.com"), 21 | ("*.sub.example.com", "example.com"), 22 | ("example.com", "example.com"), 23 | ("example.co.uk", "example.co.uk"), 24 | ("sub.example.co.uk", "example.co.uk"), 25 | ("*.example.co.uk", "example.co.uk"), 26 | ("*.sub.example.co.uk", "example.co.uk"), 27 | ("abc.ac.uk", "abc.ac.uk"), 28 | ("a.b.com", "b.com") 29 | ] 30 | 31 | for domain, expected in test_cases: 32 | with self.subTest(domain=domain): 33 | result = self.processor.parse_domain(domain) 34 | self.assertEqual(expected, result) 35 | 36 | def test_parse_domain_no_icann(self): 37 | # Test case where no valid ICANN domain is found 38 | self.processor.icann_domains.remove("com") 39 | with self.assertRaises(ValueError): 40 | self.processor.parse_domain("example.com") 41 | 42 | def test_parse_domain_edge_cases(self): 43 | # Additional edge case testing 44 | self.assertEqual("example.org", self.processor.parse_domain("sub.example.org")) 45 | self.assertEqual("example.com", self.processor.parse_domain("example.com")) 46 | self.assertEqual("example.ac.uk", self.processor.parse_domain("sub.example.ac.uk")) 47 | 48 | def test_parse_domain_invalid(self): 49 | # Test invalid domains which should raise ValueError 50 | invalid_domains = ["invalid.test", "*.invalid.test", "sub.invalid.test"] 51 | for domain in invalid_domains: 52 | with self.subTest(domain=domain): 53 | with self.assertRaises(ValueError): 54 | self.processor.parse_domain(domain) 55 | 56 | def test_check_dns_status(self): 57 | # Test with a known good domain 58 | self.assertEqual("ok", check_dns_status("mozilla.org")) 59 | # Test with a likely non-existent domain 60 | random_domain = "nxdomain-" + str(uuid.uuid4()) + ".edu" 61 | self.assertEqual("NXDOMAIN", check_dns_status(random_domain)) 62 | 63 | def test_check_psl_txt_record(self): 64 | # Test with a known domain having a valid _psl TXT record 65 | self.assertEqual("valid", check_psl_txt_record("cdn.cloudflare.net")) 66 | # Test with a domain without a _psl TXT record 67 | random_domain = "invalid-" + str(uuid.uuid4()) + ".edu" 68 | self.assertEqual("invalid", check_psl_txt_record(random_domain)) 69 | 70 | def test_get_whois_data(self): 71 | whois_data = get_whois_data("example.com") 72 | self.assertEqual("ok", whois_data[2]) 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /tools/internal/parser/file_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "testing" 5 | 6 | 
"github.com/publicsuffix/list/tools/internal/domain" 7 | ) 8 | 9 | func TestPublicSuffix(t *testing.T) { 10 | lst := list( 11 | section(1, 1, "PRIVATE DOMAINS", 12 | suffixes(1, 1, noInfo, 13 | suffix(1, "example.com"), 14 | wildcard(2, 3, "baz.net", "except", "other"), 15 | suffix(4, "com"), 16 | 17 | // Wildcards and exceptions nested inside each 18 | // other. This doesn't appear in the PSL in practice, 19 | // and is implicitly forbidden by the format spec, but 20 | // the parser/validator does not currently reject such 21 | // files, so we want PublicSuffix/RegisteredDomain to 22 | // be well-defined for such inputs. 23 | wildcard(5, 6, "nested.org", "except"), 24 | wildcard(7, 8, "in.except.nested.org", "other-except"), 25 | ), 26 | ), 27 | ) 28 | 29 | tests := []struct { 30 | in string 31 | pubSuffix string 32 | regDomain string 33 | }{ 34 | {"www.example.com", "example.com", "www.example.com"}, 35 | {"www.public.example.com", "example.com", "public.example.com"}, 36 | {"example.com", "example.com", ""}, 37 | 38 | {"www.other.com", "com", "other.com"}, 39 | {"other.com", "com", "other.com"}, 40 | {"com", "com", ""}, 41 | 42 | {"qux.bar.baz.net", "bar.baz.net", "qux.bar.baz.net"}, 43 | {"bar.baz.net", "bar.baz.net", ""}, 44 | {"baz.net", "net", "baz.net"}, // Implicit * rule 45 | {"qux.except.baz.net", "baz.net", "except.baz.net"}, 46 | {"except.baz.net", "baz.net", "except.baz.net"}, 47 | {"other.other.baz.net", "baz.net", "other.baz.net"}, 48 | 49 | // Tests for nested wildcards+exceptions. Does not appear in 50 | // the real PSL, and implicitly disallowed by the format spec, 51 | // but necessary to make PublicSuffix and RegisteredDomain's 52 | // outputs well defined for all inputs. 53 | {"qux.bar.foo.nested.org", "foo.nested.org", "bar.foo.nested.org"}, 54 | {"bar.foo.nested.org", "foo.nested.org", "bar.foo.nested.org"}, 55 | {"foo.nested.org", "foo.nested.org", ""}, 56 | {"nested.org", "org", "nested.org"}, 57 | {"bar.except.nested.org", "nested.org", "except.nested.org"}, 58 | {"except.nested.org", "nested.org", "except.nested.org"}, 59 | {"in.except.nested.org", "nested.org", "except.nested.org"}, 60 | // Matches both nested wildcard and also outer exception, 61 | // outer exception wins. 62 | {"other.in.except.nested.org", "nested.org", "except.nested.org"}, 63 | // Matches both outer and inner exceptions, inner exception 64 | // wins. 
65 | {"qux.other-except.in.except.nested.org", "in.except.nested.org", "other-except.in.except.nested.org"}, 66 | } 67 | 68 | for _, tc := range tests { 69 | in := mustParseDomain(tc.in) 70 | wantSuffix := mustParseDomain(tc.pubSuffix) 71 | 72 | gotSuffix := lst.PublicSuffix(in) 73 | if !gotSuffix.Equal(wantSuffix) { 74 | t.Errorf("PublicSuffix(%q) = %q, want %q", in, gotSuffix, wantSuffix) 75 | } 76 | 77 | gotReg, ok := lst.RegisteredDomain(in) 78 | if ok && tc.regDomain == "" { 79 | t.Errorf("RegisteredDomain(%q) = %q, want none", in, gotReg) 80 | } else if ok { 81 | wantReg := mustParseDomain(tc.regDomain) 82 | if !gotReg.Equal(wantReg) { 83 | t.Errorf("RegisteredDomain(%q) = %q, want %q", in, gotReg, wantReg) 84 | } 85 | } 86 | } 87 | } 88 | 89 | func mustParseDomain(s string) domain.Name { 90 | d, err := domain.Parse(s) 91 | if err != nil { 92 | panic(err) 93 | } 94 | return d 95 | } 96 | -------------------------------------------------------------------------------- /tests/prepare_tlds.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | import codecs 6 | import encodings.idna 7 | import re 8 | import sys 9 | 10 | """ 11 | Processes a file containing effective TLD data. See the following URL for a 12 | description of effective TLDs and of the file format that this script 13 | processes (although for the latter you're better off just reading this file's 14 | short source code). 15 | 16 | https://wiki.mozilla.org/Gecko:Effective_TLD_Service 17 | """ 18 | 19 | def getEffectiveTLDs(path): 20 | file = codecs.open(path, "r", "UTF-8") 21 | domains = set() 22 | while True: 23 | line = file.readline() 24 | # line always contains a line terminator unless the file is empty 25 | if len(line) == 0: 26 | raise StopIteration 27 | line = line.rstrip() 28 | # comment, empty, or superfluous line for explicitness purposes 29 | if line.startswith("//") or "." not in line: 30 | continue 31 | line = re.split(r"[ \t\n]", line, 1)[0] 32 | entry = EffectiveTLDEntry(line) 33 | domain = entry.domain() 34 | assert domain not in domains, \ 35 | "repeating domain %s makes no sense" % domain 36 | domains.add(domain) 37 | yield entry 38 | 39 | def _normalizeHostname(domain): 40 | """ 41 | Normalizes the given domain, component by component. ASCII components are 42 | lowercased, while non-ASCII components are processed using the ToASCII 43 | algorithm. 44 | """ 45 | def convertLabel(label): 46 | if _isASCII(label): 47 | return label.lower() 48 | return encodings.idna.ToASCII(label) 49 | return ".".join(map(convertLabel, domain.split("."))) 50 | 51 | def _isASCII(s): 52 | "True if s consists entirely of ASCII characters, false otherwise." 53 | for c in s: 54 | if ord(c) > 127: 55 | return False 56 | return True 57 | 58 | class EffectiveTLDEntry: 59 | """ 60 | Stores an entry in an effective-TLD name file. 61 | """ 62 | 63 | _exception = False 64 | _wild = False 65 | 66 | def __init__(self, line): 67 | """ 68 | Creates a TLD entry from a line of data, which must have been stripped of 69 | the line ending. 
70 | """ 71 | if line.startswith("!"): 72 | self._exception = True 73 | domain = line[1:] 74 | elif line.startswith("*."): 75 | self._wild = True 76 | domain = line[2:] 77 | else: 78 | domain = line 79 | self._domain = _normalizeHostname(domain) 80 | 81 | def domain(self): 82 | "The domain this represents." 83 | return self._domain 84 | 85 | def exception(self): 86 | "True if this entry's domain denotes does not denote an effective TLD." 87 | return self._exception 88 | 89 | def wild(self): 90 | "True if this entry represents a class of effective TLDs." 91 | return self._wild 92 | 93 | 94 | ################# 95 | # DO EVERYTHING # 96 | ################# 97 | 98 | def main(output, effective_tld_filename): 99 | """ 100 | effective_tld_filename is the effective TLD file to parse. 101 | A C++ array of { domain, exception, wild } entries representing the 102 | eTLD file is then printed to output. 103 | """ 104 | 105 | def boolStr(b): 106 | if b: 107 | return "true" 108 | return "false" 109 | 110 | for etld in getEffectiveTLDs(effective_tld_filename): 111 | exception = boolStr(etld.exception()) 112 | wild = boolStr(etld.wild()) 113 | output.write('ETLD_ENTRY("%s", %s, %s)\n' % (etld.domain(), exception, wild)) 114 | 115 | if __name__ == '__main__': 116 | main(sys.stdout, sys.argv[1]) 117 | -------------------------------------------------------------------------------- /tools/internal/parser/write.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strconv" 8 | "strings" 9 | ) 10 | 11 | // MarshalPSL returns the list serialized to standard PSL text format. 12 | func (l *List) MarshalPSL() []byte { 13 | var ret bytes.Buffer 14 | writeBlockPSL(&ret, l) 15 | return ret.Bytes() 16 | } 17 | 18 | func writeBlockPSL(w io.Writer, b Block) { 19 | f := func(msg string, args ...any) { 20 | fmt.Fprintf(w, msg+"\n", args...) 21 | } 22 | 23 | switch v := b.(type) { 24 | case *List: 25 | for i, child := range v.Blocks { 26 | if i > 0 { 27 | f("") 28 | } 29 | writeBlockPSL(w, child) 30 | } 31 | case *Section: 32 | f("// ===BEGIN %s===", v.Name) 33 | for _, child := range v.Blocks { 34 | f("") 35 | writeBlockPSL(w, child) 36 | } 37 | f("") 38 | f("// ===END %s===", v.Name) 39 | case *Suffixes: 40 | for _, child := range v.Blocks { 41 | writeBlockPSL(w, child) 42 | } 43 | case *Suffix: 44 | f("%s", v.Domain) 45 | case *Wildcard: 46 | base := v.Domain 47 | f("*.%s", base) 48 | for _, exc := range v.Exceptions { 49 | f("!%s.%s", exc, base) 50 | } 51 | case *Comment: 52 | for _, line := range v.Text { 53 | f("// %s", line) 54 | } 55 | default: 56 | panic("unknown ast node") 57 | } 58 | } 59 | 60 | // MarhsalDebug returns the list serialized to a verbose debugging 61 | // format. This format is private to this package and for development 62 | // use only. The format may change drastically without notice. 63 | func (l *List) MarshalDebug() []byte { 64 | var ret bytes.Buffer 65 | writeBlockDebug(&ret, l, "") 66 | return ret.Bytes() 67 | } 68 | 69 | func writeBlockDebug(w io.Writer, b Block, indent string) { 70 | changemark := "" 71 | if b.Changed() { 72 | changemark = "!!" 73 | } 74 | f := func(msg string, args ...any) { 75 | fmt.Fprintf(w, indent+msg+"\n", args...) 
76 | } 77 | 78 | src := b.SrcRange() 79 | loc := fmt.Sprintf("%d-%d", src.FirstLine, src.LastLine) 80 | if src.FirstLine+1 == src.LastLine { 81 | loc = strconv.Itoa(src.FirstLine) 82 | } 83 | 84 | const extraIndent = " " 85 | nextIndent := indent + extraIndent 86 | 87 | switch v := b.(type) { 88 | case *List: 89 | f("%sList(%s) {", changemark, loc) 90 | for _, child := range v.Blocks { 91 | writeBlockDebug(w, child, nextIndent) 92 | } 93 | f("} // List") 94 | case *Section: 95 | f("%sSection(%s, name=%q) {", changemark, loc, v.Name) 96 | for _, child := range v.Blocks { 97 | writeBlockDebug(w, child, nextIndent) 98 | } 99 | f("} // Section(name=%q)", v.Name) 100 | case *Suffixes: 101 | items := []string{loc, fmt.Sprintf("editable=%v", v.Info.MachineEditable)} 102 | if v.Info.Name != "" { 103 | items = append(items, fmt.Sprintf("name=%q", v.Info.Name)) 104 | } 105 | for _, u := range v.Info.URLs { 106 | items = append(items, fmt.Sprintf("url=%q", u)) 107 | } 108 | for _, e := range v.Info.Maintainers { 109 | email := strings.TrimSpace(fmt.Sprintf("%s <%s>", e.Name, e.Address)) 110 | items = append(items, fmt.Sprintf("contact=%q", email)) 111 | } 112 | for _, o := range v.Info.Other { 113 | items = append(items, fmt.Sprintf("other=%q", o)) 114 | } 115 | 116 | const open = "SuffixBlock(" 117 | pad := strings.Repeat(" ", len(open)) 118 | f("%s%s%s) {", changemark, open, strings.Join(items, fmt.Sprintf(",\n%s%s", indent, pad))) 119 | for _, child := range v.Blocks { 120 | writeBlockDebug(w, child, nextIndent) 121 | } 122 | f("} // SuffixBlock(name=%q)", v.Info.Name) 123 | case *Suffix: 124 | f("%sSuffix(%s, %q)", changemark, loc, v.Domain) 125 | case *Wildcard: 126 | w := fmt.Sprintf("*.%s", v.Domain) 127 | if len(v.Exceptions) > 0 { 128 | f("%sWildcard(%s, %q, except=%v)", changemark, loc, w, v.Exceptions) 129 | } else { 130 | f("%sWildcard(%s, %q)", changemark, loc, w) 131 | } 132 | case *Comment: 133 | f("%sComment(%s) {", changemark, loc) 134 | for _, line := range v.Text { 135 | f("%s%s", extraIndent, line) 136 | } 137 | f("}") 138 | default: 139 | panic("unknown ast node") 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /tools/internal/parser/unicode.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "bytes" 5 | "sync" 6 | 7 | "golang.org/x/text/collate" 8 | "golang.org/x/text/language" 9 | ) 10 | 11 | // How do you sort strings? The answer is surprisingly complex. 12 | // 13 | // "Collation" is the technical term for putting things in a specific 14 | // order. For strings of human text, there is no universal agreement 15 | // on what order is "correct". 16 | // 17 | // Different languages have different sorting conventions: in English 18 | // Ä is an accented A and comes before B, but in Swedish Ä is the 28th 19 | // letter of the alphabet and comes after Z. 20 | // 21 | // A single language also sorts differently sometimes: a phonebook 22 | // written in the German language is in a slightly different order in 23 | // Germany vs. Austria. Or even within a single country: in Germany, a 24 | // list of names can be in "standard" order, or it can be in 25 | // "phonebook" order, with different choices for ä, ö and ü. 26 | // 27 | // Finally, there are style choices available that are considered 28 | // equally valid, depending on the application. 
A common example is 29 | // "numeric sort", which orders numbers inside strings according to 30 | // mathematics: "3" > "24" in "standard" lexicographic order, but if a 31 | // collation uses numeric sort, "3" < "24". 32 | // 33 | // Whitespace and punctuation are another example of a style choice: 34 | // in some applications they participate in the ordering, and in 35 | // others they are ignored and only "real" letters determine the 36 | // order. 37 | // 38 | // Fortunately, the Unicode Consortium has simplified all this for us: 39 | // there is a single universal Unicode Collation Algorithm 40 | // (http://www.unicode.org/reports/tr10/) that handles all of this 41 | // complexity. We just have to tell it which 42 | // language/dialect/country/style we want to use, and now we can 43 | // compare strings. 44 | // 45 | // For non-suffix text, the PSL uses the "basic" English 46 | // collation. Specifically, we use the collation defined in the 47 | // Unicode CLDR (Common Locale Data Repository, 48 | // https://cldr.unicode.org/), described by the BCP 47 language tag 49 | // "en": "global" English, with no country or dialect modifications, 50 | // and "default" style choices for English: ordering is 51 | // case-sensitive, whitespace-sensitive and punctuation-sensitive, and 52 | // numbers are compared in lexicographic order, not numeric order. 53 | 54 | // compareCommentText compares the strings of comment text a and b, 55 | // using the PSL's chosen collation. It returns -1 if a < b, +1 if a > 56 | // b, or 0 if a == b. 57 | // 58 | // This function MUST NOT be used to compare domain name or DNS label 59 | // strings. For that, use domain.Name.Compare or domain.Label.Compare. 60 | func compareCommentText(a string, b string) int { 61 | // golang.org/x/text/collate has a few bugs, and in particular the 62 | // "CompareString" method uses a special "incremental collation" 63 | // codepath that sometimes returns incorrect results (see 64 | // https://github.com/golang/go/issues/68166). 65 | // 66 | // To be safe, we instead use the "slower" (still pretty fast) 67 | // codepath: we explicitly convert the strings into the 68 | // corresponding "sort keys", and then bytes.Compare those. There 69 | // are more exhaustive tests for sort key computation, so there is 70 | // higher confidence that it works correctly. 71 | // 72 | // Unfortunately individual collators are also not safe for 73 | // concurrent use. Wrap them in a global mutex. We could also 74 | // construct a new collator for each use, but that ends up being 75 | // more expensive and less performant than sharing one collator 76 | // with a mutex. 77 | commentCollatorMu.Lock() 78 | defer commentCollatorMu.Unlock() 79 | var buf collate.Buffer 80 | ka := commentCollator.KeyFromString(&buf, a) 81 | kb := commentCollator.KeyFromString(&buf, b) 82 | return bytes.Compare(ka, kb) 83 | } 84 | 85 | // commentCollator compares strings in the PSL's chosen collation for 86 | // non-suffix text. See the comment at the start of this file for more 87 | // details.
88 | var commentCollator = collate.New(language.MustParse("en")) 89 | var commentCollatorMu sync.Mutex 90 | -------------------------------------------------------------------------------- /tools/internal/parser/write_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | ) 7 | 8 | func TestMarshalPSL(t *testing.T) { 9 | tests := []struct { 10 | name string 11 | in *List 12 | want []byte 13 | }{ 14 | { 15 | name: "empty", 16 | in: list(), 17 | want: byteLines(""), 18 | }, 19 | 20 | { 21 | name: "comments_and_empty_sections", 22 | in: list( 23 | comment(0, "This is a two", "line comment"), 24 | comment(0, "Another separate comment"), 25 | section(0, 0, "ICANN DOMAINS", 26 | comment(0, "Inside icann domains"), 27 | ), 28 | comment(0, "Between sections"), 29 | section(0, 0, "PRIVATE DOMAINS", 30 | comment(0, "Private domains here"), 31 | comment(0, "More private domains"), 32 | ), 33 | ), 34 | want: byteLines( 35 | "// This is a two", 36 | "// line comment", 37 | "", 38 | "// Another separate comment", 39 | "", 40 | "// ===BEGIN ICANN DOMAINS===", 41 | "", 42 | "// Inside icann domains", 43 | "", 44 | "// ===END ICANN DOMAINS===", 45 | "", 46 | "// Between sections", 47 | "", 48 | "// ===BEGIN PRIVATE DOMAINS===", 49 | "", 50 | "// Private domains here", 51 | "", 52 | "// More private domains", 53 | "", 54 | "// ===END PRIVATE DOMAINS===", 55 | "", 56 | ), 57 | }, 58 | 59 | { 60 | name: "some_suffixes", 61 | in: list( 62 | comment(1, "Test list"), 63 | section(2, 2, "ICANN DOMAINS", 64 | suffixes(1, 1, noInfo, 65 | suffix(1, "aaa"), 66 | suffix(2, "bbb"), 67 | wildcard(3, 3, "ccc", "d", "e", "f"), 68 | ), 69 | suffixes(2, 2, noInfo, 70 | suffix(1, "xxx"), 71 | suffix(2, "yyy"), 72 | suffix(3, "zzz"), 73 | ), 74 | ), 75 | ), 76 | want: byteLines( 77 | "// Test list", 78 | "", 79 | "// ===BEGIN ICANN DOMAINS===", 80 | "", 81 | "aaa", 82 | "bbb", 83 | "*.ccc", 84 | "!d.ccc", 85 | "!e.ccc", 86 | "!f.ccc", 87 | "", 88 | "xxx", 89 | "yyy", 90 | "zzz", 91 | "", 92 | "// ===END ICANN DOMAINS===", 93 | "", 94 | ), 95 | }, 96 | } 97 | 98 | for _, tc := range tests { 99 | t.Run(tc.name, func(t *testing.T) { 100 | got := tc.in.MarshalPSL() 101 | checkDiff(t, "MarshalPSL output", got, tc.want) 102 | 103 | // Does the marshaled output parse? 104 | in2, errs := Parse(got) 105 | if len(errs) > 0 { 106 | t.Logf("failed to parse MarshalPSL output:") 107 | for _, err := range errs { 108 | t.Error(err) 109 | } 110 | t.FailNow() 111 | } 112 | 113 | // Parse result should be identical to the original, 114 | // modulo source ranges.
115 | zeroSourceRange(tc.in) 116 | zeroSourceRange(in2) 117 | checkDiff(t, "MarshalPSL then Parse", in2, tc.in) 118 | if t.Failed() { 119 | t.FailNow() 120 | } 121 | }) 122 | } 123 | } 124 | 125 | func TestRoundtripRealPSL(t *testing.T) { 126 | bs, err := os.ReadFile("../../../public_suffix_list.dat") 127 | if err != nil { 128 | t.Fatal(err) 129 | } 130 | 131 | psl, errs := Parse(bs) 132 | if len(errs) > 0 { 133 | t.Logf("PSL parse failed, skipping round-trip test:") 134 | for _, err := range errs { 135 | t.Error(err) 136 | } 137 | t.FailNow() 138 | } 139 | 140 | suffixCnt1 := len(BlocksOfType[*Suffix](psl)) 141 | wildCnt1 := len(BlocksOfType[*Wildcard](psl)) 142 | if got, wantMin := suffixCnt1, 1000; got < wantMin { 143 | t.Fatalf("PSL doesn't have enough suffixes, got %d want at least %d", got, wantMin) 144 | } 145 | if got, wantMin := wildCnt1, 2; got < wantMin { 146 | t.Fatalf("PSL doesn't have enough wildcards, got %d want at least %d", got, wantMin) 147 | } 148 | 149 | bs2 := psl.MarshalPSL() 150 | psl2, errs := Parse(bs2) 151 | if len(errs) > 0 { 152 | t.Logf("PSL parse after MarshalPSL failed:") 153 | for _, err := range errs { 154 | t.Error(err) 155 | } 156 | t.FailNow() 157 | } 158 | 159 | suffixCnt2 := len(BlocksOfType[*Suffix](psl2)) 160 | wildCnt2 := len(BlocksOfType[*Wildcard](psl2)) 161 | if got, want := suffixCnt2, suffixCnt1; got != want { 162 | t.Errorf("MarshalPSL changed suffix count, got %d want %d", got, want) 163 | } 164 | if got, want := wildCnt2, wildCnt1; got != want { 165 | t.Errorf("MarshalPSL changed wildcard count, got %d want %d", got, want) 166 | } 167 | 168 | zeroSourceRange(psl) 169 | zeroSourceRange(psl2) 170 | checkDiff(t, "PSL roundtrip through MarshalPSL", psl2, psl) 171 | } 172 | -------------------------------------------------------------------------------- /tests/test_psl.txt: -------------------------------------------------------------------------------- 1 | // Any copyright is dedicated to the Public Domain. 2 | // https://creativecommons.org/publicdomain/zero/1.0/ 3 | 4 | // null input. 5 | checkPublicSuffix(null, null); 6 | // Mixed case. 7 | checkPublicSuffix('COM', null); 8 | checkPublicSuffix('example.COM', 'example.com'); 9 | checkPublicSuffix('WwW.example.COM', 'example.com'); 10 | // Leading dot. 11 | checkPublicSuffix('.com', null); 12 | checkPublicSuffix('.example', null); 13 | checkPublicSuffix('.example.com', null); 14 | checkPublicSuffix('.example.example', null); 15 | // Unlisted TLD. 16 | checkPublicSuffix('example', null); 17 | checkPublicSuffix('example.example', 'example.example'); 18 | checkPublicSuffix('b.example.example', 'example.example'); 19 | checkPublicSuffix('a.b.example.example', 'example.example'); 20 | // Listed, but non-Internet, TLD. 21 | //checkPublicSuffix('local', null); 22 | //checkPublicSuffix('example.local', null); 23 | //checkPublicSuffix('b.example.local', null); 24 | //checkPublicSuffix('a.b.example.local', null); 25 | // TLD with only 1 rule. 26 | checkPublicSuffix('biz', null); 27 | checkPublicSuffix('domain.biz', 'domain.biz'); 28 | checkPublicSuffix('b.domain.biz', 'domain.biz'); 29 | checkPublicSuffix('a.b.domain.biz', 'domain.biz'); 30 | // TLD with some 2-level rules. 
31 | checkPublicSuffix('com', null); 32 | checkPublicSuffix('example.com', 'example.com'); 33 | checkPublicSuffix('b.example.com', 'example.com'); 34 | checkPublicSuffix('a.b.example.com', 'example.com'); 35 | checkPublicSuffix('uk.com', null); 36 | checkPublicSuffix('example.uk.com', 'example.uk.com'); 37 | checkPublicSuffix('b.example.uk.com', 'example.uk.com'); 38 | checkPublicSuffix('a.b.example.uk.com', 'example.uk.com'); 39 | checkPublicSuffix('test.ac', 'test.ac'); 40 | // TLD with only 1 (wildcard) rule. 41 | checkPublicSuffix('mm', null); 42 | checkPublicSuffix('c.mm', null); 43 | checkPublicSuffix('b.c.mm', 'b.c.mm'); 44 | checkPublicSuffix('a.b.c.mm', 'b.c.mm'); 45 | // More complex TLD. 46 | checkPublicSuffix('jp', null); 47 | checkPublicSuffix('test.jp', 'test.jp'); 48 | checkPublicSuffix('www.test.jp', 'test.jp'); 49 | checkPublicSuffix('ac.jp', null); 50 | checkPublicSuffix('test.ac.jp', 'test.ac.jp'); 51 | checkPublicSuffix('www.test.ac.jp', 'test.ac.jp'); 52 | checkPublicSuffix('kyoto.jp', null); 53 | checkPublicSuffix('test.kyoto.jp', 'test.kyoto.jp'); 54 | checkPublicSuffix('ide.kyoto.jp', null); 55 | checkPublicSuffix('b.ide.kyoto.jp', 'b.ide.kyoto.jp'); 56 | checkPublicSuffix('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp'); 57 | checkPublicSuffix('c.kobe.jp', null); 58 | checkPublicSuffix('b.c.kobe.jp', 'b.c.kobe.jp'); 59 | checkPublicSuffix('a.b.c.kobe.jp', 'b.c.kobe.jp'); 60 | checkPublicSuffix('city.kobe.jp', 'city.kobe.jp'); 61 | checkPublicSuffix('www.city.kobe.jp', 'city.kobe.jp'); 62 | // TLD with a wildcard rule and exceptions. 63 | checkPublicSuffix('ck', null); 64 | checkPublicSuffix('test.ck', null); 65 | checkPublicSuffix('b.test.ck', 'b.test.ck'); 66 | checkPublicSuffix('a.b.test.ck', 'b.test.ck'); 67 | checkPublicSuffix('www.ck', 'www.ck'); 68 | checkPublicSuffix('www.www.ck', 'www.ck'); 69 | // US K12. 70 | checkPublicSuffix('us', null); 71 | checkPublicSuffix('test.us', 'test.us'); 72 | checkPublicSuffix('www.test.us', 'test.us'); 73 | checkPublicSuffix('ak.us', null); 74 | checkPublicSuffix('test.ak.us', 'test.ak.us'); 75 | checkPublicSuffix('www.test.ak.us', 'test.ak.us'); 76 | checkPublicSuffix('k12.ak.us', null); 77 | checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us'); 78 | checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us'); 79 | // IDN labels. 80 | checkPublicSuffix('食狮.com.cn', '食狮.com.cn'); 81 | checkPublicSuffix('食狮.公司.cn', '食狮.公司.cn'); 82 | checkPublicSuffix('www.食狮.公司.cn', '食狮.公司.cn'); 83 | checkPublicSuffix('shishi.公司.cn', 'shishi.公司.cn'); 84 | checkPublicSuffix('公司.cn', null); 85 | checkPublicSuffix('食狮.中国', '食狮.中国'); 86 | checkPublicSuffix('www.食狮.中国', '食狮.中国'); 87 | checkPublicSuffix('shishi.中国', 'shishi.中国'); 88 | checkPublicSuffix('中国', null); 89 | // Same as above, but punycoded. 
90 | checkPublicSuffix('xn--85x722f.com.cn', 'xn--85x722f.com.cn'); 91 | checkPublicSuffix('xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); 92 | checkPublicSuffix('www.xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); 93 | checkPublicSuffix('shishi.xn--55qx5d.cn', 'shishi.xn--55qx5d.cn'); 94 | checkPublicSuffix('xn--55qx5d.cn', null); 95 | checkPublicSuffix('xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); 96 | checkPublicSuffix('www.xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); 97 | checkPublicSuffix('shishi.xn--fiqs8s', 'shishi.xn--fiqs8s'); 98 | checkPublicSuffix('xn--fiqs8s', null); 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Public Suffix List 2 | 3 | A "public suffix" is one under which Internet users can (or historically could) 4 | directly register names. Some examples of public suffixes are `com`, `co.uk` and 5 | `pvt.k12.ma.us`. The Public Suffix List is a list of all known public suffixes. 6 | 7 | See https://publicsuffix.org/ and the [Wiki](https://github.com/publicsuffix/list/wiki) link above for more information. 8 | 9 | ## Are you here to add or update something? 10 | 11 | All submissions must conform to the [validation and acceptance factors](https://github.com/publicsuffix/list/wiki/Guidelines#validation-and-non-acceptance-factors) and provide sufficient rationale or basically be as complete as possible, and follow the [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines), especially as they relate to format and [sorting](https://github.com/publicsuffix/list/wiki/Guidelines#sort-your-submission-correctly-important). 12 | 13 | The list is currently maintained by people who are volunteering their time towards universal acceptance and ensuring there is a bridge between the ICANN world of domain names and the crucial last mile - the world of developers and human users. 14 | 15 | Iteration back and forth will delay PR review or inclusion. Be extremely thorough, and patient. 16 | 17 | ## Important Notices 18 | 19 | ### 2025-05-27 20 | Were you directed here to be able to add a subdomain to your **Cloudflare** account? If so, please work directly with Cloudflare for these account limitations. The PSL is **NOT** intended as a workaround for Cloudflare's subdomain restrictions. 21 | 22 | Consult [Cloudflare's subdomain setup documentation](https://developers.cloudflare.com/dns/zone-setups/subdomain-setup/) or contact Cloudflare directly for subdomain setup questions. Only submit a request to the PSL if your domain truly meets our criteria outlined in [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines). 23 | 24 | ### 2024-07-26 25 | We are sending emails asking for confirmation if certain entries are still required or need updating. 26 | 27 | Currently, this process is purely manual and extremely low volume but if you do get an email, please respond. 28 | 29 | Please see the [Email Communication Policy](#email-communication-policy) to see how we will often communicate these changes. 30 | 31 | ### 2023-02-20 32 | Did [guidance from Google related to the changes that they are making to adsense subdomains](https://support.google.com/adsense/answer/12170421) bring you here? Work with Google Adsense [Help Link](https://support.google.com/adsense/gethelp) with any support questions you have. 
The PSL is thinly resourced, and the volunteer maintainers are unable to answer questions about Adsense changes or support Adsense. 33 | 34 | The PSL is volunteer-resourced and is absolutely not resourced to answer questions or support changes. Guidance is in the form of self-help (READ THE [WIKI](https://github.com/publicsuffix/list/wiki)), THERE IS NO PSL CUSTOMER SERVICE RESOURCE TO ASSIST YOU. *Please work directly with Google to ensure your domain does in fact need an entry, and they should help you know what the benefits and consequences are. __IT IS POSSIBLE TO HARM YOUR WEBSITE'S COOKIES BY REQUESTING A MALFORMED PSL ENTRY__. Also, understand what propagation delays and rollback processing entail before making requests.* 35 | 36 | ### 2021-04-23 37 | Did guidance related to an issue with Facebook or Apple bring you here? [Read this before submitting requests](https://github.com/publicsuffix/list/issues/1245). We are not approving workaround requests per the validation and acceptance standards, but we do have an open discussion with Facebook on the matter. 38 | 39 | ## Email Communication Policy 40 | 41 | We tend to use the subject line tag "[PSL notification]" in all Public Suffix List communications. For effective spam filtering, you can create a case-insensitive filter that allows only emails with the exact tag "[PSL notification]" in the subject line. If you choose to set up such a filter in your email application, please verify the filter is implemented correctly and test it thoroughly to ensure you don't accidentally miss important communications from us. 42 | 43 | ## Code of Conduct 44 | 45 | Your participation in the Public Suffix List project should follow the [Mozilla Community Participation Guidelines](https://www.mozilla.org/en-US/about/governance/policies/participation/ "Mozilla Community Participation Guidelines") as well as the [GitHub Community Participation Guidelines](https://help.github.com/en/github/site-policy/github-community-guidelines "GitHub Community Participation Guidelines"). Behavior that falls into the areas forbidden by either document is unwelcome and will result in further escalation. 46 | -------------------------------------------------------------------------------- /tools/private_domains_checker/README.md: -------------------------------------------------------------------------------- 1 | # PSL Private Section Domains WHOIS Checker 2 | 3 | ## Overview 4 | 5 | The `PSLPrivateDomainsProcessor` is a Python script designed to fetch data from the Public Suffix List (PSL) and check the domain status, expiry dates, and `_psl` TXT records of the private section domains. 6 | 7 | It performs WHOIS checks on these domains and saves the results into CSV files for manual review. 8 | 9 | ## Requirements 10 | 11 | - Python 3.x 12 | - `requests` 13 | - `pandas` 14 | - `whoisdomain` 15 | 16 | You can install the required packages using pip: 17 | 18 | ```sh 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | Ensure that `whois` is installed on your operating system. 23 | 24 | ```sh 25 | sudo apt install whois  # Debian/Ubuntu 26 | sudo yum install whois  # Fedora/Centos/Rocky 27 | ``` 28 | 29 | ## Usage 30 | 31 | `PSLPrivateDomainsProcessor.py`: The main script containing the `PSLPrivateDomainsProcessor` class and functions for DNS and WHOIS checks.
32 | 33 | Run the script using Python: 34 | 35 | ```sh 36 | cd private_domains_checker 37 | mkdir data 38 | python PSLPrivateDomainsProcessor.py 39 | ``` 40 | 41 | ## Main Components 42 | 43 | ### Functions 44 | 45 | - `make_dns_request(domain, record_type)`: Makes DNS requests to both Google and Cloudflare DNS APIs. 46 | - `check_dns_status(domain)`: Checks the DNS status of a domain using Google and Cloudflare DNS APIs. 47 | - `get_whois_data(domain)`: Retrieves WHOIS data for a domain using the whoisdomain package. 48 | - `check_psl_txt_record(domain)`: Checks the `_psl` TXT record for a domain using Google and Cloudflare DNS APIs. 49 | 50 | ### Class 51 | 52 | #### PSLPrivateDomainsProcessor 53 | 54 | - `fetch_psl_data()`: Fetches the PSL data from the specified URL. 55 | - `parse_domain(domain)`: Parses and normalizes a domain. 56 | - `parse_psl_data(psl_data)`: Parses the fetched PSL data and separates ICANN and private domains. 57 | - `process_domains(raw_domains, domains)`: Processes each domain, performing DNS, WHOIS, and `_psl` TXT record checks. 58 | - `save_results()`: Saves all processed domain data to `data/all.csv`. 59 | - `save_invalid_results()`: Saves domains with invalid DNS or expired WHOIS data to `data/nxdomain.csv` and `data/expired.csv`. 60 | - `save_hold_results()`: Saves domains with WHOIS status containing any form of "hold" to `data/hold.csv`. 61 | - `save_missing_psl_txt_results()`: Saves domains with invalid `_psl` TXT records to `data/missing_psl_txt.csv`. 62 | - `save_expiry_less_than_2yrs_results()`: Saves domains with WHOIS expiry date less than 2 years from now to `data/expiry_less_than_2yrs.csv`. 63 | - `run()`: Executes the entire processing pipeline. 64 | 65 | ## Output 66 | 67 | The script generates the following CSV files in the `data` directory: 68 | 69 | - `all.csv`: Contains all processed domain data. 70 | - `nxdomain.csv`: Contains domains that could not be resolved (`NXDOMAIN`). 71 | - `expired.csv`: Contains domains with expired WHOIS records. 72 | - `hold.csv`: Contains domains with WHOIS status indicating any kind of "hold". 73 | - `missing_psl_txt.csv`: Contains domains with invalid `_psl` TXT records. 74 | - `expiry_less_than_2yrs.csv`: Contains domains with WHOIS expiry date less than 2 years from now. 75 | 76 | ## Example 77 | 78 | An example CSV entry: 79 | 80 | | psl_entry | top_level_domain | dns_status | whois_status | whois_domain_expiry_date | whois_domain_status | psl_txt_status | expiry_check_status | 81 | | -------------- | ---------------- | ---------- | ------------ | ----------------------- | ---------------------------- | -------------- | ------------------- | 82 | | example.com | example.com | ok | ok | 2024-12-31 | "clientTransferProhibited" | "valid" | ok | 83 | 84 | ## Publicly Registrable Namespace Determination 85 | 86 | The script determines the publicly registrable namespace from private domains by using the ICANN section. 87 | 88 | Here's how it works: 89 | 90 | 1. **ICANN Domains Set**: ICANN domains are stored in a set for quick lookup. 91 | 2. **Domain Parsing**: For each private domain, the script splits the domain into parts. It then checks if any suffix of these parts exists in the ICANN domains set. 92 | 3. **Normalization**: The private domain is normalized to its publicly registrable form using the ICANN domains set. 
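A minimal sketch of that lookup, written in Go for illustration (the tool itself is Python; `icannSet` here is a hypothetical stand-in for the parsed ICANN section, and wildcard ICANN rules are ignored for brevity):

```go
package checker

import "strings"

// registrableDomain returns the publicly registrable form of domain,
// assuming icannSet holds every suffix from the PSL's ICANN section.
func registrableDomain(domain string, icannSet map[string]bool) string {
	domain = strings.TrimPrefix(domain, "*.")
	labels := strings.Split(domain, ".")
	// Try candidate suffixes from longest to shortest; on the first
	// ICANN match, keep exactly one label to its left.
	for i := 1; i < len(labels); i++ {
		if icannSet[strings.Join(labels[i:], ".")] {
			return strings.Join(labels[i-1:], ".")
		}
	}
	return domain
}
```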
93 | 94 | Examples: 95 | 96 | - **Input**: PSL private domain entry `"*.example.com"` 97 | - **Process**: 98 | - Remove leading `'*.'`: `"example.com"` 99 | - Check `"com"` against the ICANN domains set: Found 100 | - **Output**: `"example.com"` 101 | 102 | - **Input**: PSL private domain entry `"sub.example.co.uk"` 103 | - **Process**: 104 | - Check `"example.co.uk"` against the ICANN domains set: Not found 105 | - Check `"co.uk"` against the ICANN domains set: Found 106 | - **Output**: `"example.co.uk"` 107 | 108 | The output is then used for checking WHOIS data. 109 | 110 | ## License 111 | 112 | This tool is licensed under the MIT License. -------------------------------------------------------------------------------- /tools/internal/githistory/history.go: -------------------------------------------------------------------------------- 1 | // Package githistory provides helpers to look up PSL PR changes in a 2 | // local git repository. 3 | package githistory 4 | 5 | import ( 6 | "bytes" 7 | "fmt" 8 | "os/exec" 9 | "regexp" 10 | "strconv" 11 | "strings" 12 | ) 13 | 14 | // PRInfo lists commit metadata for a given Github PR. 15 | type PRInfo struct { 16 | Num int 17 | // CommitHash is the git hash in which the PSL contains the 18 | // changes of this PR. 19 | CommitHash string 20 | // ParentHash is the git hash immediately before this PR's changes 21 | // were added to the PSL. 22 | ParentHash string 23 | } 24 | 25 | // History is PR metadata extracted from a local PSL git clone. 26 | type History struct { 27 | GitPath string // path to the local git clone 28 | PRs map[int]PRInfo 29 | } 30 | 31 | // gitToplevel finds the top level of the git repository that contains 32 | // path, if any. 33 | func gitToplevel(path string) (string, error) { 34 | bs, err := gitStdout(path, "rev-parse", "--show-toplevel") 35 | if err != nil { 36 | return "", fmt.Errorf("finding top level of git repo %q: %w", path, err) 37 | } 38 | return string(bs), nil 39 | } 40 | 41 | // GetPRInfo extracts PR metadata from the git repository at gitPath. 42 | func GetPRInfo(gitPath string) (*History, error) { 43 | toplevel, err := gitToplevel(gitPath) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | // List all commits that have a description with a '(#1234)' at 49 | // the end of a line of description or "Merge pull request #1234 50 | // from" at the start, and print the matching commits in a form 51 | // that's easy to parse. 52 | prCommits, err := gitStdout(toplevel, "log", 53 | "--perl-regexp", 54 | `--grep=\(#\d+\)$`, 55 | `--grep=^Merge pull request #\d+ from`, 56 | "--pretty=%H@%P@%s", 57 | "master") 58 | if err != nil { return nil, err } 59 | ret := &History{ 60 | GitPath: toplevel, 61 | PRs: map[int]PRInfo{}, 62 | } 63 | for _, line := range strings.Split(string(prCommits), "\n") { 64 | fs := strings.SplitN(line, "@", 3) 65 | if len(fs) != 3 { 66 | return nil, fmt.Errorf("unexpected line format %q", line) 67 | } 68 | commit, parentsStr, desc := fs[0], fs[1], fs[2] 69 | parents := strings.Split(parentsStr, " ") 70 | // For merge commits, we have multiple parents, and we want 71 | // the "main branch" side of the merge, i.e. the state of the 72 | // tree before the PR was merged. Empirically, Github always 73 | // lists that commit as the 1st parent in merge commits. 74 | // 75 | // For squash commits, there is only one parent. 76 | // 77 | // This logic cannot handle rebase-and-merge actions, since 78 | // those by definition erase the PR history from the git 79 | // history.
However, the PSL doesn't use rebase-and-merge by 80 | // convention, so this works out. Worst case, if this logic 81 | // does catch a rebase-and-merge, the result will be false 82 | // positives (suffix flagged for invalid TXT record), if the 83 | // PR contained more than 1 commit. 84 | parent := parents[0] 85 | ms := prNumberRe.FindStringSubmatch(desc) 86 | if len(ms) != 3 { 87 | // The grep on git log returned a false positive where the 88 | // PR number is not on the first line of the commit 89 | // message. This is not a commit in the standard github 90 | // format for PRs. 91 | continue 92 | } 93 | 94 | var prNum int 95 | if ms[1] != "" { 96 | prNum, err = strconv.Atoi(ms[1]) 97 | } else { 98 | prNum, err = strconv.Atoi(ms[2]) 99 | } 100 | if err != nil { 101 | // Shouldn't happen, the regex isolates digits, why can't 102 | // we parse digits? 103 | return nil, fmt.Errorf("unexpected invalid PR number in commit subject %q", desc) 104 | } 105 | 106 | ret.PRs[prNum] = PRInfo{ 107 | Num: prNum, 108 | CommitHash: commit, 109 | ParentHash: parent, 110 | } 111 | } 112 | 113 | return ret, nil 114 | } 115 | 116 | // GetPSL returns the PSL file at the given commit hash in the git 117 | // repository at gitPath. 118 | func GetPSL(gitPath string, hash string) ([]byte, error) { 119 | toplevel, err := gitToplevel(gitPath) 120 | if err != nil { 121 | return nil, err 122 | } 123 | 124 | bs, err := gitStdout(toplevel, "show", fmt.Sprintf("%s:public_suffix_list.dat", hash)) 125 | if err != nil { 126 | return nil, err 127 | } 128 | 129 | return bs, nil 130 | } 131 | 132 | // Matches either "(#1234)" at the end of a line, or "Merge pull 133 | // request #1234 from" at the start of a line. The first is how github 134 | // formats squash-and-merge commits, the second is how github formats 135 | // 2-parent merge commits. 136 | var prNumberRe = regexp.MustCompile(`(?:\(#(\d+)\)$)|(?:^Merge pull request #(\d+) from)`) 137 | 138 | func gitStdout(repoPath string, args ...string) ([]byte, error) { 139 | args = append([]string{"-C", repoPath}, args...) 140 | c := exec.Command("git", args...) 141 | var stderr bytes.Buffer 142 | c.Stderr = &stderr 143 | bs, err := c.Output() 144 | if err != nil { 145 | // Make the error show the git commandline and captured 146 | // stderr, not just the plain "exited with code 45" error. 147 | cmdline := append([]string{"git"}, args...) 148 | var stderrStr string 149 | if stderr.Len() != 0 { 150 | stderrStr = "stderr:\n" + stderr.String() 151 | } 152 | return nil, fmt.Errorf("running %q: %w. %s", strings.Join(cmdline, " "), err, stderrStr) 153 | } 154 | return bytes.TrimSpace(bs), nil 155 | } 156 | -------------------------------------------------------------------------------- /tools/internal/github/pr.go: -------------------------------------------------------------------------------- 1 | // Package github provides a github client with functions tailored to 2 | // the PSL's needs. 3 | package github 4 | 5 | import ( 6 | "context" 7 | "errors" 8 | "fmt" 9 | "os" 10 | "time" 11 | 12 | "github.com/google/go-github/v63/github" 13 | ) 14 | 15 | // Repo is a GitHub API client that performs PSL-specific 16 | // operations. The zero value is a client that interacts with the 17 | // official publicsuffix/list repository. 18 | type Repo struct { 19 | // Owner is the github account of the repository to query. If 20 | // empty, defaults to "publicsuffix". 21 | Owner string 22 | // Repo is the repository to query. If empty, defaults to "list".
23 | Repo string 24 | 25 | client *github.Client 26 | } 27 | 28 | func (c *Repo) owner() string { 29 | if c.Owner != "" { 30 | return c.Owner 31 | } 32 | return "publicsuffix" 33 | } 34 | 35 | func (c *Repo) repo() string { 36 | if c.Repo != "" { 37 | return c.Repo 38 | } 39 | return "list" 40 | } 41 | 42 | func (c *Repo) apiClient() *github.Client { 43 | if c.client == nil { 44 | c.client = github.NewClient(nil) 45 | if token := os.Getenv("GITHUB_TOKEN"); token != "" { 46 | c.client = c.client.WithAuthToken(token) 47 | } 48 | } 49 | return c.client 50 | } 51 | 52 | // PSLForPullRequest fetches the PSL files needed to validate the 53 | // given pull request. Returns the PSL file for the target branch, and 54 | // the same but with the PR's changes applied. 55 | func (c *Repo) PSLForPullRequest(ctx context.Context, prNum int) (withoutPR, withPR []byte, err error) { 56 | // Github sometimes needs a little time to think to update the PR 57 | // state, so we might need to sleep and retry a few times. Usually 58 | // the status updates in <5s, but just for safety, give it a more 59 | // generous timeout. 60 | ctx, cancel := context.WithTimeout(ctx, 30*time.Second) 61 | defer cancel() 62 | 63 | var withoutHash, withHash string 64 | for withoutHash == "" { 65 | withoutHash, withHash, err = c.getPRCommitInfo(ctx, prNum) 66 | if errors.Is(err, errMergeInfoNotReady) { 67 | // PR exists but merge info is stale, need to wait and 68 | // retry. 69 | select { 70 | case <-time.After(2 * time.Second): 71 | continue 72 | case <-ctx.Done(): 73 | return nil, nil, ctx.Err() 74 | } 75 | } else if err != nil { 76 | return nil, nil, err 77 | } 78 | } 79 | 80 | withoutPR, err = c.PSLForHash(ctx, withoutHash) 81 | if err != nil { 82 | return nil, nil, err 83 | } 84 | withPR, err = c.PSLForHash(ctx, withHash) 85 | if err != nil { 86 | return nil, nil, err 87 | } 88 | return withoutPR, withPR, nil 89 | } 90 | 91 | var errMergeInfoNotReady = errors.New("PR mergeability information not available yet, please retry later") 92 | 93 | // getPRCommitInfo returns the "before" and "after" commit hashes for 94 | // prNum. 95 | // 96 | // The exact meaning of "before" and "after" varies, but in general 97 | // before is the state of the master branch right before the PR is 98 | // merged, and "after" is the same state plus the PR's changes, with 99 | // no unrelated changes. 100 | // 101 | // For an unmerged PR, "after" is a "trial merge commit" created 102 | // automatically by Github to run CI and check that the PR is 103 | // mergeable, and "before" is the master branch state from that trial 104 | // merge - usually the latest current state. 105 | // 106 | // For a merged PR, "after" is the commit where the PR's changes first 107 | // appeared in master, and "before" is the state of master immediately 108 | // before that. 109 | // 110 | // getPRCommitInfo returns the sentinel error errMergeInfoNotReady if 111 | // an open PR exists, but github needs a bit more time to update the 112 | // trial merge commit. The caller is expected to retry with 113 | // appropriate backoff. 
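//
// To illustrate with invented hashes: if PR 1234 was squash-merged as
// commit C whose only parent is P, then withPRCommit is C and
// withoutPRCommit is P. For an open PR, withPRCommit is the trial
// merge commit M created by Github, and withoutPRCommit is M's
// master-side parent.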
114 | func (c *Repo) getPRCommitInfo(ctx context.Context, prNum int) (withoutPRCommit, withPRCommit string, err error) { 115 | ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 116 | defer cancel() 117 | 118 | pr, _, err := c.apiClient().PullRequests.Get(ctx, c.owner(), c.repo(), prNum) 119 | if err != nil { 120 | return "", "", err 121 | } 122 | 123 | mergeCommit := pr.GetMergeCommitSHA() 124 | if mergeCommit == "" { 125 | return "", "", fmt.Errorf("no merge commit available for PR %d", prNum) 126 | } 127 | commitInfo, _, err := c.apiClient().Git.GetCommit(ctx, c.owner(), c.repo(), mergeCommit) 128 | if err != nil { 129 | return "", "", fmt.Errorf("getting info for trial merge SHA %q: %w", mergeCommit, err) 130 | } 131 | 132 | var beforeMergeCommit string 133 | if pr.GetMerged() && len(commitInfo.Parents) == 1 { 134 | // PR was merged, PSL policy is to use squash-and-merge, so 135 | // the pre-PR commit is simply the parent of the merge commit. 136 | beforeMergeCommit = commitInfo.Parents[0].GetSHA() 137 | } else if pr.Mergeable == nil { 138 | // PR isn't merged, but github needs time to rebase the PR and 139 | // create a trial merge. Unfortunately the only way to know 140 | // when it's done is to just poll and wait for the mergeable 141 | // bool to be valid. 142 | return "", "", errMergeInfoNotReady 143 | } else if !pr.GetMergeable() { 144 | // PR isn't merged, and there's a merge conflict that prevents 145 | // us from knowing what the pre- and post-merge states are. 146 | return "", "", fmt.Errorf("cannot get PSL for PR %d, needs rebase to resolve conflicts", prNum) 147 | } else { 148 | // PR is either open, or it was merged without squashing. In 149 | // both cases, mergeCommit has 2 parents: one is the PR head 150 | // commit, and the other is the master branch without the PR's 151 | // changes. 152 | if numParents := len(commitInfo.Parents); numParents != 2 { 153 | return "", "", fmt.Errorf("unexpected parent count %d for trial merge commit on PR %d, expected 2 parents", numParents, prNum) 154 | } 155 | 156 | prHeadCommit := pr.GetHead().GetSHA() 157 | if prHeadCommit == "" { 158 | return "", "", fmt.Errorf("no commit SHA available for head of PR %d", prNum) 159 | } 160 | if commitInfo.Parents[0].GetSHA() == prHeadCommit { 161 | beforeMergeCommit = commitInfo.Parents[1].GetSHA() 162 | } else { 163 | beforeMergeCommit = commitInfo.Parents[0].GetSHA() 164 | } 165 | } 166 | 167 | return beforeMergeCommit, mergeCommit, nil 168 | } 169 | 170 | // PSLForHash returns the PSL file at the given git commit hash. 
171 | func (c *Repo) PSLForHash(ctx context.Context, hash string) ([]byte, error) { 172 | ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 173 | defer cancel() 174 | 175 | opts := &github.RepositoryContentGetOptions{ 176 | Ref: hash, 177 | } 178 | content, _, _, err := c.apiClient().Repositories.GetContents(ctx, c.owner(), c.repo(), "public_suffix_list.dat", opts) 179 | if err != nil { 180 | return nil, fmt.Errorf("getting PSL for commit %q: %w", hash, err) 181 | } 182 | ret, err := content.GetContent() 183 | if err != nil { 184 | return nil, err 185 | } 186 | return []byte(ret), nil 187 | } 188 | -------------------------------------------------------------------------------- /tools/internal/parser/text_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "reflect" 7 | "testing" 8 | 9 | "github.com/google/go-cmp/cmp" 10 | "golang.org/x/text/encoding" 11 | "golang.org/x/text/encoding/unicode" 12 | ) 13 | 14 | func TestNormalize(t *testing.T) { 15 | t.Parallel() 16 | 17 | tests := []struct { 18 | name string 19 | in []byte 20 | want []string 21 | wantErrs []error 22 | }{ 23 | { 24 | name: "empty_input", 25 | in: []byte{}, 26 | want: []string{}, 27 | }, 28 | { 29 | name: "no_early_errors", 30 | in: byteLines( 31 | "// This is a small replica", 32 | "// of the PSL", 33 | "com", 34 | "net", 35 | "lol", 36 | "", 37 | "// End of file", 38 | ), 39 | want: []string{ 40 | "// This is a small replica", 41 | "// of the PSL", 42 | "com", 43 | "net", 44 | "lol", 45 | "", 46 | "// End of file", 47 | }, 48 | }, 49 | { 50 | name: "utf16be_input_with_bom", 51 | in: utf16BigWithBOM("utf-16 text"), 52 | want: []string{"utf-16 text"}, 53 | }, 54 | { 55 | name: "utf16le_input_with_bom", 56 | in: utf16LittleWithBOM("utf-16 text"), 57 | want: []string{"utf-16 text"}, 58 | }, 59 | { 60 | name: "utf16be_input", 61 | in: utf16Big("utf-16 text utf-16 text utf-16 text"), 62 | want: []string{"utf-16 text utf-16 text utf-16 text"}, 63 | wantErrs: []error{ErrInvalidEncoding{"UTF-16BE (guessed)"}}, 64 | }, 65 | { 66 | name: "utf16le_input", 67 | in: utf16Little("utf-16 text utf-16 text utf-16 text"), 68 | want: []string{"utf-16 text utf-16 text utf-16 text"}, 69 | wantErrs: []error{ErrInvalidEncoding{"UTF-16LE (guessed)"}}, 70 | }, 71 | { 72 | name: "utf8_with_bom", 73 | in: utf8WithBOM("utf-8 text"), 74 | want: []string{"utf-8 text"}, 75 | }, 76 | { 77 | name: "utf8_with_garbage", 78 | // See https://en.wikipedia.org/wiki/UTF-8 for a 79 | // description of UTF-8 encoding, to help understand why 80 | // these inputs are invalid. 81 | // 82 | // The invalid patterns are immediately followed by more 83 | // valid characters, to verify exactly how normalization 84 | // mangles the bytes around an invalid sequence. 
85 | in: byteLines( 86 | "normal UTF-8", 87 | // Illegal start bitpattern (5 leading bits set to 1) 88 | "bad1: \xF8abc", 89 | // First byte declares 3-byte character, but ends after 2 bytes 90 | "bad2: \xE0\xBFabc", 91 | // Continuation byte outside of a character 92 | "bad3: \xBFabc", 93 | // Ascii space (0x20) encoded non-minimally 94 | "bad4: \xC0\xA0abc", 95 | "this line is ok", 96 | ), 97 | want: []string{ 98 | "normal UTF-8", 99 | "bad1: \uFFFDabc", 100 | "bad2: \uFFFDabc", 101 | "bad3: \uFFFDabc", 102 | "bad4: \uFFFD\uFFFDabc", 103 | "this line is ok", 104 | }, 105 | wantErrs: []error{ 106 | ErrInvalidUnicode{mkSrc(1, 2)}, 107 | ErrInvalidUnicode{mkSrc(2, 3)}, 108 | ErrInvalidUnicode{mkSrc(3, 4)}, 109 | ErrInvalidUnicode{mkSrc(4, 5)}, 110 | }, 111 | }, 112 | { 113 | name: "dos_line_endings", 114 | in: byteLines( 115 | "normal file\r", 116 | "except the lines\r", 117 | "end like it's 1991"), 118 | want: []string{ 119 | "normal file", 120 | "except the lines", 121 | "end like it's 1991", 122 | }, 123 | }, 124 | { 125 | name: "trailing_whitespace", 126 | in: byteLines( 127 | "a file ", 128 | "with all kinds\t\t", 129 | " \r\t", 130 | // Strange "spaces": em space, ideographic space, 131 | // 4/18em medium mathematical space. 132 | "of trailing space\u2003\u3000\u205f", 133 | "and one good line", 134 | ), 135 | want: []string{ 136 | "a file", 137 | "with all kinds", 138 | "", 139 | "of trailing space", 140 | "and one good line", 141 | }, 142 | }, 143 | { 144 | name: "leading_whitespace", 145 | in: byteLines( 146 | " a file", 147 | "\t\twith all kinds", 148 | " \r\t", // ensure this is reported as trailing, not leading 149 | // Strange "spaces": em space, ideographic space, 150 | // 4/18em medium mathematical space. 151 | "\u2003\u3000\u205fof leading space", 152 | "and one good line", 153 | ), 154 | want: []string{ 155 | "a file", 156 | "with all kinds", 157 | "", 158 | "of leading space", 159 | "and one good line", 160 | }, 161 | }, 162 | { 163 | name: "the_most_wrong_line", 164 | in: byteLines("\xef\xbb\xbf \t // Hello\xc3\x28 very broken line\t \r"), 165 | want: []string{"// Hello\uFFFD( very broken line"}, 166 | wantErrs: []error{ 167 | ErrInvalidUnicode{mkSrc(0, 1)}, 168 | }, 169 | }, 170 | } 171 | 172 | for _, tc := range tests { 173 | t.Run(tc.name, func(t *testing.T) { 174 | lines, errs := normalizeToUTF8Lines(tc.in) 175 | checkDiff(t, "newSource error set", errs, tc.wantErrs) 176 | checkDiff(t, "newSource result", lines, tc.want) 177 | }) 178 | } 179 | } 180 | 181 | func byteLines(lines ...any) []byte { 182 | var ret [][]byte 183 | for _, ln := range lines { 184 | switch v := ln.(type) { 185 | case string: 186 | ret = append(ret, []byte(v)) 187 | case []byte: 188 | ret = append(ret, v) 189 | default: 190 | panic(fmt.Sprintf("unhandled type %T for bytes()", ln)) 191 | } 192 | } 193 | return bytes.Join(ret, []byte("\n")) 194 | } 195 | 196 | func encodeFromUTF8(s string, e encoding.Encoding) []byte { 197 | ret, err := e.NewEncoder().Bytes([]byte(s)) 198 | if err != nil { 199 | // Only way this can happen is if the input isn't valid UTF-8, 200 | // and we don't do that in these tests. 
201 | panic(err) 202 | } 203 | return ret 204 | } 205 | 206 | func utf16Big(s string) []byte { 207 | return encodeFromUTF8(s, unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)) 208 | } 209 | 210 | func utf16BigWithBOM(s string) []byte { 211 | return encodeFromUTF8(s, unicode.UTF16(unicode.BigEndian, unicode.UseBOM)) 212 | } 213 | 214 | func utf16Little(s string) []byte { 215 | return encodeFromUTF8(s, unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)) 216 | } 217 | 218 | func utf16LittleWithBOM(s string) []byte { 219 | return encodeFromUTF8(s, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)) 220 | } 221 | 222 | func utf8WithBOM(s string) []byte { 223 | return encodeFromUTF8(s, unicode.UTF8BOM) 224 | } 225 | 226 | func checkDiff(t *testing.T, whatIsBeingDiffed string, got, want any) { 227 | t.Helper() 228 | 229 | // cmp.Diff refuses to examine unexported fields by default. Tell 230 | // it that it's okay to look at unexported fields of blocks and 231 | // blockInfo, since we own those fields and want to include their 232 | // values in comparisons. 233 | exportInfo := cmp.Exporter(func(t reflect.Type) bool { 234 | if t.Kind() != reflect.Pointer { 235 | t = reflect.PointerTo(t) 236 | } 237 | 238 | if t.Elem() == reflect.TypeFor[blockInfo]() { 239 | return true 240 | } 241 | 242 | if t.Implements(reflect.TypeFor[Block]()) { 243 | return true 244 | } 245 | 246 | return false 247 | }) 248 | if diff := cmp.Diff(got, want, exportInfo); diff != "" { 249 | t.Errorf("%s is wrong (-got+want):\n%s", whatIsBeingDiffed, diff) 250 | } 251 | } 252 | -------------------------------------------------------------------------------- /tools/internal/parser/text.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "strings" 7 | "unicode/utf8" 8 | 9 | "golang.org/x/text/encoding" 10 | xunicode "golang.org/x/text/encoding/unicode" 11 | ) 12 | 13 | // SourceRange describes a slice of lines from an unparsed source 14 | // file. FirstLine and LastLine behave like normal slice offsets, 15 | // i.e. they represent the half-open range [FirstLine:LastLine). 16 | type SourceRange struct { 17 | FirstLine int 18 | LastLine int 19 | } 20 | 21 | // NumLines returns the number of source lines described by 22 | // SourceRange. 23 | func (s SourceRange) NumLines() int { 24 | if s.FirstLine >= s.LastLine { 25 | return 0 26 | } 27 | return s.LastLine - s.FirstLine 28 | } 29 | 30 | // LocationString prints a human-readable description of the 31 | // SourceRange. 32 | func (s SourceRange) LocationString() string { 33 | switch { 34 | case s.LastLine <= s.FirstLine: 35 | return "" 36 | case s.LastLine == s.FirstLine+1: 37 | return fmt.Sprintf("line %d", s.FirstLine+1) 38 | default: 39 | return fmt.Sprintf("lines %d-%d", s.FirstLine+1, s.LastLine) 40 | } 41 | } 42 | 43 | // merge returns a SourceRange that contains both s and other. If s 44 | // and other are not contiguous or overlapping, the returned 45 | // SourceRange also spans unrelated lines, but always covers both s 46 | // and other. 
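//
// For example (values invented): merging the half-open ranges [2,5)
// and [7,9) yields [2,9), which covers both inputs plus the unrelated
// lines 5 and 6.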
47 | func (s SourceRange) merge(other SourceRange) SourceRange { 48 | return SourceRange{ 49 | FirstLine: min(s.FirstLine, other.FirstLine), 50 | LastLine: max(s.LastLine, other.LastLine), 51 | } 52 | } 53 | 54 | const ( 55 | bomUTF8 = "\xEF\xBB\xBF" 56 | bomUTF16BE = "\xFE\xFF" 57 | bomUTF16LE = "\xFF\xFE" 58 | ) 59 | 60 | // The transformers that normalizeToUTF8Lines can use to process input 61 | // into valid UTF-8, and that guessUTFVariant can return. 62 | var ( 63 | utf8Transform = xunicode.UTF8BOM 64 | utf16LittleEndianTransform = xunicode.UTF16(xunicode.LittleEndian, xunicode.UseBOM) 65 | utf16BigEndianTransform = xunicode.UTF16(xunicode.BigEndian, xunicode.UseBOM) 66 | ) 67 | 68 | // normalizeToUTF8Lines slices bs into one string per line. 69 | // 70 | // All returned strings contain only valid UTF-8. Invalid byte 71 | // sequences are replaced with the unicode replacement character 72 | // (\uFFFD). 73 | // 74 | // The canonical PSL encoding is a file consisting entirely of valid 75 | // UTF-8, with no leading BOM or unicode replacement characters. In an 76 | // effort to report useful errors for common mangling caused by older 77 | // Windows software, normalizeToUTF8Lines accepts input encoded as 78 | // UTF-8, UTF-16LE or UTF-16BE, with or without a leading BOM. 79 | // 80 | // normalizeToUTF8Lines returns the normalized lines of bs, as well as 81 | // errors that report deviations from the canonical encoding, if any. 82 | func normalizeToUTF8Lines(bs []byte) ([]string, []error) { 83 | var errs []error 84 | 85 | // Figure out the byte encoding to use. We try to detect and 86 | // correctly parse UTF-16 that doesn't have a BOM, but we also 87 | // report an explicit parse error in that case, because we cannot 88 | // be confident the parse is 100% correct, and therefore we can't 89 | // automatically fix it. 90 | enc := utf8Transform 91 | switch { 92 | case bytes.HasPrefix(bs, []byte(bomUTF8)): 93 | case bytes.HasPrefix(bs, []byte(bomUTF16BE)): 94 | enc = utf16BigEndianTransform 95 | case bytes.HasPrefix(bs, []byte(bomUTF16LE)): 96 | enc = utf16LittleEndianTransform 97 | default: 98 | enc = guessUTFVariant(bs) 99 | switch enc { 100 | case utf16BigEndianTransform: 101 | errs = append(errs, ErrInvalidEncoding{"UTF-16BE (guessed)"}) 102 | case utf16LittleEndianTransform: 103 | errs = append(errs, ErrInvalidEncoding{"UTF-16LE (guessed)"}) 104 | } 105 | } 106 | 107 | bs, err := enc.NewDecoder().Bytes(bs) 108 | if err != nil { 109 | // The decoder shouldn't error out, if it does we can't really 110 | // proceed, just return the errors we've found so far. 111 | errs = append(errs, err) 112 | return []string{}, errs 113 | } 114 | 115 | if len(bs) == 0 { 116 | return []string{}, errs 117 | } 118 | 119 | ret := strings.Split(string(bs), "\n") 120 | for i, line := range ret { 121 | // capture source info before we tidy up the line starts/ends, 122 | // so that input normalization errors show the problem being 123 | // described. 124 | // 125 | // However, we still provide post-sanitization UTF-8 bytes, 126 | // not the raw input. The raw input is unlikely to display 127 | // correctly in terminals and logs, and because the unicode 128 | // replacement character is a distinctive shape that stands 129 | // out, it should provide enough hints as to where any invalid 130 | // byte sequences are. 131 | src := SourceRange{i, i + 1} 132 | if strings.ContainsRune(line, utf8.RuneError) { 133 | // We can't fix invalid Unicode, by definition we don't 134 | // know what it's trying to say.
135 | errs = append(errs, ErrInvalidUnicode{src}) 136 | } 137 | ret[i] = strings.TrimSpace(line) 138 | } 139 | 140 | return ret, errs 141 | } 142 | 143 | // guessUTFVariant guesses the encoding of bs. 144 | // 145 | // Returns the transformer to use on bs, one of utf8Transform, 146 | // utf16LittleEndianTransform or utf16BigEndianTransform. 147 | func guessUTFVariant(bs []byte) encoding.Encoding { 148 | // Only scan a few hundred bytes. Assume UTF-8 if we don't see 149 | // anything odd before that. 150 | const checkLimit = 200 // 100 UTF-16 characters 151 | if len(bs) > checkLimit { 152 | bs = bs[:checkLimit] 153 | } 154 | 155 | // This is a crude but effective trick to detect UTF-16: we assume 156 | // that the input contains at least some ascii, and that the 157 | // decoded input does not contain Unicode \u0000 codepoints 158 | // (legacy ascii null). 159 | // 160 | // If this is true, then valid UTF-8 text does not have any zero 161 | // bytes, because UTF-8 never produces a zero byte except when it 162 | // encodes the \u0000 codepoint. 163 | // 164 | // On the other hand, UTF-16 encodes all codepoints as a pair of 165 | // bytes, and that means an ascii string in UTF-16 has a zero byte 166 | // every 2 bytes. We can use the presence of zero bytes to 167 | // identify UTF-16, and the position of the zero (even or odd 168 | // offset) tells us what endianness to use. 169 | evenZeros, oddZeros := 0, 0 170 | for i, b := range bs { 171 | if b != 0 { 172 | continue 173 | } 174 | 175 | if i%2 == 0 { 176 | evenZeros++ 177 | } else { 178 | oddZeros++ 179 | } 180 | 181 | const ( 182 | // Wait for a few zero bytes to accumulate, because if 183 | // this is just UTF-8 with a few \u0000 codepoints, 184 | // decoding as UTF-16 will be complete garbage. So, wait 185 | // until we see a suspicious number of zeros, and require 186 | // a strong bias towards even/odd before we guess 187 | // UTF-16. Otherwise, UTF-8 gives us the best chance of 188 | // producing coherent errors. 189 | decisionThreshold = 20 190 | utf16Threshold = 15 191 | ) 192 | if evenZeros+oddZeros < decisionThreshold { 193 | continue 194 | } 195 | if evenZeros > utf16Threshold { 196 | return utf16BigEndianTransform 197 | } else if oddZeros > utf16Threshold { 198 | return utf16LittleEndianTransform 199 | } 200 | // Lots of zeros, but no strong bias. No idea what's going on, 201 | // UTF-8 is a safe fallback. 202 | return utf8Transform 203 | } 204 | 205 | // Didn't find enough zeros, probably UTF-8.
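// (Worked example, not from real input: "ab" encodes as 0x00 0x61
// 0x00 0x62 in UTF-16BE, zeros at even offsets, and as 0x61 0x00
// 0x62 0x00 in UTF-16LE, zeros at odd offsets. That is exactly the
// even/odd bias the counters above look for.)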
206 | return utf8Transform 207 | } 208 | -------------------------------------------------------------------------------- /tools/internal/parser/metadata_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "net/mail" 5 | "net/url" 6 | "testing" 7 | ) 8 | 9 | func TestMetadata(t *testing.T) { 10 | tests := []struct { 11 | name string 12 | in *Comment 13 | want MaintainerInfo 14 | }{ 15 | { 16 | name: "empty", 17 | in: nil, 18 | want: MaintainerInfo{ 19 | MachineEditable: true, 20 | }, 21 | }, 22 | 23 | { 24 | name: "canonical", 25 | in: comment(0, 26 | "DuckCo : https://example.com", 27 | "Submitted by Duck <duck@example.com>", 28 | ), 29 | want: MaintainerInfo{ 30 | Name: "DuckCo", 31 | URLs: urls("https://example.com"), 32 | Maintainers: emails("Duck", "duck@example.com"), 33 | MachineEditable: true, 34 | }, 35 | }, 36 | 37 | { 38 | name: "canonical_no_space_around_colon", 39 | in: comment(0, 40 | "DuckCo:https://example.com", 41 | "Submitted by Duck <duck@example.com>", 42 | ), 43 | want: MaintainerInfo{ 44 | Name: "DuckCo", 45 | URLs: urls("https://example.com"), 46 | Maintainers: emails("Duck", "duck@example.com"), 47 | MachineEditable: true, 48 | }, 49 | }, 50 | 51 | { 52 | name: "canonical_url_in_parens", 53 | in: comment(0, 54 | "DuckCo (https://example.com)", 55 | "Submitted by Duck <duck@example.com>", 56 | ), 57 | want: MaintainerInfo{ 58 | Name: "DuckCo", 59 | URLs: urls("https://example.com"), 60 | Maintainers: emails("Duck", "duck@example.com"), 61 | MachineEditable: true, 62 | }, 63 | }, 64 | 65 | { 66 | name: "canonical_by_registry", 67 | in: comment(0, 68 | "DuckCo : https://example.com", 69 | "Submitted by registry <duck@example.com>", 70 | ), 71 | want: MaintainerInfo{ 72 | Name: "DuckCo", 73 | URLs: urls("https://example.com"), 74 | Maintainers: emails("", "duck@example.com"), 75 | MachineEditable: true, 76 | }, 77 | }, 78 | 79 | { 80 | name: "name_and_email_first", 81 | in: comment(0, 82 | "DuckCo : Duck <duck@example.com>", 83 | "https://example.com", 84 | ), 85 | want: MaintainerInfo{ 86 | Name: "DuckCo", 87 | URLs: urls("https://example.com"), 88 | Maintainers: emails("Duck", "duck@example.com"), 89 | MachineEditable: true, 90 | }, 91 | }, 92 | 93 | { 94 | name: "name_and_naked_email", 95 | in: comment(0, 96 | "DuckCo : duck@example.com", 97 | "https://example.com", 98 | ), 99 | want: MaintainerInfo{ 100 | Name: "DuckCo", 101 | URLs: urls("https://example.com"), 102 | Maintainers: emails("", "duck@example.com"), 103 | MachineEditable: true, 104 | }, 105 | }, 106 | 107 | { 108 | name: "one_per_line", 109 | in: comment(0, 110 | "DuckCo", 111 | "https://example.com", 112 | "Submitted by Duck <duck@example.com>", 113 | ), 114 | want: MaintainerInfo{ 115 | Name: "DuckCo", 116 | URLs: urls("https://example.com"), 117 | Maintainers: emails("Duck", "duck@example.com"), 118 | MachineEditable: true, 119 | }, 120 | }, 121 | 122 | { 123 | name: "no_name", 124 | in: comment(0, 125 | "https://example.com", 126 | "Submitted by Duck <duck@example.com>", 127 | "Other notes here", 128 | ), 129 | want: MaintainerInfo{ 130 | Name: "", 131 | URLs: urls("https://example.com"), 132 | Maintainers: emails("Duck", "duck@example.com"), 133 | Other: []string{"Other notes here"}, 134 | MachineEditable: true, 135 | }, 136 | }, 137 | 138 | { 139 | name: "http_url_and_bare_email", 140 | in: comment(0, 141 | "http://example.com", 142 | "duck@example.com", 143 | ), 144 | want: MaintainerInfo{ 145 | Name: "", 146 | URLs: urls("http://example.com"), 147 | Maintainers: emails("", "duck@example.com"), 148 | MachineEditable: true, 149 | }, 150 | }, 151
| 152 | { 153 | name: "multiple_urls", 154 | in: comment(0, 155 | "DuckCo : https://example.com", 156 | "https://example.org/details", 157 | "Submitted by Duck <duck@example.com>", 158 | ), 159 | want: MaintainerInfo{ 160 | Name: "DuckCo", 161 | URLs: urls("https://example.com", "https://example.org/details"), 162 | Maintainers: emails("Duck", "duck@example.com"), 163 | MachineEditable: true, 164 | }, 165 | }, 166 | 167 | { 168 | name: "multiple_emails", 169 | in: comment(0, 170 | "DuckCo : https://example.com", 171 | "Submitted by Duck <duck@example.com> and Goat <goat@example.com>", 172 | "llama@example.com", 173 | ), 174 | want: MaintainerInfo{ 175 | Name: "DuckCo", 176 | URLs: urls("https://example.com"), 177 | Maintainers: emails( 178 | "Duck", "duck@example.com", 179 | "Goat", "goat@example.com", 180 | "", "llama@example.com"), 181 | MachineEditable: true, 182 | }, 183 | }, 184 | 185 | { 186 | name: "multiple_everything_and_end_notes", 187 | in: comment(0, 188 | "DuckCo : https://example.com", 189 | "http://example.org", 190 | "https://example.net/more", 191 | "Submitted by Duck <duck@example.com> and Goat <goat@example.com>", 192 | "llama@example.com", 193 | `"Owl" <owl@example.net>`, 194 | "Duck is theoretically in charge, but Owl has influence", 195 | "Goat is not to be trusted, don't know about llama yet", 196 | ), 197 | want: MaintainerInfo{ 198 | Name: "DuckCo", 199 | URLs: urls("https://example.com", "http://example.org", "https://example.net/more"), 200 | Maintainers: emails( 201 | "Duck", "duck@example.com", 202 | "Goat", "goat@example.com", 203 | "", "llama@example.com", 204 | "Owl", "owl@example.net"), 205 | Other: []string{ 206 | "Duck is theoretically in charge, but Owl has influence", 207 | "Goat is not to be trusted, don't know about llama yet", 208 | }, 209 | MachineEditable: true, 210 | }, 211 | }, 212 | 213 | { 214 | name: "info_after_extra_notes", 215 | in: comment(0, 216 | "DuckCo", 217 | "Duck is in charge", 218 | "https://example.com", 219 | "Submitted by Duck <duck@example.com>", 220 | ), 221 | want: MaintainerInfo{ 222 | Name: "DuckCo", 223 | URLs: urls("https://example.com"), 224 | Maintainers: emails("Duck", "duck@example.com"), 225 | Other: []string{ 226 | "Duck is in charge", 227 | }, 228 | MachineEditable: false, 229 | }, 230 | }, 231 | 232 | { 233 | name: "obfuscated_email", 234 | in: comment(0, 235 | "lohmus", 236 | "someone at lohmus dot me", 237 | ), 238 | want: MaintainerInfo{ 239 | Name: "lohmus", 240 | Maintainers: emails("", "someone@lohmus.me"), 241 | MachineEditable: true, 242 | }, 243 | }, 244 | } 245 | 246 | for _, tc := range tests { 247 | got := extractMaintainerInfo(tc.in) 248 | checkDiff(t, "maintainer info", got, tc.want) 249 | } 250 | } 251 | 252 | func urls(us ...string) []*url.URL { 253 | var ret []*url.URL 254 | for _, s := range us { 255 | ret = append(ret, mustURL(s)) 256 | } 257 | return ret 258 | } 259 | 260 | func mustURL(s string) *url.URL { 261 | u, err := url.Parse(s) 262 | if err != nil { 263 | panic(err) 264 | } 265 | return u 266 | } 267 | 268 | func emails(elts ...string) []*mail.Address { 269 | var ret []*mail.Address 270 | for i := 0; i < len(elts); i += 2 { 271 | ret = append(ret, email(elts[i], elts[i+1])) 272 | } 273 | return ret 274 | } 275 | 276 | func email(name, email string) *mail.Address { 277 | return &mail.Address{ 278 | Name: name, 279 | Address: email, 280 | } 281 | } 282 | -------------------------------------------------------------------------------- /tools/internal/parser/errors.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | //
ErrInvalidEncoding reports that the input is encoded with 8 | // something other than UTF-8. 9 | type ErrInvalidEncoding struct { 10 | Encoding string 11 | } 12 | 13 | func (e ErrInvalidEncoding) Error() string { 14 | return fmt.Sprintf("invalid character encoding %s", e.Encoding) 15 | } 16 | 17 | // ErrInvalidUnicode reports that a line contains characters that are 18 | // not valid Unicode. 19 | type ErrInvalidUnicode struct { 20 | SourceRange 21 | } 22 | 23 | func (e ErrInvalidUnicode) Error() string { 24 | return fmt.Sprintf("%s: invalid Unicode character(s)", e.SourceRange.LocationString()) 25 | } 26 | 27 | // ErrSectionInSuffixBlock reports that a comment within a suffix 28 | // block contains a section delimiter. 29 | type ErrSectionInSuffixBlock struct { 30 | SourceRange 31 | } 32 | 33 | func (e ErrSectionInSuffixBlock) Error() string { 34 | return fmt.Sprintf("%s: section delimiter not allowed in suffix block comment", e.SourceRange.LocationString()) 35 | } 36 | 37 | // ErrUnclosedSection reports that a file section was not closed 38 | // properly before EOF. 39 | type ErrUnclosedSection struct { 40 | Section *Section 41 | } 42 | 43 | func (e ErrUnclosedSection) Error() string { 44 | return fmt.Sprintf("%s: section %q is missing its closing marker", e.Section.SourceRange.LocationString(), e.Section.Name) 45 | } 46 | 47 | // ErrNestedSection reports that a file section is being started while 48 | // already within a section. 49 | type ErrNestedSection struct { 50 | SourceRange 51 | Name string 52 | Section *Section 53 | } 54 | 55 | func (e ErrNestedSection) Error() string { 56 | return fmt.Sprintf("%s: section %q is nested inside section %q (%s)", e.SourceRange.LocationString(), e.Name, e.Section.Name, e.Section.SourceRange.LocationString()) 57 | } 58 | 59 | // ErrUnstartedSection reports that a section end marker was found 60 | // without a corresponding start. 61 | type ErrUnstartedSection struct { 62 | SourceRange 63 | Name string 64 | } 65 | 66 | func (e ErrUnstartedSection) Error() string { 67 | return fmt.Sprintf("%s: end marker for non-existent section %q", e.SourceRange.LocationString(), e.Name) 68 | } 69 | 70 | // ErrMismatchedSection reports that a file section was started 71 | // under one name but ended under another. 72 | type ErrMismatchedSection struct { 73 | SourceRange 74 | EndName string 75 | Section *Section 76 | } 77 | 78 | func (e ErrMismatchedSection) Error() string { 79 | return fmt.Sprintf("%s: section %q (%s) closed with wrong name %q", e.SourceRange.LocationString(), e.Section.Name, e.Section.SourceRange.LocationString(), e.EndName) 80 | } 81 | 82 | // ErrUnknownSectionMarker reports that a line looks like a file section 83 | // marker (e.g. "===BEGIN ICANN DOMAINS==="), but is not one of the 84 | // recognized kinds of marker. 85 | type ErrUnknownSectionMarker struct { 86 | SourceRange 87 | } 88 | 89 | func (e ErrUnknownSectionMarker) Error() string { 90 | return fmt.Sprintf("%s: unknown kind of section marker", e.SourceRange.LocationString()) 91 | } 92 | 93 | // ErrMissingEntityName reports that a block of suffixes does not have a 94 | // parseable owner name in its header comment.
95 | type ErrMissingEntityName struct { 96 | Suffixes *Suffixes 97 | } 98 | 99 | func (e ErrMissingEntityName) Error() string { 100 | return fmt.Sprintf("%s: suffix block has no owner name", e.Suffixes.SourceRange.LocationString()) 101 | } 102 | 103 | // ErrMissingEntityEmail reports that a block of suffixes does not have a 104 | // parseable contact email address in its header comment. 105 | type ErrMissingEntityEmail struct { 106 | Suffixes *Suffixes 107 | } 108 | 109 | func (e ErrMissingEntityEmail) Error() string { 110 | return fmt.Sprintf("%s: suffix block has no contact email", e.Suffixes.SourceRange.LocationString()) 111 | } 112 | 113 | // ErrInvalidSuffix reports that a suffix is not a valid PSL 114 | // entry. 115 | type ErrInvalidSuffix struct { 116 | SourceRange 117 | Suffix string 118 | Err error 119 | } 120 | 121 | func (e ErrInvalidSuffix) Error() string { 122 | return fmt.Sprintf("%s: invalid suffix %q: %v", e.SourceRange.LocationString(), e.Suffix, e.Err) 123 | } 124 | 125 | type ErrCommentPreventsSuffixSort struct { 126 | SourceRange 127 | } 128 | 129 | func (e ErrCommentPreventsSuffixSort) Error() string { 130 | return fmt.Sprintf("%s: comment prevents full sorting of suffixes", e.SourceRange.LocationString()) 131 | } 132 | 133 | type ErrCommentPreventsSectionSort struct { 134 | SourceRange 135 | } 136 | 137 | func (e ErrCommentPreventsSectionSort) Error() string { 138 | return fmt.Sprintf("%s: comment prevents full sorting of PSL section", e.SourceRange.LocationString()) 139 | } 140 | 141 | type ErrDuplicateSection struct { 142 | *Section 143 | FirstDefinition *Section 144 | } 145 | 146 | func (e ErrDuplicateSection) Error() string { 147 | return fmt.Sprintf("%s: duplicate section %q, first definition at %s", e.LocationString(), e.Name, e.FirstDefinition.LocationString()) 148 | } 149 | 150 | type ErrUnknownSection struct { 151 | *Section 152 | } 153 | 154 | func (e ErrUnknownSection) Error() string { 155 | return fmt.Sprintf("%s: unknown section %q, allowed sections are 'ICANN DOMAINS' and 'PRIVATE DOMAINS'", e.LocationString(), e.Name) 156 | } 157 | 158 | type ErrMissingSection struct { 159 | Name string 160 | } 161 | 162 | func (e ErrMissingSection) Error() string { 163 | return fmt.Sprintf("missing required section %q", e.Name) 164 | } 165 | 166 | type ErrDuplicateSuffix struct { 167 | Name string 168 | Block // Suffix or Wildcard 169 | FirstDefinition Block // Suffix or Wildcard 170 | } 171 | 172 | func (e ErrDuplicateSuffix) Error() string { 173 | return fmt.Sprintf("%s: duplicate suffix definition for %q, first definition at %s", e.SrcRange().LocationString(), e.Name, e.FirstDefinition.SrcRange().LocationString()) 174 | } 175 | 176 | type ErrConflictingSuffixAndException struct { 177 | *Suffix 178 | Wildcard *Wildcard 179 | } 180 | 181 | func (e ErrConflictingSuffixAndException) Error() string { 182 | return fmt.Sprintf("%s: suffix %s conflicts with exception in wildcard at %s", e.LocationString(), e.Domain, e.Wildcard.LocationString()) 183 | } 184 | 185 | type ErrMissingTXTRecord struct { 186 | Block 187 | } 188 | 189 | func (e ErrMissingTXTRecord) Error() string { 190 | var name string 191 | switch v := e.Block.(type) { 192 | case *Suffix: 193 | name = v.Domain.String() 194 | case *Wildcard: 195 | name = v.Domain.String() 196 | default: 197 | panic(fmt.Sprintf("unexpected block type %T in ErrMissingTXTRecord", e.Block)) 198 | } 199 | return fmt.Sprintf("%s: suffix %s has no TXT record", e.SrcRange().LocationString(), name) 200 | } 201 | 202 | type
ErrTXTRecordMismatch struct {
203 | Block
204 | PR int
205 | }
206 |
207 | func (e ErrTXTRecordMismatch) Error() string {
208 | switch v := e.Block.(type) {
209 | case *Suffix:
210 | return fmt.Sprintf("%s: suffix %s has a TXT record pointing to https://github.com/publicsuffix/list/pull/%d, but that PR does not change this suffix", e.SrcRange().LocationString(), v.Domain, e.PR)
211 | case *Wildcard:
212 | return fmt.Sprintf("%s: wildcard *.%s has a TXT record pointing to https://github.com/publicsuffix/list/pull/%d, but that PR does not change this wildcard", e.SrcRange().LocationString(), v.Domain, e.PR)
213 | default:
214 | panic(fmt.Sprintf("unexpected block type %T in ErrTXTRecordMismatch", e.Block))
215 | }
216 | }
217 |
218 | type ErrTXTCheckFailure struct {
219 | Block
220 | Err error
221 | }
222 |
223 | func (e ErrTXTCheckFailure) Error() string {
224 | var name string
225 | switch v := e.Block.(type) {
226 | case *Suffix:
227 | name = v.Domain.String()
228 | case *Wildcard:
229 | name = v.Domain.String()
230 | default:
231 | panic(fmt.Sprintf("unexpected block type %T in ErrTXTCheckFailure", e.Block))
232 | }
233 | return fmt.Sprintf("%s: error checking suffix %s: %v", e.SrcRange().LocationString(), name, e.Err)
234 | }
235 | -------------------------------------------------------------------------------- /tools/internal/parser/validate_test.go: --------------------------------------------------------------------------------
1 | package parser
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestValidateEntityMetadata(t *testing.T) {
8 | in := list(
9 | section(1, 1, "PRIVATE DOMAINS",
10 | suffixes(1, 1, info("", nil, emails("Example", "example@example.com"), nil, true),
11 | comment(1, "Submitted by Example <example@example.com>"),
12 | suffix(2, "example.com"),
13 | ),
14 |
15 | suffixes(2, 2, info("Example Ltd", nil, nil, nil, true),
16 | comment(1, "Example Ltd"),
17 | suffix(2, "example.org"),
18 | ),
19 |
20 | suffixes(3, 3, noInfo,
21 | suffix(1, "example.net"),
22 | ),
23 |
24 | suffixes(4, 4, info("Foo Ltd", nil, emails("Someone", "example@example.com"), nil, true),
25 | comment(1, "Submitted by Someone <example@example.com>"),
26 | suffix(2, "blah.example.com"),
27 | ),
28 | ),
29 | )
30 | want := []error{
31 | ErrMissingEntityName{
32 | Suffixes: suffixes(1, 1,
33 | info("", nil, emails("Example", "example@example.com"), nil, true),
34 | comment(1, "Submitted by Example <example@example.com>"),
35 | suffix(2, "example.com"),
36 | ),
37 | },
38 | ErrMissingEntityEmail{
39 | Suffixes: suffixes(2, 2, info("Example Ltd", nil, nil, nil, true),
40 | comment(1, "Example Ltd"),
41 | suffix(2, "example.org"),
42 | ),
43 | },
44 | ErrMissingEntityName{
45 | Suffixes: suffixes(3, 3, noInfo,
46 | suffix(1, "example.net"),
47 | ),
48 | },
49 | ErrMissingEntityEmail{
50 | Suffixes: suffixes(3, 3, noInfo,
51 | suffix(1, "example.net"),
52 | ),
53 | },
54 | }
55 |
56 | got := validateEntityMetadata(in)
57 | checkDiff(t, "validateEntityMetadata", got, want)
58 |
59 | // Now turn the input into a diff against a base version, and check the reduced error set.
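// prev below sketches an older revision of the same list: identical
// to in, except that the noInfo "example.net" block is absent, so
// after SetBaseVersion only that block (plus any identity dupes, as
// noted below) should be treated as changed.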
60 | prev := list(
61 | section(1, 1, "PRIVATE DOMAINS",
62 | suffixes(1, 1, info("", nil, emails("Example", "example@example.com"), nil, true),
63 | comment(1, "Submitted by Example <example@example.com>"),
64 | suffix(2, "example.com"),
65 | ),
66 |
67 | suffixes(2, 2, info("Example Ltd", nil, nil, nil, true),
68 | comment(1, "Example Ltd"),
69 | suffix(2, "example.org"),
70 | ),
71 |
72 | suffixes(3, 3, info("Foo Ltd", nil, emails("Someone", "example@example.com"), nil, true),
73 | comment(1, "Submitted by Someone <example@example.com>"),
74 | suffix(2, "blah.example.com"),
75 | ),
76 | ),
77 | )
78 |
79 | in.SetBaseVersion(prev, false)
80 | got = validateEntityMetadata(in)
81 |
82 | // Second suffix block no longer reports any errors. First one
83 | // still does, because its empty name is a dupe of the last block.
84 | want = []error{
85 | ErrMissingEntityName{
86 | Suffixes: suffixes(1, 1,
87 | info("", nil, emails("Example", "example@example.com"), nil, true),
88 | markUnchanged(comment(1, "Submitted by Example <example@example.com>")),
89 | markUnchanged(suffix(2, "example.com")),
90 | ),
91 | },
92 | ErrMissingEntityName{
93 | Suffixes: suffixes(3, 3, noInfo,
94 | suffix(1, "example.net"),
95 | ),
96 | },
97 | ErrMissingEntityEmail{
98 | Suffixes: suffixes(3, 3, noInfo,
99 | suffix(1, "example.net"),
100 | ),
101 | },
102 | }
103 |
104 | checkDiff(t, "validateEntityMetadata (changed blocks only)", got, want)
105 | }
106 |
107 | func TestValidateExpectedSections(t *testing.T) {
108 | tests := []struct {
109 | name string
110 | in *List
111 | want []error
112 | }{
113 | {
114 | name: "ok",
115 | in: list(
116 | section(1, 1, "ICANN DOMAINS"),
117 | section(2, 2, "PRIVATE DOMAINS"),
118 | ),
119 | want: nil,
120 | },
121 | {
122 | name: "all_missing",
123 | in: list(),
124 | want: []error{
125 | ErrMissingSection{"ICANN DOMAINS"},
126 | ErrMissingSection{"PRIVATE DOMAINS"},
127 | },
128 | },
129 | {
130 | name: "one_missing",
131 | in: list(
132 | section(1, 1, "ICANN DOMAINS"),
133 | ),
134 | want: []error{
135 | ErrMissingSection{"PRIVATE DOMAINS"},
136 | },
137 | },
138 | {
139 | name: "unknown",
140 | in: list(
141 | section(1, 1, "ICANN DOMAINS"),
142 | section(2, 2, "PRIVATE DOMAINS"),
143 | section(3, 3, "NON EUCLIDEAN DOMAINS"),
144 | ),
145 | want: []error{
146 | ErrUnknownSection{section(3, 3, "NON EUCLIDEAN DOMAINS")},
147 | },
148 | },
149 | {
150 | name: "duplicate_known",
151 | in: list(
152 | section(1, 1, "ICANN DOMAINS"),
153 | section(2, 2, "PRIVATE DOMAINS"),
154 | section(3, 3, "ICANN DOMAINS"),
155 | ),
156 | want: []error{
157 | ErrDuplicateSection{
158 | section(3, 3, "ICANN DOMAINS"),
159 | section(1, 1, "ICANN DOMAINS"),
160 | },
161 | },
162 | },
163 | {
164 | name: "duplicate_unknown",
165 | in: list(
166 | section(1, 1, "RIDICULOUS DOMAINS"),
167 | section(2, 2, "ICANN DOMAINS"),
168 | section(3, 3, "PRIVATE DOMAINS"),
169 | section(4, 4, "RIDICULOUS DOMAINS"),
170 | ),
171 | want: []error{
172 | ErrUnknownSection{section(1, 1, "RIDICULOUS DOMAINS")},
173 | ErrUnknownSection{section(4, 4, "RIDICULOUS DOMAINS")},
174 | },
175 | },
176 | }
177 |
178 | for _, tc := range tests {
179 | t.Run(tc.name, func(t *testing.T) {
180 | got := validateExpectedSections(tc.in)
181 | checkDiff(t, "validateExpectedSections output", got, tc.want)
182 | })
183 | }
184 | }
185 |
186 | func TestValidateSuffixUniqueness(t *testing.T) {
187 | tests := []struct {
188 | name string
189 | in *List
190 | want []error
191 | }{
192 | {
193 | name: "ok",
194 | in: list(
195 | section(1, 2, "PRIVATE DOMAINS",
196 | suffixes(2, 3, noInfo,
197 |
suffix(3, "foo.com"), 198 | suffix(4, "bar.com"), 199 | ), 200 | ), 201 | ), 202 | want: nil, 203 | }, 204 | 205 | { 206 | name: "dupe_suffixes", 207 | in: list( 208 | section(1, 2, "PRIVATE DOMAINS", 209 | suffixes(2, 3, noInfo, 210 | suffix(3, "foo.com"), 211 | suffix(4, "bar.com"), 212 | suffix(5, "foo.com"), 213 | ), 214 | ), 215 | ), 216 | want: []error{ 217 | ErrDuplicateSuffix{"foo.com", suffix(5, "foo.com"), suffix(3, "foo.com")}, 218 | }, 219 | }, 220 | 221 | { 222 | name: "dupe_wildcards", 223 | in: list( 224 | section(1, 2, "PRIVATE DOMAINS", 225 | suffixes(2, 3, noInfo, 226 | wildcard(3, 4, "foo.com"), 227 | suffix(4, "bar.com"), 228 | wildcard(5, 6, "foo.com"), 229 | ), 230 | ), 231 | ), 232 | want: []error{ 233 | ErrDuplicateSuffix{"*.foo.com", wildcard(5, 6, "foo.com"), wildcard(3, 4, "foo.com")}, 234 | }, 235 | }, 236 | 237 | { 238 | name: "dupe_wildcard_exceptions", 239 | in: list( 240 | section(1, 2, "PRIVATE DOMAINS", 241 | suffixes(2, 3, noInfo, 242 | wildcard(3, 4, "foo.com", "a", "b", "c", "a"), 243 | suffix(4, "bar.com"), 244 | suffix(5, "b.foo.com"), 245 | ), 246 | ), 247 | ), 248 | want: []error{ 249 | ErrConflictingSuffixAndException{ 250 | Suffix: suffix(5, "b.foo.com"), 251 | Wildcard: wildcard(3, 4, "foo.com", "a", "b", "c", "a"), 252 | }, 253 | }, 254 | }, 255 | 256 | { 257 | name: "dupe_spanning_blocks_and_sections", 258 | in: list( 259 | section(1, 2, "PRIVATE DOMAINS", 260 | suffixes(2, 3, noInfo, 261 | suffix(3, "foo.com"), 262 | suffix(4, "bar.com"), 263 | ), 264 | suffixes(5, 6, noInfo, 265 | suffix(6, "foo.com"), 266 | ), 267 | ), 268 | section(7, 8, "ICANN DOMAINS", 269 | suffixes(8, 9, noInfo, 270 | suffix(9, "qux.com"), 271 | suffix(10, "foo.com"), 272 | ), 273 | ), 274 | ), 275 | want: []error{ 276 | ErrDuplicateSuffix{"foo.com", suffix(6, "foo.com"), suffix(3, "foo.com")}, 277 | ErrDuplicateSuffix{"foo.com", suffix(10, "foo.com"), suffix(3, "foo.com")}, 278 | }, 279 | }, 280 | } 281 | 282 | for _, tc := range tests { 283 | t.Run(tc.name, func(t *testing.T) { 284 | got := validateSuffixUniqueness(tc.in) 285 | checkDiff(t, "validateSuffixUniqueness", got, tc.want) 286 | }) 287 | } 288 | } 289 | -------------------------------------------------------------------------------- /tools/internal/parser/metadata.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "net/mail" 5 | "net/url" 6 | "strings" 7 | ) 8 | 9 | // extractMaintainerInfo extracts structured maintainer metadata from 10 | // comment. 11 | func extractMaintainerInfo(comment *Comment) MaintainerInfo { 12 | if comment == nil || len(comment.Text) == 0 { 13 | return MaintainerInfo{MachineEditable: true} 14 | } 15 | 16 | var ( 17 | ret = MaintainerInfo{ 18 | MachineEditable: true, 19 | } 20 | lines = comment.Text 21 | firstUnusableLine = -1 22 | ) 23 | 24 | // The first line of metadata usually follows a standard 25 | // form. Handle that first, then scan through the rest of the 26 | // comment to find any further stuff. 27 | name, siteURL, email, ok := splitNameish(lines[0]) 28 | if ok { 29 | ret.Name = name 30 | if siteURL != nil { 31 | ret.URLs = append(ret.URLs, siteURL) 32 | } 33 | if email != nil { 34 | ret.Maintainers = append(ret.Maintainers, email) 35 | } 36 | lines = lines[1:] 37 | } 38 | 39 | // Aside from the special first line, remaining lines could be 40 | // maintainer emails in a few formats, or URLs, or something 41 | // else. 
We accumulate everything we can parse, but also keep
42 | // track of whether the information is laid out such that we could
43 | // write the information back out without data loss (although not
44 | // necessarily in the exact same format).
45 | for i, line := range lines {
46 | lineUsed := false
47 | if emails := getSubmitters(line); len(emails) > 0 {
48 | ret.Maintainers = append(ret.Maintainers, emails...)
49 | lineUsed = true
50 | } else if email, err := mail.ParseAddress(line); err == nil {
51 | ret.Maintainers = append(ret.Maintainers, email)
52 | lineUsed = true
53 | } else if u := getURL(line); u != nil {
54 | ret.URLs = append(ret.URLs, u)
55 | lineUsed = true
56 | } else if i == 0 && ret.Name == "" {
57 | ret.Name = line
58 | lineUsed = true
59 | } else {
60 | ret.Other = append(ret.Other, line)
61 | if firstUnusableLine < 0 {
62 | firstUnusableLine = i + 1
63 | }
64 | }
65 |
66 | if lineUsed && firstUnusableLine >= 0 {
67 | // A parseable line came after non-parseable lines, so we cannot
68 | // confidently write the data back out without data loss.
69 | ret.MachineEditable = false
70 | }
71 | }
72 |
73 | return ret
74 | }
75 |
76 | // submittedBy is the conventional text that precedes email contact
77 | // information in a PSL file. Most PSL entries say "Submitted by", but
78 | // there are 4 entries that are lowercase, and so we do a
79 | // case-insensitive comparison when looking for this marker.
80 | const submittedBy = "submitted by"
81 |
82 | // splitNameish tries to parse line in the form:
83 | //
84 | // "<name>: <URL or submitter email>"
85 | //
86 | // It returns the information it was able to extract. Returns all zero
87 | // values if line does not conform to the expected form.
88 | //
89 | // As of 2024-06, a few legacy representations are also handled to
90 | // improve compatibility with the existing PSL data:
91 | //
92 | // - "<name> (<URL>)", where the URL is sometimes allowed to
93 | // omit https://.
94 | // - "<name>: Submitted by <contact>", where the second
95 | // part is any variant accepted by getSubmitters.
96 | // - Any amount of whitespace on either side of the colon (or
97 | // fullwidth colon).
98 | func splitNameish(line string) (name string, url *url.URL, submitter *mail.Address, ok bool) {
99 | if strings.HasPrefix(strings.ToLower(line), submittedBy) {
100 | // submitted-by lines are handled separately elsewhere, and
101 | // can be misinterpreted as entity names.
102 | return "", nil, nil, false
103 | }
104 |
105 | // Some older entries are of the form "entity name (url)".
106 | if strings.HasSuffix(line, ")") {
107 | if name, url, ok := splitNameAndURLInParens(line); ok {
108 | return name, url, nil, true
109 | }
110 | }
111 |
112 | name, rest, ok := strings.Cut(line, ":")
113 | if !ok {
114 | return "", nil, nil, false
115 | }
116 |
117 | // Clean up whitespace on either side of the colon.
118 | name = strings.TrimSpace(name)
119 | rest = strings.TrimSpace(rest)
120 |
121 | if u := getURL(rest); u != nil {
122 | return name, u, nil, true
123 | } else if emails := getSubmitters(rest); len(emails) == 1 {
124 | return name, nil, emails[0], true
125 | }
126 | return "", nil, nil, false
127 | }
128 |
129 | // splitNameAndURLInParens tries to parse line in the form:
130 | //
131 | // "<name> (<URL>)"
132 | //
133 | // It returns the information it was able to extract, or ok=false if
134 | // the line is not in the expected form.
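// (Illustrative input: "Example Ltd (https://example.com)" would
// yield name "Example Ltd" and the parsed URL.)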
135 | func splitNameAndURLInParens(line string) (name string, url *url.URL, ok bool) {
136 | idx := strings.LastIndexByte(line, '(')
137 | if idx == -1 {
138 | return "", nil, false
139 | }
140 | name = strings.TrimSpace(line[:idx])
141 | urlStr := strings.TrimSpace(line[idx+1 : len(line)-1])
142 |
143 | if u := getURL(urlStr); u != nil {
144 | return name, u, true
145 | }
146 |
147 | return "", nil, false
148 | }
149 |
150 | // getURL tries to parse line as an HTTP/HTTPS URL.
151 | // Returns the URL if line is a well formed URL and nothing but a URL,
152 | // or nil otherwise.
153 | func getURL(line string) *url.URL {
154 | // One PSL entry says "see <URL>" instead of just a URL.
155 | //
156 | // TODO: fix the source and delete this hack.
157 | if strings.HasPrefix(line, "see https://www.information.aero") {
158 | line = strings.TrimPrefix(line, "see ")
159 | }
160 |
161 | u, err := url.Parse(line)
162 | if err != nil {
163 | return nil
164 | }
165 |
166 | if u.Scheme != "http" && u.Scheme != "https" {
167 | // Caller might have split https://foo.com into [https :
168 | // //foo.com], and the last part is a valid scheme-relative
169 | // URL. Only accept parses that feature an explicit http(s)
170 | // scheme.
171 | return nil
172 | }
173 |
174 | return u
175 | }
176 |
177 | // getSubmitters tries to parse line as a submitter email line, usually:
178 | //
179 | // Submitted by Person Name <person@example.com>
180 | //
181 | // To improve compatibility, a few legacy freeform styles are also
182 | // attempted if the one above fails.
183 | //
184 | // Returns the parsed RFC 5322 addresses, or nil if line does not
185 | // conform to the expected shape.
186 | func getSubmitters(line string) []*mail.Address {
187 | if strings.HasPrefix(strings.ToLower(line), submittedBy) {
188 | line = line[len(submittedBy):]
189 | }
190 | // Some entries read "Submitted by: ..." with an extra colon.
191 | line = strings.TrimLeft(line, ":")
192 | line = strings.TrimSpace(line)
193 | // Some ICANN domains lead with "Submitted by registry".
194 | line = strings.TrimPrefix(line, "registry ")
195 |
196 | var ret []*mail.Address
197 | emailStrs := strings.Split(line, " and ")
198 |
199 | fullyParsed := true
200 | for _, emailStr := range emailStrs {
201 | addr, err := mail.ParseAddress(emailStr)
202 | if err != nil {
203 | fullyParsed = false
204 | continue
205 | }
206 | ret = append(ret, addr)
207 | }
208 |
209 | if fullyParsed {
210 | // Found a way to consume the entire input, we're done.
211 | return ret
212 | }
213 |
214 | // One current entry uses old school email obfuscation to foil
215 | // spam bots, which makes it an invalid address.
216 | //
217 | // TODO: fix the source and delete this hack.
218 | if strings.Contains(line, "lohmus dot me") {
219 | cleaned := strings.Replace(line, " at ", "@", 1)
220 | cleaned = strings.Replace(cleaned, " dot ", ".", 1)
221 | if addr, err := mail.ParseAddress(cleaned); err == nil {
222 | return []*mail.Address{addr}
223 | }
224 | }
225 |
226 | // The normal form failed but there is a "submitted by". If the
227 | // last word is an email address, assume the remainder is a name.
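// (Illustrative input: "Foo Bar foo@example.com" yields the address
// foo@example.com with display name "Foo Bar".)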
228 | fs := strings.Fields(line) 229 | if len(fs) > 0 { 230 | if addr, err := mail.ParseAddress(fs[len(fs)-1]); err == nil { 231 | name := strings.Join(fs[:len(fs)-1], " ") 232 | name = strings.Trim(name, " ,:") 233 | addr.Name = name 234 | return []*mail.Address{addr} 235 | } 236 | } 237 | 238 | return nil 239 | } 240 | -------------------------------------------------------------------------------- /linter/pslint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*-# 3 | # 4 | # PSL linter written in python 5 | # 6 | # Copyright 2016 Tim Rühsen (tim dot ruehsen at gmx dot de). All rights reserved. 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a 9 | # copy of this software and associated documentation files (the "Software"), 10 | # to deal in the Software without restriction, including without limitation 11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | # and/or sell copies of the Software, and to permit persons to whom the 13 | # Software is furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 
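# Sketch of the checks implemented below in lint_psl(): section
# begin/end markers, rules outside of sections, valid UTF-8, NFKC
# normalization, lowercasing, punycode and '--' labels, allowed
# characters, wildcard/exception combinations, and duplicate rules.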
25 |
26 | import sys
27 | import codecs
28 | import unicodedata
29 |
30 | nline = 0
31 | line = ""
32 | orig_line = ""
33 | warnings = 0
34 | errors = 0
35 | skip_order_check = False
36 |
37 | def warning(msg):
38 | global warnings, orig_line, nline
39 | print('%d: warning: %s%s' % (nline, msg, ": \'" + orig_line + "\'" if orig_line else ""))
40 | warnings += 1
41 |
42 | def error(msg):
43 | global errors, orig_line, nline
44 | print('%d: error: %s%s' % (nline, msg, ": \'" + orig_line + "\'" if orig_line else ""))
45 | errors += 1
46 | # skip_order_check = True
47 |
48 | def print_psl(list):
49 | for domain in list:
50 | print(".".join(str(label) for label in reversed(domain)))
51 |
52 | def psl_key(s):
53 | if s[0] == '*':
54 | return 0
55 | if s[0] == '!':
56 | return 1
57 | return 2
58 |
59 | def check_order(group):
60 | """Check the correct order of a domain group"""
61 | global skip_order_check
62 |
63 | try:
64 | if skip_order_check or len(group) < 2:
65 | skip_order_check = False
66 | return
67 |
68 | # check if the TLD is identical within the group
69 | if any(group[0][0] != labels[0] for labels in group):
70 | warning('Domain group TLD is not consistent')
71 |
72 | # sort by # of labels, label-by-label (labels are in reversed order)
73 | sorted_group = sorted(group, key = lambda labels: (len(labels), psl_key(labels[-1][0]), labels))
74 |
75 | if group != sorted_group:
76 | warning('Incorrectly sorted group of domains')
77 | print(" " + str(group))
78 | print(" " + str(sorted_group))
79 | print("Correct sorting would be:")
80 | print_psl(sorted_group)
81 |
82 | finally:
83 | del group[:]
84 |
85 |
86 | def lint_psl(infile):
87 | """Parses PSL file and performs syntax checking"""
88 | global orig_line, nline
89 |
90 | PSL_FLAG_EXCEPTION = (1<<0)
91 | PSL_FLAG_WILDCARD = (1<<1)
92 | PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
93 | PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
94 | PSL_FLAG_PLAIN = (1<<4) # just used for PSL syntax checking
95 |
96 | line2number = {}
97 | line2flag = {}
98 | group = []
99 | section = 0
100 | icann_sections = 0
101 | private_sections = 0
102 |
103 | lines = [line.strip('\n') for line in infile]
104 |
105 | for line in lines:
106 | nline += 1
107 |
108 | # check for leading/trailing whitespace
109 | stripped = line.strip()
110 | if stripped != line:
111 | line = line.replace('\t','\\t')
112 | line = line.replace('\r','^M')
113 | orig_line = line
114 | warning('Leading/Trailing whitespace')
115 | orig_line = line
116 | line = stripped
117 |
118 | # empty line (end of sorted domain group)
119 | if not line:
120 | # check_order(group)
121 | continue
122 |
123 | # check for section begin/end
124 | if line[0:2] == "//":
125 | # check_order(group)
126 |
127 | if section == 0:
128 | if line == "// ===BEGIN ICANN DOMAINS===":
129 | section = PSL_FLAG_ICANN
130 | icann_sections += 1
131 | elif line == "// ===BEGIN PRIVATE DOMAINS===":
132 | section = PSL_FLAG_PRIVATE
133 | private_sections += 1
134 | elif line[3:11] == "===BEGIN":
135 | error('Unexpected begin of unknown section')
136 | elif line[3:9] == "===END":
137 | error('End of section without previous begin')
138 | elif section == PSL_FLAG_ICANN:
139 | if line == "// ===END ICANN DOMAINS===":
140 | section = 0
141 | elif line[3:11] == "===BEGIN":
142 | error('Unexpected begin of section')
143 | elif line[3:9] == "===END":
144 | error('Unexpected end of section')
145 | elif section == PSL_FLAG_PRIVATE:
146 | if line == "// ===END PRIVATE DOMAINS===":
147 | section = 0
148 | elif line[3:11]
== "===BEGIN": 149 | error('Unexpected begin of section') 150 | elif line[3:9] == "===END": 151 | error('Unexpected end of section') 152 | 153 | continue # processing of comments ends here 154 | 155 | # No rule must be outside of a section 156 | if section == 0: 157 | error('Rule outside of section') 158 | 159 | group.append(list(reversed(line.split('.')))) 160 | 161 | # decode UTF-8 input into unicode, needed only for python 2.x 162 | try: 163 | if sys.version_info[0] < 3: 164 | line = line.decode('utf-8') 165 | else: 166 | line.encode('utf-8') 167 | except (UnicodeDecodeError, UnicodeEncodeError): 168 | orig_line = None 169 | error('Invalid UTF-8 character') 170 | continue 171 | 172 | # rules must be NFC coded (Unicode's Normal Form Kanonical Composition) 173 | if unicodedata.normalize("NFKC", line) != line: 174 | error('Rule must be NFKC') 175 | 176 | # each rule must be lowercase (or more exactly: not uppercase and not titlecase) 177 | if line != line.lower(): 178 | error('Rule must be lowercase') 179 | 180 | # strip leading wildcards 181 | flags = section 182 | # while line[0:2] == '*.': 183 | if line[0:2] == '*.': 184 | flags |= PSL_FLAG_WILDCARD 185 | line = line[2:] 186 | 187 | if line[0] == '!': 188 | flags |= PSL_FLAG_EXCEPTION 189 | line = line[1:] 190 | else: 191 | flags |= PSL_FLAG_PLAIN 192 | 193 | # wildcard and exception must not combine 194 | if flags & PSL_FLAG_WILDCARD and flags & PSL_FLAG_EXCEPTION: 195 | error('Combination of wildcard and exception') 196 | continue 197 | 198 | labels = line.split('.') 199 | 200 | if flags & PSL_FLAG_EXCEPTION and len(labels) > 1: 201 | domain = ".".join(str(label) for label in labels[1:]) 202 | if not domain in line2flag: 203 | error('Exception without previous wildcard') 204 | elif not line2flag[domain] & PSL_FLAG_WILDCARD: 205 | error('Exception without previous wildcard') 206 | 207 | for label in labels: 208 | if not label: 209 | error('Leading/trailing or multiple dot') 210 | continue 211 | 212 | if label[0:4] == 'xn--': 213 | error('Punycode found') 214 | continue 215 | 216 | if '--' in label: 217 | error('Double minus found') 218 | continue 219 | 220 | # allowed are a-z,0-9,- and unicode >= 128 (maybe that can be finetuned a bit !?) 
221 | for c in label:
222 | if not c.isalnum() and c != '-' and ord(c) < 128:
223 | error('Illegal character')
224 | break
225 |
226 | if line in line2flag:
227 | '''Found existing entry:
228 | Combination of exception and plain rule is contradictory
229 | !foo.bar + foo.bar
230 | Doublette, since *.foo.bar implies foo.bar:
231 | foo.bar + *.foo.bar
232 | Allowed:
233 | !foo.bar + *.foo.bar
234 | '''
235 | error('Found doublette/ambiguity (previous line was %d)' % line2number[line])
236 |
237 | line2number[line] = nline
238 | line2flag[line] = flags
239 |
240 | orig_line = None
241 |
242 | if section == PSL_FLAG_ICANN:
243 | error('ICANN section not closed')
244 | elif section == PSL_FLAG_PRIVATE:
245 | error('PRIVATE section not closed')
246 |
247 | if icann_sections < 1:
248 | warning('No ICANN section found')
249 | elif icann_sections > 1:
250 | warning('%d ICANN sections found' % icann_sections)
251 |
252 | if private_sections < 1:
253 | warning('No PRIVATE section found')
254 | elif private_sections > 1:
255 | warning('%d PRIVATE sections found' % private_sections)
256 |
257 | def usage():
258 | """Prints the usage"""
259 | print('usage: %s PSLfile' % sys.argv[0])
260 | print('or %s - # To read PSL from STDIN' % sys.argv[0])
261 | exit(1)
262 |
263 |
264 | def main():
265 | """Check syntax of a PSL file"""
266 | if len(sys.argv) < 2:
267 | usage()
268 |
269 | with sys.stdin if sys.argv[-1] == '-' else open(sys.argv[-1], 'r', encoding='utf-8', errors="surrogateescape") as infile:
270 | lint_psl(infile)
271 |
272 | return errors != 0
273 |
274 |
275 | if __name__ == '__main__':
276 | sys.exit(main())
277 | -------------------------------------------------------------------------------- /tools/psltool/psltool.go: --------------------------------------------------------------------------------
1 | // psltool is a CLI tool to manipulate and validate PSL files.
2 | package main
3 |
4 | import (
5 | "bytes"
6 | "context"
7 | "errors"
8 | "fmt"
9 | "io"
10 | "log"
11 | "os"
12 | "os/signal"
13 | "path/filepath"
14 | "strconv"
15 | "strings"
16 | "syscall"
17 | "time"
18 | "unicode"
19 |
20 | "github.com/creachadair/command"
21 | "github.com/creachadair/flax"
22 | "github.com/creachadair/mds/mdiff"
23 | "github.com/natefinch/atomic"
24 | "github.com/publicsuffix/list/tools/internal/githistory"
25 | "github.com/publicsuffix/list/tools/internal/github"
26 | "github.com/publicsuffix/list/tools/internal/parser"
27 | )
28 |
29 | func main() {
30 | log.SetFlags(0)
31 |
32 | root := &command.C{
33 | Name: filepath.Base(os.Args[0]),
34 | Usage: "command [flags] ...\nhelp [command]",
35 | Help: "A command-line tool to edit and validate PSL files.",
36 | Commands: []*command.C{
37 | {
38 | Name: "fmt",
39 | Usage: "<path>",
40 | Help: `Format a PSL file.
41 |
42 | By default, the given file is updated in place.`,
43 | SetFlags: command.Flags(flax.MustBind, &fmtArgs),
44 | Run: command.Adapt(runFmt),
45 | },
46 | {
47 | Name: "validate",
48 | Usage: "<path or git hash>",
49 | Help: `Check that a file is a valid PSL file.
50 |
51 | Validation includes basic issues like parse errors, as well as
52 | conformance with the PSL project's style rules and policies.
53 |
54 | The argument can be either a local file, or a git commit hash to fetch
55 | from https://github.com/publicsuffix/list.`,
56 | SetFlags: command.Flags(flax.MustBind, &validateArgs),
57 | Run: command.Adapt(runValidate),
58 | },
59 | {
60 | Name: "check-pr",
61 | Usage: "<pr number>",
62 | Help: `Validate an open PR on GitHub.
63 |
64 | Validation includes basic issues like parse errors, as well as
65 | conformance with the PSL project's style rules and policies.`,
66 | SetFlags: command.Flags(flax.MustBind, &checkPRArgs),
67 | Run: command.Adapt(runCheckPR),
68 | },
69 | {
70 | Name: "debug",
71 | Commands: []*command.C{
72 | {
73 | Name: "dump",
74 | Usage: "<path>",
75 | Help: "Print a debug dump of a PSL file.",
76 | SetFlags: command.Flags(flax.MustBind, &debugDumpArgs),
77 | Run: command.Adapt(runDebugDump),
78 | },
79 | },
80 | },
81 |
82 | command.HelpCommand(nil),
83 | command.VersionCommand(),
84 | },
85 | }
86 |
87 | ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
88 | defer cancel()
89 | env := root.NewEnv(nil).SetContext(ctx).MergeFlags(true)
90 | command.RunOrFail(env, os.Args[1:])
91 | }
92 |
93 | var fmtArgs struct {
94 | Diff bool `flag:"d,Output a diff of changes instead of rewriting the file"`
95 | }
96 |
97 | func runFmt(env *command.Env, path string) error {
98 | bs, err := os.ReadFile(path)
99 | if err != nil {
100 | return fmt.Errorf("Failed to read PSL file: %w", err)
101 | }
102 |
103 | psl, parseErrs := parser.Parse(bs)
104 | fmtErrs := psl.Clean()
105 |
106 | for _, err := range parseErrs {
107 | fmt.Fprintln(env, err)
108 | }
109 | for _, err := range fmtErrs {
110 | fmt.Fprintln(env, err)
111 | }
112 |
113 | clean := psl.MarshalPSL()
114 | changed := !bytes.Equal(bs, clean)
115 |
116 | if changed {
117 | if fmtArgs.Diff {
118 | lhs, rhs := strings.Split(string(bs), "\n"), strings.Split(string(clean), "\n")
119 | diff := mdiff.New(lhs, rhs).AddContext(3)
120 | mdiff.FormatUnified(os.Stdout, diff, &mdiff.FileInfo{
121 | Left: "a/" + path,
122 | Right: "b/" + path,
123 | })
124 | return errors.New("File needs reformatting, rerun without -d to fix")
125 | }
126 | if len(parseErrs) > 0 {
127 | return errors.New("Cannot reformat file due to parse errors")
128 | }
129 | if err := atomic.WriteFile(path, bytes.NewReader(clean)); err != nil {
130 | return fmt.Errorf("Failed to reformat: %w", err)
131 | }
132 | }
133 |
134 | return nil
135 | }
136 |
137 | var validateArgs struct {
138 | Owner string `flag:"gh-owner,default=publicsuffix,Owner of the github repository to check"`
139 | Repo string `flag:"gh-repo,default=list,Github repository to check"`
140 | Clone string `flag:"gh-local-clone,Path to a local clone of the repository specified by gh-owner/gh-repo"`
141 | Online bool `flag:"online-checks,Run validations that require querying third-party servers"`
142 | }
143 |
144 | func isHex(s string) bool {
145 | for _, r := range s {
146 | if !unicode.In(r, unicode.ASCII_Hex_Digit) {
147 | return false
148 | }
149 | }
150 | return true
151 | }
152 |
153 | func runValidate(env *command.Env, pathOrHash string) error {
154 | var bs []byte
155 | var err error
156 |
157 | client := github.Repo{
158 | Owner: validateArgs.Owner,
159 | Repo: validateArgs.Repo,
160 | }
161 |
162 | isPath := false
163 | if _, err = os.Stat(pathOrHash); err == nil {
164 | // input is a local file
165 | isPath = true
166 | bs, err = os.ReadFile(pathOrHash)
167 | } else if isHex(pathOrHash) {
168 | // input looks like a git hash
169 | bs, err = client.PSLForHash(context.Background(), pathOrHash)
170 | } else {
171 | return fmt.Errorf("Failed to read PSL file %q, not a local file or a git commit hash", pathOrHash)
172 | }
173 | if err != nil {
174 | return fmt.Errorf("Failed to read PSL file %q: %w", pathOrHash, err)
175 | }
176 |
177 | psl, errs := parser.Parse(bs)
178 | errs = append(errs,
psl.Clean()...)
179 | errs = append(errs, parser.ValidateOffline(psl)...)
180 | if validateArgs.Online {
181 | if validateArgs.Clone == "" && isPath {
182 | // Assume the PSL file being validated might be in a git
183 | // clone, and try to use that as the reference for history.
184 | validateArgs.Clone = filepath.Dir(pathOrHash)
185 | }
186 | if validateArgs.Clone == "" {
187 | return errors.New("--gh-local-clone is required for full validation")
188 | }
189 | prHistory, err := githistory.GetPRInfo(validateArgs.Clone)
190 | if err != nil {
191 | return fmt.Errorf("failed to get local PR history, refusing to run full validation to avoid Github DoS: %w", err)
192 | }
193 |
194 | ctx, cancel := context.WithTimeout(env.Context(), 1200*time.Second)
195 | defer cancel()
196 | errs = append(errs, parser.ValidateOnline(ctx, psl, &client, prHistory)...)
197 | }
198 |
199 | clean := psl.MarshalPSL()
200 | if !bytes.Equal(bs, clean) {
201 | errs = append(errs, errors.New("file needs reformatting, run 'psltool fmt' to fix"))
202 | }
203 |
204 | for _, err := range errs {
205 | fmt.Fprintln(env, err)
206 | }
207 |
208 | if l := len(errs); l == 0 {
209 | fmt.Fprintln(env, "PSL file is valid")
210 | return nil
211 | } else if l == 1 {
212 | return errors.New("file has 1 error")
213 | } else {
214 | return fmt.Errorf("file has %d errors", l)
215 | }
216 | }
217 |
218 | var checkPRArgs struct {
219 | Owner string `flag:"gh-owner,default=publicsuffix,Owner of the github repository to check"`
220 | Repo string `flag:"gh-repo,default=list,Github repository to check"`
221 | Clone string `flag:"gh-local-clone,Path to a local clone of the repository specified by gh-owner/gh-repo"`
222 | Online bool `flag:"online-checks,Run validations that require querying third-party servers"`
223 | }
224 |
225 | func runCheckPR(env *command.Env, prStr string) error {
226 | pr, err := strconv.Atoi(prStr)
227 | if err != nil {
228 | return fmt.Errorf("invalid PR number %q: %w", prStr, err)
229 | }
230 |
231 | client := github.Repo{
232 | Owner: checkPRArgs.Owner,
233 | Repo: checkPRArgs.Repo,
234 | }
235 | withoutPR, withPR, err := client.PSLForPullRequest(env.Context(), pr)
236 | if err != nil {
237 | return err
238 | }
239 |
240 | before, _ := parser.Parse(withoutPR)
241 | after, errs := parser.Parse(withPR)
242 | after.SetBaseVersion(before, true)
243 | errs = append(errs, after.Clean()...)
244 | errs = append(errs, parser.ValidateOffline(after)...)
245 | if checkPRArgs.Online {
246 | var prHistory *githistory.History
247 | if checkPRArgs.Clone != "" {
248 | prHistory, err = githistory.GetPRInfo(checkPRArgs.Clone)
249 | if err != nil {
250 | return fmt.Errorf("failed to get local PR history: %w", err)
251 | }
252 | }
253 |
254 | ctx, cancel := context.WithTimeout(env.Context(), 300*time.Second)
255 | defer cancel()
256 | errs = append(errs, parser.ValidateOnline(ctx, after, &client, prHistory)...)
257 | }
258 |
259 | clean := after.MarshalPSL()
260 | if !bytes.Equal(withPR, clean) {
261 | errs = append(errs, errors.New("file needs reformatting, run 'psltool fmt' to fix"))
262 | }
263 |
264 | // Print the blocks marked changed, so a human can check that
265 | // something was actually checked by validations.
266 | var changed []*parser.Suffixes
267 | for _, block := range parser.BlocksOfType[*parser.Suffixes](after) {
268 | if block.Changed() {
269 | changed = append(changed, block)
270 | }
271 | }
272 | if len(changed) == 0 {
273 | fmt.Fprintln(env, "No suffix blocks changed.
This can happen if only top-level comments have been edited.")
274 | } else {
275 | fmt.Fprintln(env, "Checked the following changed suffix blocks:")
276 | for _, block := range changed {
277 | fmt.Fprintf(env, " %q (%s)\n", block.Info.Name, block.LocationString())
278 | }
279 | }
280 | io.WriteString(env, "\n")
281 |
282 | if len(errs) > 0 {
283 | for _, err := range errs {
284 | fmt.Fprintln(env, err)
285 | }
286 | io.WriteString(env, "\n")
287 | }
288 |
289 | if l := len(errs); l == 0 {
290 | fmt.Fprintln(env, "PSL change is valid")
291 | return nil
292 | } else if l == 1 {
293 | return errors.New("change has 1 error")
294 | } else {
295 | return fmt.Errorf("change has %d errors", l)
296 | }
297 | }
298 |
299 | var debugDumpArgs struct {
300 | Clean bool `flag:"c,Clean AST before dumping"`
301 | Format string `flag:"f,default=ast,Format to dump in, one of 'ast' or 'psl'"`
302 | }
303 |
304 | func runDebugDump(env *command.Env, path string) error {
305 | var dumpFn func(*parser.List) []byte
306 | switch debugDumpArgs.Format {
307 | case "ast":
308 | dumpFn = (*parser.List).MarshalDebug
309 | case "psl":
310 | dumpFn = (*parser.List).MarshalPSL
311 | default:
312 | return fmt.Errorf("unknown dump format %q", debugDumpArgs.Format)
313 | }
314 |
315 | bs, err := os.ReadFile(path)
316 | if err != nil {
317 | return fmt.Errorf("failed to read PSL file: %w", err)
318 | }
319 |
320 | psl, errs := parser.Parse(bs)
321 |
322 | if debugDumpArgs.Clean {
323 | errs = append(errs, psl.Clean()...)
324 | }
325 |
326 | for _, err := range errs {
327 | fmt.Fprintln(env, err)
328 | }
329 |
330 | bs = dumpFn(psl)
331 | os.Stdout.Write(bs)
332 | return nil
333 | }
334 | -------------------------------------------------------------------------------- /tools/internal/domain/domain_test.go: --------------------------------------------------------------------------------
1 | package domain_test
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "os"
7 | "slices"
8 | "strconv"
9 | "strings"
10 | "testing"
11 | "unicode/utf8"
12 |
13 | "github.com/publicsuffix/list/tools/internal/domain"
14 | "golang.org/x/net/idna"
15 | )
16 |
17 | //go:generate go run update_idna_testdata.go
18 |
19 | func TestParse(t *testing.T) {
20 | // This test is using the official Unicode IDNA test vectors, to
21 | // verify that domain.Parse is processing inputs exactly as
22 | // Unicode TR46 specifies. This is mostly a test of the behavior
23 | // of the underlying x/net/idna, but given the importance of
24 | // correctly validating public suffixes, we explicitly verify that
25 | // x/net/idna behaves correctly, and that our wrapper code
26 | // doesn't do anything surprising.
27 |
28 | numVectors := forEachIDNATestVector(t, func(input, want string, wantErr bool) {
29 | // PSL style deviates slightly from pure IDNA style, by
30 | // removing trailing dots if present. The removal is silent
31 | // because it doesn't affect the meaning of suffixes, but that
32 | // means the following tests have to allow for missing dots.
33 | //
34 | // Fortunately this adjustment does not break any of the IDNA
35 | // test vectors.
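// For example, a vector whose expected output is "example.com."
// is checked against domain.Parse output "example.com".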
36 | wantNoTrailingDot := strings.TrimSuffix(want, ".")
37 |
38 | got, err := domain.Parse(input)
39 | gotErr := err != nil
40 | if gotErr != wantErr {
41 | t.Errorf("domain.Parse(%q) gotErr=%v, want %v", input, gotErr, wantErr)
42 | if err != nil {
43 | t.Logf("parse error was: %v", err)
44 | }
45 | }
46 |
47 | if err == nil && got.String() != wantNoTrailingDot {
48 | t.Errorf("domain.Parse(%q) = %q, want %q", input, got.String(), wantNoTrailingDot)
49 | }
50 |
51 | // Further tests only make sense on successful parses.
52 | if wantErr {
53 | return
54 | }
55 |
56 | // Domain parse succeeded, domain.ParseLabel of each label
57 | // must also succeed.
58 | //
59 | // We only do this for test vectors that don't return an
60 | // error, which means 'want' is in canonical form and '.' is
61 | // the only label separator character.
62 | var gotLabels []domain.Label
63 | for _, labelStr := range strings.Split(wantNoTrailingDot, ".") {
64 | label, err := domain.ParseLabel(labelStr)
65 | if err != nil {
66 | t.Errorf("domain.ParseLabel(%q) got err: %v", labelStr, err)
67 | } else {
68 | gotLabels = append(gotLabels, label)
69 | }
70 | }
71 |
72 | if wantLabels := got.Labels(); !slices.EqualFunc(gotLabels, wantLabels, domain.Label.Equal) {
73 | t.Error("domain.ParseLabel() of each label is not equivalent to ParseDomain().Labels()")
74 | t.Logf("domain.ParseLabel() : %#v", gotLabels)
75 | t.Logf("domain.Parse().Labels(): %#v", wantLabels)
76 | }
77 |
78 | // ParseLabel must refuse to parse entire domains
79 | if got.NumLabels() > 1 {
80 | if gotLabel, err := domain.ParseLabel(input); err == nil {
81 | t.Errorf("domain.ParseLabel(%q) got %q, want parse error", input, gotLabel)
82 | }
83 | }
84 |
85 | // Domain and label comparisons are reflexive.
86 | if gotCmp := got.Compare(got); gotCmp != 0 {
87 | t.Errorf("Name.Compare(%q, %q) = %d, want 0", got, got, gotCmp)
88 | }
89 | for _, label := range gotLabels {
90 | if gotCmp := label.Compare(label); gotCmp != 0 {
91 | t.Errorf("Label.Compare(%q, %q) = %d, want 0", label, label, gotCmp)
92 | }
93 | }
94 | })
95 | t.Logf("checked %d test vectors", numVectors)
96 |
97 | // Sanity check to make sure the parser didn't just silently skip
98 | // all test inputs. Manual inspection of the Unicode 15.0 test
99 | // file shows 6235 tests. We allow a small amount of reduction
100 | // because tests occasionally get removed (e.g. Unicode 15.1
101 | // removes some vectors relating to deprecated special handling
102 | // of "ß" in case mapping).
103 | const minVectors = 6200
104 | if numVectors < minVectors {
105 | t.Errorf("found %d test vectors, want at least %d", numVectors, minVectors)
106 | }
107 | }
108 |
109 | // forEachIDNATestVector parses testdata/idna_test_vectors.txt and
110 | // calls fn in a subtest for each test vector. It returns the number
111 | // of test vectors found in the file.
112 | func forEachIDNATestVector(t *testing.T, fn func(input, want string, wantErr bool)) (numVectorsFound int) {
113 | t.Helper()
114 |
115 | const testfile = "testdata/idna_test_vectors.txt"
116 |
117 | // Process the file in 2 passes. This is less efficient; it's
118 | // possible to stream the test file and do all this in one pass,
119 | // but the result is less readable.
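// Pass 1 (below): find the Unicode version header and collect
// candidate vector lines. Pass 2: unescape each vector and run it
// as a subtest.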
120 | bs, err := os.ReadFile(testfile)
121 | if err != nil {
122 | t.Fatalf("reading IDNA test vectors: %v", err)
123 | }
124 | lines := strings.Split(string(bs), "\n")
125 |
126 | type testCase struct {
127 | line int
128 | raw string
129 | fields []string
130 | }
131 | var tests []testCase
132 | foundUnicodeVersion := false
133 | for i, ln := range lines {
134 | if ln == "" {
135 | continue
136 | }
137 |
138 | if unicodeVersion, ok := strings.CutPrefix(ln, "# Version: "); ok {
139 | if unicodeVersion != idna.UnicodeVersion {
140 | t.Fatalf("IDNA test file %q is for Unicode version %s, but x/net/idna uses version %s. Run 'go generate' to update the test file.", testfile, unicodeVersion, idna.UnicodeVersion)
141 | }
142 | foundUnicodeVersion = true
143 | continue
144 | }
145 |
146 | if strings.HasPrefix(ln, "#") {
147 | continue
148 | }
149 |
150 | fs := strings.Split(ln, "; ")
151 | if len(fs) != 7 {
152 | t.Fatalf("line %d: unrecognized test vector format: %s", i+1, ln)
153 | }
154 | tests = append(tests, testCase{i + 1, ln, fs})
155 | }
156 | if !foundUnicodeVersion {
157 | t.Fatalf("failed to determine Unicode version of test file, cannot proceed")
158 | }
159 |
160 | // Now we've collected all the test cases, prepare the inputs and
161 | // run the tests.
162 | for _, tc := range tests {
163 | input := tc.fields[0]
164 | want := tc.fields[1]
165 | wantErr := tc.fields[2] != ""
166 |
167 | // The input and want strings contain Unicode escape
168 | // sequences, so that the test can express precise invalid
169 | // inputs without risking accidental canonicalization by
170 | // editors and file readers. We have to carefully undo that
171 | // here, without making unwanted changes to the strings.
172 | input = unquoteVector(t, input)
173 | want = unquoteVector(t, want)
174 |
175 | // The test file format specifies that if the expected output
176 | // is the same as the input, they don't repeat it.
177 | if want == "" {
178 | want = input
179 | }
180 |
181 | t.Run(fmt.Sprintf("line_%d", tc.line), func(t *testing.T) {
182 | fn(input, want, wantErr)
183 | if t.Failed() {
184 | t.Logf("failing test vector: %s", tc.raw)
185 | }
186 | })
187 | }
188 |
189 | return len(tests)
190 | }
191 |
192 | // unquoteVector returns its input, with \uXXXX Unicode escape
193 | // sequences converted to the corresponding UTF-8 bytes.
194 | //
195 | // In theory we could use strconv.Unquote, but that function handles
196 | // more escape sequences than the IDNA test format specifies. Unquote
197 | // may also mangle strings that are not valid UTF-8 in surprising
198 | // ways, which could silently make tests check the wrong thing. To be
199 | // safe, we do the unquoting ourselves, so that we are in full control
200 | // of all mutations.
201 | func unquoteVector(t *testing.T, s string) string {
202 | t.Helper()
203 |
204 | bs := []byte(s)
205 | var out []byte
206 |
207 | for {
208 | start, rest, found := bytes.Cut(bs, []byte(`\u`))
209 | out = append(out, start...)
210 | if !found {
211 | // No more escapes, we're done.
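// (bytes.Cut reported found=false, so start held the entire
// remainder of the input and was appended above.)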
212 | break
213 | }
214 |
215 | // next 4 bytes are hex digits
216 | if len(rest) < 4 {
217 | t.Fatalf("malformed unicode escape sequence in %q", s)
218 | }
219 | hexStr := string(rest[:4])
220 | runeVal, err := strconv.ParseUint(hexStr, 16, 64)
221 | if err != nil {
222 | t.Fatalf("malformed unicode escape sequence in %q", s)
223 | }
224 | out = utf8.AppendRune(out, rune(runeVal))
225 |
226 | bs = rest[4:]
227 | }
228 |
229 | if !utf8.Valid(out) {
230 | t.Fatalf("string %q is invalid UTF-8 after unquote: %q", s, string(out))
231 | }
232 | return string(out)
233 | }
234 |
235 | func TestLabelCompare(t *testing.T) {
236 | tests := []struct {
237 | a, b string
238 | want int
239 | }{
240 | {"com", "com", 0},
241 | {"com", "org", -1},
242 | {"com", "aaa", +1},
243 | // Equivalent strings in NFC and NFD, ParseLabel should
244 | // canonicalize to equal.
245 | {"Québécois", "Que\u0301be\u0301cois", 0},
246 | // From the xn--o3cw4h block of the PSL.
247 | {"ทหาร", "ธุรกิจ", -1},
248 | {"ทหาร", "com", +1},
249 | }
250 |
251 | for _, tc := range tests {
252 | la, err := domain.ParseLabel(tc.a)
253 | if err != nil {
254 | t.Fatalf("ParseLabel(%q) failed: %v", tc.a, err)
255 | }
256 | lb, err := domain.ParseLabel(tc.b)
257 | if err != nil {
258 | t.Fatalf("ParseLabel(%q) failed: %v", tc.b, err)
259 | }
260 |
261 | gotCmp := domain.Label.Compare(la, lb)
262 | if gotCmp != tc.want {
263 | t.Errorf("Label.Compare(%q, %q) = %d, want %d", la, lb, gotCmp, tc.want)
264 | }
265 | wantEq := tc.want == 0
266 | if gotEq := domain.Label.Equal(la, lb); gotEq != wantEq {
267 | t.Errorf("Label.Equal(%q, %q) = %v, want %v", la, lb, gotEq, wantEq)
268 | }
269 |
270 | // Same again, but backwards.
271 | gotCmp = domain.Label.Compare(lb, la)
272 | if want := -tc.want; gotCmp != want {
273 | t.Errorf("Label.Compare(%q, %q) = %d, want %d", lb, la, gotCmp, want)
274 | }
275 | if gotEq := domain.Label.Equal(lb, la); gotEq != wantEq {
276 | t.Errorf("Label.Equal(%q, %q) = %v, want %v", lb, la, gotEq, wantEq)
277 | }
278 | }
279 | }
280 |
281 | func TestNameCompare(t *testing.T) {
282 | tests := []struct {
283 | a, b string
284 | want int
285 | }{
286 | {"foo.com", "foo.com.", 0},
287 | {"com", "org", -1},
288 | {"com", "aaa", +1},
289 | // Equivalent strings in NFC and NFD, Parse should
290 | // canonicalize to equal.
291 | {"Québécois", "Que\u0301be\u0301cois", 0},
292 | // From the xn--o3cw4h block of the PSL.
293 | {"ทหาร", "ธุรกิจ", -1},
294 | {"ทหาร", "com", +1},
295 | }
296 |
297 | for _, tc := range tests {
298 | da, err := domain.Parse(tc.a)
299 | if err != nil {
300 | t.Fatalf("Parse(%q) failed: %v", tc.a, err)
301 | }
302 | db, err := domain.Parse(tc.b)
303 | if err != nil {
304 | t.Fatalf("Parse(%q) failed: %v", tc.b, err)
305 | }
306 |
307 | gotCmp := domain.Name.Compare(da, db)
308 | if gotCmp != tc.want {
309 | t.Errorf("Name.Compare(%q, %q) = %d, want %d", da, db, gotCmp, tc.want)
310 | }
311 | wantEq := tc.want == 0
312 | if gotEq := domain.Name.Equal(da, db); gotEq != wantEq {
313 | t.Errorf("Name.Equal(%q, %q) = %v, want %v", da, db, gotEq, wantEq)
314 | }
315 |
316 | // Same again, but backwards.
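// Compare should be antisymmetric: swapping the arguments is
// expected to negate the result.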
317 | gotCmp = domain.Name.Compare(db, da)
318 | if want := -tc.want; gotCmp != want {
319 | t.Errorf("Name.Compare(%q, %q) = %d, want %d", db, da, gotCmp, want)
320 | }
321 | if gotEq := domain.Name.Equal(db, da); gotEq != wantEq {
322 | t.Errorf("Name.Equal(%q, %q) = %v, want %v", db, da, gotEq, wantEq)
323 | }
324 | }
325 | }
326 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: --------------------------------------------------------------------------------
1 | # Public Suffix List (PSL) Submission
2 |
3 |
6 |
7 |
27 |
28 | ### Checklist of required steps
29 |
30 | * [ ] Description of Organization
31 | * [ ] Robust Reason for PSL Inclusion
32 | * [ ] DNS verification via dig
33 |
34 | * [ ] Each domain listed in the PRIVATE section has and shall maintain at least two years remaining on registration, and we shall keep the `_psl` TXT record in place in the respective zone(s).
35 |
36 | __Submitter affirms the following:__
37 |
48 |
49 | * [ ] We are listing *any* third-party limits that we seek to work around in our rationale such as those between iOS 14.5+ and Facebook (see [Issue #1245](https://github.com/publicsuffix/list/issues/1245) as a well-documented example)
50 | - [Cloudflare](https://developers.cloudflare.com/learning-paths/get-started/add-domain-to-cf/add-site/)
51 | - [Let's Encrypt](https://letsencrypt.org/docs/rate-limits/)
52 | - MAKE SURE TO UPDATE THE FOLLOWING LIST WITH YOUR LIMITATIONS! REMOVE ENTRIES WHICH DO NOT APPLY, AS WELL AS THIS LINE!
53 |
54 |
69 |
70 | * [ ] This request was _not_ submitted with the objective of working around other third-party limits.
71 |
72 |
77 |
78 | * [ ] The submitter acknowledges that it is their responsibility to maintain the domains within their section. This includes removing names which are no longer used, retaining the _psl DNS entry, and responding to e-mails to the supplied address. Failure to maintain entries may result in removal of individual entries or the entire section.
79 |
80 |
88 |
89 | * [ ] The [Guidelines](https://github.com/publicsuffix/list/wiki/Guidelines) were carefully _read_ and _understood_, and this request conforms to them.
90 | * [ ] The submission follows the [guidelines](https://github.com/publicsuffix/list/wiki/Format) on formatting and sorting.
91 |
92 |
98 |
99 |
103 |
104 | * [ ] A role-based email address has been used and this inbox is actively monitored with a response time of no more than 30 days.
105 |
106 | **Abuse Contact:**
107 |
108 |
113 |
114 | * [ ] Abuse contact information (email or web form) is available and easily accessible.
115 |
116 | URL where abuse contact or abuse reporting form can be found:
117 |
118 |
119 | ---
120 |
121 | For PRIVATE section requests that are submitting entries for domains that match their organization website's primary domain, please understand that this can have impacts that may not match the desired outcome and take a long time to roll back, if at all.
122 |
123 | To ensure that requested changes are entirely intentional, make sure that you read the affectation and propagation expectations, that you understand them, and confirm this understanding.
124 |
125 | PR Rollbacks have lower priority, and the volunteers are unable to control when or if browsers or other parties using the PSL will refresh or update.
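For the "DNS verification via dig" checklist item and the DNS Verification section below, a hedged illustration (hypothetical domain and PR number; the `_psl` TXT record should point at the submission PR):

    dig +short TXT _psl.example.com
    "https://github.com/publicsuffix/list/pull/1234"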
126 |
127 |
141 |
142 | (Link: [about propagation/expectations](https://github.com/publicsuffix/list/wiki/Guidelines#appropriate-expectations-on-derivative-propagation-use-or-inclusion))
143 |
144 | * [ ] *Yes, I understand*. I could break my organization's website cookies and cause other issues, and the rollback timing is acceptable. *Proceed anyways*.
145 | ---
146 |
147 |
153 |
154 | ## Description of Organization
155 |
173 |
174 | **Organization Website:**
175 |
176 |
177 | ## Reason for PSL Inclusion
178 |
197 |
198 | **Number of users this request is being made to serve:**
199 |
200 |
201 | ## DNS Verification
202 |
225 | -------------------------------------------------------------------------------- /tools/internal/parser/diff.go: --------------------------------------------------------------------------------
1 | package parser
2 |
3 | import (
4 | "fmt"
5 | )
6 |
7 | // SetBaseVersion sets the list's base of comparison to old, and
8 | // updates the changed/unchanged annotations on all Blocks to match.
9 | //
10 | // If wholeSuffixBlocks is true, any changed Suffix or Wildcard within
11 | // a Suffixes block marks all suffixes and wildcards in that block as
12 | // changed.
13 | //
14 | // Precise marking (wholeSuffixBlocks=false) is intended for
15 | // maintainer and machine edits, where change-aware validators should
16 | // examine only the specific changed items.
17 | //
18 | // Expansive marking (wholeSuffixBlocks=true) is intended for external
19 | // PRs from suffix block owners, to opportunistically point out more
20 | // issues that they have the knowledge and authority to fix.
21 | func (l *List) SetBaseVersion(old *List, wholeSuffixBlocks bool) {
22 | diff := differ{
23 | oldCnt: map[string]int{},
24 | inCurrent: map[string][][]Block{},
25 | keys: map[Block]string{},
26 |
27 | wholeSuffixBlocks: wholeSuffixBlocks,
28 | }
29 |
30 | // Tree diff is an open area of research, and it's possible to use
31 | // extremely fancy (and slow) algorithms. Thankfully, the PSL has
32 | // some additional domain-specific properties that let us take
33 | // shortcuts and implement something O(n).
34 | //
35 | // First, academic tree_diff(OLD,NEW) produces an "edit script" as
36 | // the output, which describes how to add, delete, move and mutate
37 | // tree nodes to transform the OLD tree into the NEW tree. For the
38 | // PSL, we don't care about the exact structural changes, we just
39 | // need to know if we can skip validation checks. So we have to
40 | // answer a simple question: is a given block in NEW also present
41 | // in OLD?
42 | //
43 | // Second, all nodes in a well-formed list have a stable unique
44 | // identity. We can use this to answer the previous question in
45 | // constant time, instead of having to do complex tree analysis to
46 | // locate equivalent nodes.
47 | //
48 | // Node identities may be duplicated in an ill-formed List, for
49 | // example a suffix block that lists the same suffix twice. We
50 | // deal with this using brute force, and mark all duplicate
51 | // identities as changed. This means that a malformed PSL file
52 | // might report more changes than the strict minimum, but in
53 | // practice it's not much more, and in exchange we don't have to
54 | // do anything complex to decide what to revalidate.
55 | //
56 | // Third, how do we propagate child changes to parents? This is
57 | // where academic algorithms quickly go into O(n^3)
58 | // territory.
Once again, we avoid this with brute force: a
59 | // changed tree node marks all its parents as changed as
60 | // well. That means that if you fix a typo in one Suffix, we say
61 | // that the Suffix changed, but also its parent Suffixes, Section,
62 | // and List nodes.
63 | //
64 | // We could theoretically dirty fewer nodes in some cases, but
65 | // that introduces a risk of false negatives (we forget to re-run
66 | // a necessary validation), and it makes the diff harder to reason
67 | // about when writing validators. In practice, this slightly
68 | // pessimistic dirtying is cheap for the currently-planned
69 | // validators, so we stick with the behavior that is easy to
70 | // reason about and simple to implement.
71 | //
72 | // Finally, we need to do something about deleted nodes. We can
73 | // handle that with a single additional pass through the OLD list,
74 | // thanks to the node identity property. Again for simplicity, we
75 | // treat deletions similar to edits: all the parents of a deleted
76 | // node are marked dirty. Again we could be more precise here, but
77 | // in practice it's currently cheap to be pessimistic, and makes
78 | // the code and mental model simpler.
79 | //
80 | // There are various optimizations possible for this code. The
81 | // biggest would be doing something more efficient to track block
82 | // identities, which are currently expressed as big strings
83 | // because that makes them convenient to compare and use as map
84 | // keys. However, this algorithm as currently implemented takes
85 | // <100ms to diff a full PSL file, so for now we err on the side
86 | // of simplicity.
87 |
88 | // Compile the identities of all the blocks in old.
89 | diff.scanOld(old, "")
90 | // Mark unchanged blocks. Thanks to the previous step, each tree
91 | // node can be checked in O(1) time.
92 | diff.scanCurrent(l, "", nil)
93 | // Dirty the parents of deleted blocks.
94 | diff.markDeletions(old, "")
95 | }
96 |
97 | type differ struct {
98 | // wholeSuffixBlocks is whether Suffix/Wildcard changes propagate
99 | // to all children of the parent Suffixes block.
100 | wholeSuffixBlocks bool
101 |
102 | // oldCnt counts the number of blocks in the old list with a given
103 | // identity key.
104 | oldCnt map[string]int
105 |
106 | // inCurrent maps block identity keys to the tree paths of the
107 | // current list with that identity. Given a block with identity K,
108 | // inCurrent[K] is a list of paths. In each path, path[0] is a
109 | // block with identity K, and path[1..n] are its parents going
110 | // back to the root of the tree.
111 | //
112 | // In a well-formed List, each cache entry has a single path, but
113 | // we track duplicates in order to function correctly on malformed
114 | // lists as well.
115 | inCurrent map[string][][]Block
116 |
117 | // keys caches identity keys by block pointer. There are several
118 | // passes of traversal through trees, and when old and current are
119 | // nearly identical (the common case) this can save significant
120 | // CPU time.
121 | keys map[Block]string
122 | }
123 |
124 | // scanOld records b and its children in d.oldCnt.
125 | func (d *differ) scanOld(b Block, parentKey string) {
126 | k := d.getKey(b, parentKey)
127 | d.oldCnt[k]++
128 | for _, child := range b.Children() {
129 | d.scanOld(child, k)
130 | }
131 | }
132 |
133 | // scanCurrent adds curBlock and all its children to d.inCurrent, and
134 | // updates their isUnchanged annotation based on the information in d.oldCnt.
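// Each block is visited exactly once by this scan, which is what
// keeps the overall diff roughly O(n) in the number of blocks.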
135 | func (d *differ) scanCurrent(curBlock Block, parentKey string, parents []Block) { 136 | k := d.getKey(curBlock, parentKey) 137 | 138 | path := make([]Block, 0, len(parents)+1) 139 | path = append(path, curBlock) 140 | path = append(path, parents...) 141 | 142 | // Assume we're unchanged to start with. The job of the remaining 143 | // diff code is to falsify this claim and mark the node as changed 144 | // if needed. 145 | // 146 | // Setting this early and unconditionally lets us optimize the 147 | // logic in markChanged, by ensuring that each node transitions 148 | // false->true only once, before any possible true->false 149 | // transitions that affect it. 150 | curBlock.info().isUnchanged = true 151 | 152 | // Record the path to the current block, and if it's a 153 | // doppelganger of some other Block, mark changed. Tracking diffs 154 | // of duplicates requires solving some hard theoretical problems 155 | // of tree diff, so we don't bother. 156 | // 157 | // Duplicate identities only happen on a malformed PSL, and we 158 | // can save a lot of pain by just slightly over-rechecking such 159 | // PSLs. 160 | d.inCurrent[k] = append(d.inCurrent[k], path) 161 | if l := len(d.inCurrent[k]); l == 2 { 162 | // This is the first duplicate, the previous path didn't know 163 | // it wasn't unique. Mark both the current and earlier path as 164 | // changed. 165 | d.markChanged(d.inCurrent[k]...) 166 | } else if l > 2 { 167 | // Previous paths already marked, only curBlock's one needs 168 | // updating. 169 | d.markChanged(path) 170 | } 171 | 172 | // This covers both the case where a block is new (oldCnt of 0), 173 | // and the case where this block isn't a dupe in current, but was 174 | // a dupe in old. In that case, like above we avoid algorithmic 175 | // headaches by just dirtying the block instead of trying to 176 | // resolve which version of the old dupes we're looking at. 177 | if d.oldCnt[k] != 1 { 178 | d.markChanged(path) 179 | } 180 | 181 | // Scan through child subtrees. These subtrees may call 182 | // markChanged and set Unchanged=false on us. 183 | for _, child := range curBlock.Children() { 184 | d.scanCurrent(child, k, path) 185 | } 186 | 187 | // If the caller requested, and we're changed anyway, see if we 188 | // should propagate the change back downwards again. 189 | if !curBlock.info().isUnchanged { 190 | d.maybeMarkWholeSuffixBlock(path) 191 | } 192 | } 193 | 194 | // markDeletions marks parents of deleted nodes as changed in current. 195 | // 196 | // For example, if the diff contains a suffix deletion, this will mark 197 | // the enclosing Suffixes block as changed. 198 | func (d *differ) markDeletions(oldBlock Block, parentKey string) bool { 199 | k := d.getKey(oldBlock, parentKey) 200 | 201 | pathsInCurrent, ok := d.inCurrent[k] 202 | if !ok { 203 | // oldBlock was deleted, report to caller. 204 | return true 205 | } 206 | 207 | childDeleted := false 208 | for _, child := range oldBlock.Children() { 209 | if d.markDeletions(child, k) { 210 | // Note, we can't short-circuit here because there may be 211 | // other paths under this block that also need to be 212 | // updated. We're not only trying to update oldBlock, but 213 | // also all of its children. 214 | childDeleted = true 215 | } 216 | } 217 | 218 | // Children were deleted, so mark ourselves changed. markChanged 219 | // also dirties our parents, so there is no need to report the 220 | // change to the caller; it would only do redundant no-op work. 221 | if childDeleted { 222 | d.markChanged(pathsInCurrent...
223 | } 224 | 225 | return false 226 | } 227 | 228 | // maybeMarkWholeSuffixBlock calls markSuffixAndWildcardChanged on all 229 | // Suffixes in path, if the caller of SetBaseVersion requested 230 | // expansive marking. 231 | func (d *differ) maybeMarkWholeSuffixBlock(path []Block) { 232 | if !d.wholeSuffixBlocks { 233 | return 234 | } 235 | 236 | switch path[0].(type) { 237 | case *Suffixes, *Suffix, *Wildcard: 238 | for i, parent := range path { 239 | if _, ok := parent.(*Suffixes); ok { 240 | d.markSuffixAndWildcardChanged(parent, path[i+1:]) 241 | } 242 | } 243 | } 244 | } 245 | 246 | // markSuffixAndWildcardChanged marks as changed all Suffix and 247 | // Wildcard blocks in the tree rooted at curBlock. 248 | func (d *differ) markSuffixAndWildcardChanged(curBlock Block, parents []Block) { 249 | path := append([]Block{curBlock}, parents...) 250 | 251 | switch curBlock.(type) { 252 | case *Suffix, *Wildcard: 253 | d.markChanged(path) 254 | default: 255 | for _, child := range curBlock.Children() { 256 | d.markSuffixAndWildcardChanged(child, path) 257 | } 258 | } 259 | } 260 | 261 | // markChanged marks as changed all the blocks in paths. 262 | func (d *differ) markChanged(paths ...[]Block) { 263 | pathLoop: 264 | for _, path := range paths { 265 | for _, b := range path { 266 | if !b.info().isUnchanged { 267 | // We never mark a node as changed in isolation, we 268 | // always propagate the change to all its 269 | // parents. Therefore, we can stop the upwards 270 | // traversal in this path as soon as we find any node 271 | // that's already in the correct state. 272 | continue pathLoop 273 | } 274 | b.info().isUnchanged = false 275 | } 276 | } 277 | } 278 | 279 | // getKey returns the identity key for blk, which must be a direct 280 | // child of the block whose identity key is parentKey. getKey keeps a 281 | // cache of all keys built in the lifetime of this differ, to make 282 | // future calls more efficient. 282 | func (d *differ) getKey(blk Block, parentKey string) string { 283 | ret, ok := d.keys[blk] 284 | if !ok { 285 | ret = d.makeKey(blk, parentKey) 286 | d.keys[blk] = ret 287 | } 288 | return ret 289 | } 290 | 291 | // makeKey builds the identity key of b, which must be a child node 292 | // of the block whose identity key is parentKey. 293 | func (d *differ) makeKey(b Block, parentKey string) string { 294 | switch v := b.(type) { 295 | case *List: 296 | return fmt.Sprintf("%s;List", parentKey) 297 | case *Section: 298 | return fmt.Sprintf("%s;Section,%q", parentKey, v.Name) 299 | case *Suffixes: 300 | // Note parsed suffix metadata isn't included in the identity, 301 | // to avoid marking all suffixes in a block changed when 302 | // someone adjusts their URL or email. Such edits will still 303 | // indirectly dirty the block, because the metadata comment 304 | // includes the entire comment text in its identity, and will 305 | // dirty the parent Suffixes.
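		// As an illustrative sketch (not the output of a real run),
		// the key chain for a suffix block named "DuckCorp Inc" in
		// the private section would look like:
		//
		//	;List;Section,"PRIVATE DOMAINS";Suffixes,"DuckCorp Inc"
		//
		// Each key embeds its parent's key, so two keys compare equal
		// only when the blocks' entire ancestries match.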
306 | ret := fmt.Sprintf("%s;Suffixes,%q", parentKey, v.Info.Name) 307 | return ret 308 | case *Suffix: 309 | return fmt.Sprintf("%s;Suffix,%q", parentKey, v.Domain) 310 | case *Wildcard: 311 | return fmt.Sprintf("%s;Wildcard,%q,%#v", parentKey, v.Domain, v.Exceptions) 312 | case *Comment: 313 | return fmt.Sprintf("%s;Comment,%#v", parentKey, v.Text) 314 | default: 315 | panic("unknown ast node") 316 | } 317 | } 318 | -------------------------------------------------------------------------------- /tools/internal/parser/parser_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "net/mail" 5 | "net/url" 6 | "os" 7 | "testing" 8 | 9 | "github.com/publicsuffix/list/tools/internal/domain" 10 | ) 11 | 12 | // TestParser runs a battery of synthetic parse and validation tests. 13 | func TestParser(t *testing.T) { 14 | // These test cases have a fair amount of repetition in them, 15 | // since both errors and suffix blocks contain repeated nestings 16 | // of blocks and Source objects. While it's tempting to try and 17 | // reduce duplication through clever code, you are encouraged to 18 | // resist the urge. 19 | // 20 | // Each test case is quite verbose, but being laid out with 21 | // minimal indirection makes it easier to inspect and debug when a 22 | // failure happens. 23 | 24 | tests := []struct { 25 | name string 26 | psl []byte 27 | downgradeToWarning func(error) bool 28 | want *List 29 | wantErrs []error 30 | }{ 31 | { 32 | name: "empty", 33 | psl: byteLines(""), 34 | want: list(), 35 | }, 36 | 37 | { 38 | name: "just_comments", 39 | psl: byteLines( 40 | "// This is an empty PSL file.", 41 | "", 42 | "// Here is a second comment.", 43 | ), 44 | want: list( 45 | comment(0, "This is an empty PSL file."), 46 | comment(2, "Here is a second comment."), 47 | ), 48 | }, 49 | 50 | { 51 | name: "just_suffixes_in_block", 52 | psl: byteLines( 53 | "// ===BEGIN PRIVATE DOMAINS===", 54 | "", 55 | "example.com", 56 | "other.example.com", 57 | "*.example.org", 58 | "", 59 | "// ===END PRIVATE DOMAINS===", 60 | ), 61 | want: list( 62 | section(0, 7, "PRIVATE DOMAINS", 63 | suffixes(2, 5, noInfo, 64 | suffix(2, "example.com"), 65 | suffix(3, "other.example.com"), 66 | wildcard(4, 5, "example.org"), 67 | ), 68 | ), 69 | ), 70 | }, 71 | 72 | { 73 | name: "empty_sections", 74 | psl: byteLines( 75 | "// ===BEGIN IMAGINARY DOMAINS===", 76 | "// ===END IMAGINARY DOMAINS===", 77 | "// ===BEGIN FAKE DOMAINS===", 78 | "// ===END FAKE DOMAINS===", 79 | ), 80 | want: list( 81 | section(0, 2, "IMAGINARY DOMAINS"), 82 | section(2, 4, "FAKE DOMAINS"), 83 | ), 84 | }, 85 | 86 | { 87 | name: "missing_section_end", 88 | psl: byteLines( 89 | "// ===BEGIN ICANN DOMAINS===", 90 | ), 91 | want: list( 92 | section(0, 1, "ICANN DOMAINS"), 93 | ), 94 | wantErrs: []error{ 95 | ErrUnclosedSection{section(0, 1, "ICANN DOMAINS")}, 96 | }, 97 | }, 98 | 99 | { 100 | name: "nested_sections", 101 | psl: byteLines( 102 | "// ===BEGIN ICANN DOMAINS===", 103 | "// ===BEGIN SECRET DOMAINS===", 104 | "// ===END SECRET DOMAINS===", 105 | "// ===END ICANN DOMAINS===", 106 | ), 107 | want: list( 108 | section(0, 4, "ICANN DOMAINS"), 109 | ), 110 | 111 | wantErrs: []error{ 112 | ErrNestedSection{ 113 | SourceRange: mkSrc(1, 3), 114 | Name: "SECRET DOMAINS", 115 | Section: section(0, 4, "ICANN DOMAINS"), 116 | }, 117 | }, 118 | }, 119 | 120 | { 121 | name: "unknown_section_header", 122 | psl: byteLines( 123 | "// ===TRANSFORM DOMAINS===", 124 | ), 125 | want: list(), 
126 | wantErrs: []error{ 127 | ErrUnknownSectionMarker{mkSrc(0, 1)}, 128 | }, 129 | }, 130 | 131 | { 132 | name: "suffixes_with_section_marker_in_header", 133 | psl: byteLines( 134 | "// Just some suffixes", 135 | "// ===BEGIN ICANN DOMAINS===", 136 | "com", 137 | "org", 138 | "", 139 | "// ===END ICANN DOMAINS===", 140 | ), 141 | want: list( 142 | comment(0, "Just some suffixes"), 143 | section(1, 6, "ICANN DOMAINS", 144 | suffixes(2, 4, noInfo, 145 | suffix(2, "com"), 146 | suffix(3, "org"), 147 | ), 148 | ), 149 | ), 150 | }, 151 | 152 | { 153 | name: "suffixes_with_section_markers_inline", 154 | psl: byteLines( 155 | "// ===BEGIN ICANN DOMAINS===", 156 | "// Just some suffixes", 157 | "com", 158 | "// ===BEGIN OTHER DOMAINS===", 159 | "org", 160 | "// ===END OTHER DOMAINS===", 161 | "net", 162 | "", 163 | "// ===END ICANN DOMAINS===", 164 | ), 165 | want: list( 166 | section(0, 9, "ICANN DOMAINS", 167 | suffixes(1, 7, 168 | info("Just some suffixes", nil, nil, nil, true), 169 | comment(1, "Just some suffixes"), 170 | suffix(2, "com"), 171 | suffix(4, "org"), 172 | suffix(6, "net"), 173 | ), 174 | ), 175 | ), 176 | wantErrs: []error{ 177 | ErrSectionInSuffixBlock{mkSrc(3, 4)}, 178 | ErrSectionInSuffixBlock{mkSrc(5, 6)}, 179 | }, 180 | }, 181 | 182 | { 183 | name: "suffixes_with_unstructured_header", 184 | psl: byteLines( 185 | "// Unstructured header.", 186 | "// I'm just going on about random things.", 187 | "example.com", 188 | "example.org", 189 | ), 190 | want: list( 191 | suffixes(0, 4, 192 | info( 193 | "Unstructured header.", 194 | nil, 195 | nil, 196 | []string{"I'm just going on about random things."}, 197 | true, 198 | ), 199 | comment(0, "Unstructured header.", "I'm just going on about random things."), 200 | suffix(2, "example.com"), 201 | suffix(3, "example.org"), 202 | ), 203 | ), 204 | }, 205 | 206 | { 207 | name: "suffixes_with_canonical_private_header", 208 | psl: byteLines( 209 | "// DuckCorp Inc: https://example.com", 210 | "// Submitted by Not A Duck <duck@example.com>", 211 | "// Seriously, not a duck", 212 | "example.com", 213 | "example.org", 214 | ), 215 | want: list( 216 | suffixes(0, 5, 217 | info( 218 | "DuckCorp Inc", 219 | urls("https://example.com"), 220 | emails("Not A Duck", "duck@example.com"), 221 | []string{"Seriously, not a duck"}, 222 | true), 223 | comment(0, "DuckCorp Inc: https://example.com", "Submitted by Not A Duck <duck@example.com>", 224 | "Seriously, not a duck"), 225 | suffix(3, "example.com"), 226 | suffix(4, "example.org"), 227 | ), 228 | ), 229 | }, 230 | 231 | { 232 | name: "suffixes_with_entity_and_submitter", 233 | psl: byteLines( 234 | "// DuckCorp Inc: submitted by Not A Duck <duck@example.com>", 235 | "example.com", 236 | ), 237 | want: list( 238 | suffixes(0, 2, 239 | info( 240 | "DuckCorp Inc", 241 | nil, 242 | emails("Not A Duck", "duck@example.com"), 243 | nil, 244 | true), 245 | comment(0, "DuckCorp Inc: submitted by Not A Duck <duck@example.com>"), 246 | suffix(1, "example.com"), 247 | ), 248 | ), 249 | }, 250 | 251 | { 252 | name: "suffixes_with_all_separate_lines", 253 | psl: byteLines( 254 | "// DuckCorp Inc", 255 | "// https://example.com", 256 | "// Submitted by Not A Duck <duck@example.com>", 257 | "example.com", 258 | ), 259 | want: list( 260 | suffixes(0, 4, 261 | info( 262 | "DuckCorp Inc", 263 | urls("https://example.com"), 264 | emails("Not A Duck", "duck@example.com"), 265 | nil, 266 | true), 267 | comment(0, "DuckCorp Inc", "https://example.com", `Submitted by Not A Duck <duck@example.com>`), 268 | suffix(3, "example.com"), 269 | ), 270 | ), 271 | }, 272 | 273 | { 274 | // Regression test for a few blocks that start
with "name 275 | // (url)" instead of the more common "name: url". 276 | name: "url_in_parens", 277 | psl: byteLines( 278 | "// Parens Appreciation Society (https://example.org)", 279 | "example.com", 280 | ), 281 | want: list( 282 | suffixes(0, 2, 283 | info( 284 | "Parens Appreciation Society", 285 | urls("https://example.org"), 286 | nil, 287 | nil, 288 | true), 289 | comment(0, "Parens Appreciation Society (https://example.org)"), 290 | suffix(1, "example.com"), 291 | ), 292 | ), 293 | }, 294 | 295 | { 296 | // Regression test for a sneaky bug during development: 297 | // when an entity name is found when parsing Suffixes 298 | // headers, don't keep trying to find it in subsequent 299 | // lines, or you might overwrite the correct answer with 300 | // someething else that happens to have the right shape. 301 | name: "accept_first_valid_entity", 302 | psl: byteLines( 303 | "// cd : https://en.wikipedia.org/wiki/.cd", 304 | "// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1", 305 | "cd", 306 | ), 307 | want: list( 308 | suffixes(0, 3, 309 | info( 310 | "cd", 311 | urls("https://en.wikipedia.org/wiki/.cd"), 312 | nil, 313 | []string{"see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1"}, 314 | true), 315 | comment(0, "cd : https://en.wikipedia.org/wiki/.cd", 316 | "see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1"), 317 | suffix(2, "cd"), 318 | ), 319 | ), 320 | }, 321 | } 322 | 323 | for _, test := range tests { 324 | t.Run(test.name, func(t *testing.T) { 325 | got, errs := Parse(test.psl) 326 | checkDiff(t, "parse result", got, test.want) 327 | checkDiff(t, "parse errors", errs, test.wantErrs) 328 | }) 329 | } 330 | } 331 | 332 | // mkSrc returns a SourceRange with the given start and end. 333 | func mkSrc(start, end int) SourceRange { 334 | return SourceRange{start, end} 335 | } 336 | 337 | // TestParseRealList checks that the real public suffix list can parse 338 | // without errors. 
339 | func TestParseRealList(t *testing.T) { 340 | bs, err := os.ReadFile("../../../public_suffix_list.dat") 341 | if err != nil { 342 | t.Fatal(err) 343 | } 344 | 345 | _, errs := Parse(bs) 346 | 347 | for _, err := range errs { 348 | t.Errorf("Parse error: %v", err) 349 | } 350 | } 351 | 352 | func list(blocks ...Block) *List { 353 | return &List{ 354 | Blocks: blocks, 355 | } 356 | } 357 | 358 | func comment(start int, lines ...string) *Comment { 359 | return &Comment{ 360 | blockInfo: blockInfo{ 361 | SourceRange: mkSrc(start, start+len(lines)), 362 | }, 363 | Text: lines, 364 | } 365 | } 366 | 367 | func section(start, end int, name string, blocks ...Block) *Section { 368 | return &Section{ 369 | blockInfo: blockInfo{ 370 | SourceRange: mkSrc(start, end), 371 | }, 372 | Name: name, 373 | Blocks: blocks, 374 | } 375 | } 376 | 377 | func suffixes(start, end int, info MaintainerInfo, blocks ...Block) *Suffixes { 378 | return &Suffixes{ 379 | blockInfo: blockInfo{ 380 | SourceRange: mkSrc(start, end), 381 | }, 382 | Info: info, 383 | Blocks: blocks, 384 | } 385 | } 386 | 387 | func info(name string, urls []*url.URL, emails []*mail.Address, other []string, editable bool) MaintainerInfo { 388 | return MaintainerInfo{ 389 | Name: name, 390 | URLs: urls, 391 | Maintainers: emails, 392 | Other: other, 393 | MachineEditable: editable, 394 | } 395 | } 396 | 397 | var noInfo = info("", nil, nil, nil, true) 398 | 399 | func suffix(line int, domainStr string) *Suffix { 400 | domain, err := domain.Parse(domainStr) 401 | if err != nil { 402 | panic(err) 403 | } 404 | return &Suffix{ 405 | blockInfo: blockInfo{ 406 | SourceRange: mkSrc(line, line+1), 407 | }, 408 | Domain: domain, 409 | } 410 | } 411 | 412 | func wildcard(start, end int, base string, exceptions ...string) *Wildcard { 413 | dom, err := domain.Parse(base) 414 | if err != nil { 415 | panic(err) 416 | } 417 | 418 | ret := &Wildcard{ 419 | blockInfo: blockInfo{ 420 | SourceRange: mkSrc(start, end), 421 | }, 422 | Domain: dom, 423 | } 424 | for _, s := range exceptions { 425 | exc, err := domain.ParseLabel(s) 426 | if err != nil { 427 | panic(err) 428 | } 429 | ret.Exceptions = append(ret.Exceptions, exc) 430 | } 431 | return ret 432 | } 433 | 434 | // zeroSourceRange destructively zeroes the SourceRange of the given 435 | // block and its children. We use a zero SourceRange to communicate 436 | // "this block did not exist in the original input", when adding 437 | // machine-generated blocks. 438 | func zeroSourceRange(b Block) Block { 439 | switch v := b.(type) { 440 | case *List: 441 | v.SourceRange = SourceRange{} 442 | case *Section: 443 | v.SourceRange = SourceRange{} 444 | case *Suffixes: 445 | v.SourceRange = SourceRange{} 446 | case *Suffix: 447 | v.SourceRange = SourceRange{} 448 | case *Wildcard: 449 | v.SourceRange = SourceRange{} 450 | case *Comment: 451 | v.SourceRange = SourceRange{} 452 | default: 453 | panic("unknown ast node") 454 | } 455 | for _, child := range b.Children() { 456 | zeroSourceRange(child) 457 | } 458 | return b 459 | } 460 | 461 | // markUnchanged makes .Changed() return false for b. It does not 462 | // touch parent or child blocks. 463 | // 464 | // It's generic so that it works in places that require a specific 465 | // instance type, not just places that accept a Block interface. 
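//
// For example (illustrative only), a test fixture can wrap a helper
// constructor in place and still get the concrete type back:
//
//	s := markUnchanged(suffix(3, "example.com")) // s is *Suffix, not Block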
466 | func markUnchanged[T Block](b T) T { 467 | b.info().isUnchanged = true 468 | return b 469 | } 470 | -------------------------------------------------------------------------------- /tools/internal/parser/file.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "cmp" 5 | "fmt" 6 | "net/mail" 7 | "net/url" 8 | "slices" 9 | 10 | "github.com/publicsuffix/list/tools/internal/domain" 11 | ) 12 | 13 | // A Block is a parsed chunk of a PSL file. Each block is one of the 14 | // concrete types Comment, Section, Suffixes, Suffix, or Wildcard. 15 | type Block interface { 16 | // SrcRange returns the block's SourceRange. 17 | SrcRange() SourceRange 18 | // Children returns the block's direct children, if any. 19 | Children() []Block 20 | // Changed reports whether the tree rooted at block has changed 21 | // since the base of comparison (see List.SetBaseVersion). 22 | Changed() bool 23 | 24 | info() *blockInfo 25 | } 26 | 27 | // BlocksOfType recursively collects and returns all blocks of 28 | // concrete type T in the given parse tree. 29 | // 30 | // For example, BlocksOfType[*parser.Comment](ast) returns all comment 31 | // nodes in ast. 32 | func BlocksOfType[T Block](tree Block) []T { 33 | var ret []T 34 | blocksOfTypeRec(tree, &ret) 35 | return ret 36 | } 37 | 38 | func blocksOfTypeRec[T Block](tree Block, out *[]T) { 39 | if v, ok := tree.(T); ok { 40 | *out = append(*out, v) 41 | } 42 | for _, child := range tree.Children() { 43 | blocksOfTypeRec(child, out) 44 | } 45 | } 46 | 47 | // blockInfo is common information shared by all Block types. 48 | type blockInfo struct { 49 | SourceRange 50 | 51 | // isUnchanged records that a Block (including any children) is 52 | // semantically unchanged from a past base point. The default base 53 | // of comparison is a null List, meaning that all blocks are 54 | // considered changed. A different base of comparison can be set 55 | // with List.SetBaseVersion. 56 | isUnchanged bool 57 | } 58 | 59 | func (b blockInfo) SrcRange() SourceRange { 60 | return b.SourceRange 61 | } 62 | 63 | func (b blockInfo) Changed() bool { 64 | return !b.isUnchanged 65 | } 66 | 67 | func (b *blockInfo) info() *blockInfo { 68 | return b 69 | } 70 | 71 | // List is a parsed public suffix list. 72 | type List struct { 73 | blockInfo 74 | 75 | // Blocks are the top-level elements of the list, in the order 76 | // they appear. 77 | Blocks []Block 78 | } 79 | 80 | func (l *List) Children() []Block { return l.Blocks } 81 | 82 | // PublicSuffix returns the public suffix of d. 83 | // 84 | // This follows the PSL algorithm to the letter. Notably: a rule 85 | // "*.foo.com" does not implicitly create a "foo.com" rule, and there 86 | // is a hardcoded implicit "*" rule so that unknown TLDs are all 87 | // public suffixes. 88 | func (l *List) PublicSuffix(d domain.Name) domain.Name { 89 | if d.NumLabels() == 0 { 90 | // Edge case: zero domain.Name value 91 | return d 92 | } 93 | 94 | // Look at wildcards first, because the PSL algorithm says that 95 | // exceptions to wildcards take priority over all other rules. So, 96 | // if we find a wildcard exception, we can halt early. 97 | var ( 98 | ret domain.Name 99 | matchLen int 100 | gotException bool 101 | ) 102 | for _, w := range BlocksOfType[*Wildcard](l) { 103 | suf, isException, ok := w.PublicSuffix(d) 104 | switch { 105 | case !ok: 106 | continue 107 | case isException && !gotException: 108 | // First matching exception encountered.
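			// (Illustrative example: with a rule "*.ck" and
			// exception "!www.ck", querying "www.ck" lands here and
			// the public suffix is just "ck", making "www.ck" itself
			// registrable.)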
109 | gotException = true 110 | matchLen = suf.NumLabels() 111 | ret = suf 112 | case isException: 113 | // Second or later exception match. According to the 114 | // format, only 0 or 1 exceptions can match; 115 | // multi-exception matches are undefined and unused. But 116 | // just to be safe, handle the N exception case by 117 | // accepting the longest matching exception. 118 | if nl := suf.NumLabels(); nl > matchLen { 119 | matchLen = nl 120 | ret = suf 121 | } 122 | case !gotException: 123 | // Non-exception match. 124 | if nl := suf.NumLabels(); nl > matchLen { 125 | matchLen = nl 126 | ret = suf 127 | } 128 | } 129 | } 130 | if gotException { 131 | return ret 132 | } 133 | 134 | // Otherwise, keep scanning through the regular suffixes. 135 | for _, s := range BlocksOfType[*Suffix](l) { 136 | if suf, ok := s.PublicSuffix(d); ok && suf.NumLabels() > matchLen { 137 | matchLen = suf.NumLabels() 138 | ret = suf 139 | } 140 | } 141 | 142 | if matchLen == 0 { 143 | // The PSL algorithm includes an implicit "*" to match every 144 | // TLD, in the absence of any matching explicit rule. 145 | labels := d.Labels() 146 | tld := labels[len(labels)-1].AsTLD() 147 | return tld 148 | } 149 | 150 | return ret 151 | } 152 | 153 | // RegisteredDomain returns the registered/registerable domain of 154 | // d. Returns (domain, true) when the input is a child of a public 155 | // suffix, and (zero, false) when the input is itself a public suffix. 156 | // 157 | // RegisteredDomain follows the PSL algorithm to the letter. Notably: 158 | // a rule "*.foo.com" does not implicitly create a "foo.com" rule, and 159 | // there is a hardcoded implicit "*" rule so that unknown TLDs are all 160 | // public suffixes. 161 | func (l *List) RegisteredDomain(d domain.Name) (domain.Name, bool) { 162 | suf := l.PublicSuffix(d) 163 | if suf.Equal(d) { 164 | return domain.Name{}, false 165 | } 166 | 167 | next, ok := d.CutSuffix(suf) 168 | if !ok { 169 | panic(fmt.Sprintf("public suffix %q is not a suffix of domain %q", suf, d)) 170 | } 171 | return suf.MustAddPrefix(next[len(next)-1]), true 172 | } 173 | 174 | // Comment is a comment block, consisting of one or more contiguous 175 | // lines of commented text. 176 | type Comment struct { 177 | blockInfo 178 | // Text is the unprocessed content of the comment lines, with the 179 | // leading comment syntax removed. 180 | Text []string 181 | } 182 | 183 | func (c *Comment) Children() []Block { return nil } 184 | 185 | // Section is a named part of a PSL file, containing suffixes which 186 | // behave similarly. 187 | type Section struct { 188 | blockInfo 189 | 190 | // Name is the section name. In a normal well-formed PSL file, the 191 | // names are "ICANN DOMAINS" and "PRIVATE DOMAINS". 192 | Name string 193 | // Blocks are the child blocks contained within the section. 194 | Blocks []Block 195 | } 196 | 197 | func (s *Section) Children() []Block { return s.Blocks } 198 | 199 | // Suffixes is a list of PSL domain suffixes with optional additional 200 | // metadata. 201 | // 202 | // Suffix sections consist of a header comment that contains a mix of 203 | // structured and unstructured information, followed by a list of 204 | // domain suffixes. The suffix list may contain additional 205 | // unstructured inline comments. 206 | type Suffixes struct { 207 | blockInfo 208 | 209 | // Info is information about the authoritative maintainers for 210 | // this set of suffixes. 211 | Info MaintainerInfo 212 | 213 | // Blocks are the child blocks contained within the suffix block.
214 | Blocks []Block 215 | } 216 | 217 | func (s *Suffixes) Children() []Block { return s.Blocks } 218 | 219 | type MaintainerInfo struct { 220 | // Name is the name of the entity responsible for maintaining a 221 | // set of suffixes. 222 | // 223 | // For ICANN suffixes, this is typically the TLD name, or the name 224 | // of the NIC that controls the TLD. 225 | // 226 | // For private domains this is the name of the legal entity 227 | // (usually a company, sometimes an individual) that owns all 228 | // domains in the block. 229 | // 230 | // In a well-formed PSL file, Name is non-empty for all suffix 231 | // blocks. 232 | Name string 233 | 234 | // URLs are links to further information about the suffix block's 235 | // domains and its maintainer. 236 | // 237 | // For ICANN domains this is typically the NIC's information page 238 | // for the TLD, or failing that a general information page such as 239 | // a Wikipedia entry. 240 | // 241 | // For private domains this is usually the website for the owner 242 | // of the domains. 243 | // 244 | // May be empty when the block header doesn't have 245 | // machine-readable URLs. 246 | URLs []*url.URL 247 | 248 | // Maintainers is the contact name and email address of the person 249 | // or persons responsible for maintaining a block. 250 | // 251 | // This field may be empty if there is no machine-readable contact 252 | // information. 253 | Maintainers []*mail.Address 254 | 255 | // Other holds unstructured additional notes. They may contain 256 | // anything, including some of the above information that wasn't 257 | // in a known parseable form. 258 | Other []string 259 | 260 | // MachineEditable is whether this information can be 261 | // machine-edited and written back out without loss of 262 | // information. The exact formatting of the information may 263 | // change, but no information will be lost. 264 | MachineEditable bool 265 | } 266 | 267 | func (m *MaintainerInfo) Compare(n *MaintainerInfo) int { 268 | if r := compareCommentText(m.Name, n.Name); r != 0 { 269 | return r 270 | } 271 | 272 | if r := cmp.Compare(len(m.URLs), len(n.URLs)); r != 0 { 273 | return r 274 | } 275 | for i := range m.URLs { 276 | if r := cmp.Compare(m.URLs[i].String(), n.URLs[i].String()); r != 0 { 277 | return r 278 | } 279 | } 280 | 281 | if r := cmp.Compare(len(m.Maintainers), len(n.Maintainers)); r != 0 { 282 | return r 283 | } 284 | for i := range m.Maintainers { 285 | if r := cmp.Compare(m.Maintainers[i].String(), n.Maintainers[i].String()); r != 0 { 286 | return r 287 | } 288 | } 289 | 290 | if r := slices.Compare(m.Other, n.Other); r != 0 { 291 | return r 292 | } 293 | 294 | if m.MachineEditable == n.MachineEditable { 295 | return 0 296 | } else if !m.MachineEditable { 297 | return -1 298 | } else { 299 | return 1 300 | } 301 | } 302 | 303 | // HasInfo reports whether m has any maintainer information at all. 304 | func (m MaintainerInfo) HasInfo() bool { 305 | return m.Name != "" || len(m.URLs) > 0 || len(m.Maintainers) > 0 || len(m.Other) > 0 306 | } 307 | 308 | // Suffix is one public suffix, represented in the standard domain 309 | // name format. 310 | type Suffix struct { 311 | blockInfo 312 | 313 | // Domain is the public suffix's domain name. 314 | Domain domain.Name 315 | } 316 | 317 | func (s *Suffix) Children() []Block { return nil } 318 | 319 | // PublicSuffix returns the public suffix of n according to this 320 | // Suffix rule taken in isolation. If n is not a child domain of s, 321 | // PublicSuffix returns (zeroValue, false).
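//
// A minimal usage sketch (the values are illustrative; domain.Parse
// is this repository's domain name parser):
//
//	n, _ := domain.Parse("foo.example.com")
//	suf, ok := s.PublicSuffix(n)
//	// With s holding the rule "example.com": suf is "example.com", ok is true.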
322 | func (s *Suffix) PublicSuffix(n domain.Name) (suffix domain.Name, ok bool) { 323 | if n.Equal(s.Domain) { 324 | return s.Domain, true 325 | } 326 | if _, ok := n.CutSuffix(s.Domain); ok { 327 | return s.Domain, true 328 | } 329 | return domain.Name{}, false 330 | } 331 | 332 | // RegisteredDomain returns the registered/registerable domain of n 333 | // according to this Suffix rule taken in isolation. The registered 334 | // domain is defined as n's public suffix plus one more child 335 | // label. If n is not a child domain of s, RegisteredDomain returns 336 | // (zeroValue, false). 337 | func (s *Suffix) RegisteredDomain(n domain.Name) (regDomain domain.Name, ok bool) { 338 | if prefix, ok := n.CutSuffix(s.Domain); ok { 339 | return s.Domain.MustAddPrefix(prefix[len(prefix)-1]), true 340 | } 341 | return domain.Name{}, false 342 | } 343 | 344 | // Wildcard is a wildcard public suffix, along with any exceptions to 345 | // that wildcard. 346 | type Wildcard struct { 347 | blockInfo 348 | 349 | // Domain is the base of the wildcard public suffix, without the 350 | // leading "*" label. 351 | Domain domain.Name 352 | // Exceptions are the domain.Labels that, when they appear in the 353 | // wildcard position of Domain, cause a FQDN to _not_ match this 354 | // wildcard. For example, if Domain="foo.com" and Exceptions=[bar, 355 | // qux], zot.foo.com is a public suffix, but bar.foo.com and 356 | // qux.foo.com are not. 357 | Exceptions []domain.Label 358 | } 359 | 360 | func (w *Wildcard) Children() []Block { return nil } 361 | 362 | // PublicSuffix returns the public suffix of n according to this 363 | // Wildcard rule taken in isolation. If n is not a child domain of w, 364 | // PublicSuffix returns (zeroValue, false, false). 365 | func (w *Wildcard) PublicSuffix(n domain.Name) (suffix domain.Name, isException, ok bool) { 366 | if prefix, ok := n.CutSuffix(w.Domain); ok { 367 | next := prefix[len(prefix)-1] 368 | if slices.Contains(w.Exceptions, next) { 369 | return w.Domain, true, true 370 | } 371 | 372 | return w.Domain.MustAddPrefix(next), false, true 373 | } 374 | return domain.Name{}, false, false 375 | } 376 | 377 | // RegisteredDomain returns the registered/registerable domain of n 378 | // according to this Wildcard rule taken in isolation. The registered 379 | // domain is defined as n's public suffix plus one more child 380 | // label. If n is not a child domain of w, RegisteredDomain returns 381 | // (zeroValue, false, false). 382 | func (w *Wildcard) RegisteredDomain(n domain.Name) (regDomain domain.Name, isException, ok bool) { 383 | if prefix, ok := n.CutSuffix(w.Domain); ok && len(prefix) >= 2 { 384 | next := prefix[len(prefix)-1] 385 | if slices.Contains(w.Exceptions, next) { 386 | return w.Domain.MustAddPrefix(next), true, true 387 | } 388 | 389 | return w.Domain.MustAddPrefix(prefix[len(prefix)-2:]...), false, true 390 | } 391 | return domain.Name{}, false, false 392 | } 393 | -------------------------------------------------------------------------------- /tools/internal/parser/parser.go: -------------------------------------------------------------------------------- 1 | // Package parser implements a validating parser for PSL files. 2 | package parser 3 | 4 | import ( 5 | "fmt" 6 | "strings" 7 | 8 | "github.com/publicsuffix/list/tools/internal/domain" 9 | ) 10 | 11 | // Parse parses bs as a PSL file and returns the parse result. 12 | // 13 | // The parser tries to keep going when it encounters errors.
Parse and 14 | // validation errors are accumulated and returned alongside the 15 | // parsed List. 16 | // 17 | // If the returned error list is non-empty, the parsed file 18 | // does not comply with the PSL format (documented at 19 | // https://github.com/publicsuffix/list/wiki/Format), or with PSL 20 | // submission guidelines 21 | // (https://github.com/publicsuffix/list/wiki/Guidelines). A List parsed 22 | // with errors should not be used to calculate public suffixes for FQDNs. 23 | func Parse(bs []byte) (*List, []error) { 24 | lines, errs := normalizeToUTF8Lines(bs) 25 | p := &parser{ 26 | input: lines, 27 | inputLine: 0, 28 | } 29 | for _, err := range errs { 30 | p.addError(err) 31 | } 32 | ret := p.parseTopLevel() 33 | return ret, p.errs 34 | } 35 | 36 | // parser is the state for a single PSL file parse. 37 | type parser struct { 38 | // input is the remaining unparsed and untokenized source text. 39 | input []string 40 | // inputLine is the offset for input[0]. That is, input[0] is line 41 | // number inputLine of the source text. 42 | inputLine int 43 | // peekBuf is a buffer holding at most one lookahead token. 44 | peekBuf any 45 | // errs are the accumulated parse errors so far. 46 | errs []error 47 | } 48 | 49 | // addError records err as a parse/validation error. 50 | // 51 | // (All errors are currently recorded uniformly; no legacy 52 | // exemptions are applied here.) 53 | func (p *parser) addError(err error) { 54 | p.errs = append(p.errs, err) 55 | } 56 | 57 | // The following types and functions are the lexer portion of the 58 | // parsing logic. This is a very simplistic lexer, since 59 | // normalizeToUTF8Lines has already done a lot of heavy lifting to 60 | // clean up the input. Each line of input is converted to a token for 61 | // that line's content. The parser then assembles that stream of 62 | // tokens into multiline blocks, and eventually into a parse tree. 63 | 64 | const ( 65 | sectionStartPrefix = "// ===BEGIN " 66 | sectionEndPrefix = "// ===END " 67 | sectionPrefix = "// ===" 68 | commentPrefix = "// " 69 | wildcardPrefix = "*." 70 | exceptionPrefix = "!" 71 | ) 72 | 73 | type line struct { 74 | SourceRange 75 | Text string 76 | } 77 | type tokenEOF struct{} 78 | type tokenBlank struct{ line } 79 | type tokenComment struct{ line } 80 | type tokenSectionUnknown struct{ line } 81 | type tokenSectionStart struct { 82 | line 83 | Name string 84 | } 85 | type tokenSectionEnd struct { 86 | line 87 | Name string 88 | } 89 | type tokenSuffix struct{ line } 90 | type tokenWildcard struct { 91 | line 92 | Suffix string 93 | } 94 | type tokenException struct { 95 | line 96 | Suffix string 97 | } 98 | 99 | // next lexes the next token of input and returns it. 100 | func (p *parser) next() (ret any) { 101 | if p.peekBuf != nil { 102 | ret := p.peekBuf 103 | p.peekBuf = nil 104 | return ret 105 | } 106 | 107 | if len(p.input) == 0 { 108 | return tokenEOF{} 109 | } 110 | 111 | // No matter what, next is going to emit the next line of p.input; 112 | // the rest of the function is just to determine what kind of 113 | // token to return.
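	// For instance (illustrative): "" lexes to tokenBlank, "// foo"
	// to tokenComment, "*.ck" to tokenWildcard, "!www.ck" to
	// tokenException, and a bare "com" to tokenSuffix.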
114 | src := line{ 115 | SourceRange: SourceRange{p.inputLine, p.inputLine + 1}, 116 | Text: p.input[0], 117 | } 118 | p.input = p.input[1:] 119 | p.inputLine++ 120 | 121 | switch { 122 | case src.Text == "": 123 | return tokenBlank{src} 124 | 125 | case strings.HasPrefix(src.Text, sectionStartPrefix): 126 | // To avoid repeated string processing in different portions 127 | // of the parser code, the lexer tears apart section markers 128 | // here to extract the section name. 129 | name := strings.TrimPrefix(src.Text, sectionStartPrefix) 130 | name, ok := strings.CutSuffix(name, "===") 131 | if !ok { 132 | return tokenSectionUnknown{src} 133 | } 134 | return tokenSectionStart{src, name} 135 | case strings.HasPrefix(src.Text, sectionEndPrefix): 136 | name := strings.TrimPrefix(src.Text, sectionEndPrefix) 137 | name, ok := strings.CutSuffix(name, "===") 138 | if !ok { 139 | return tokenSectionUnknown{src} 140 | } 141 | return tokenSectionEnd{src, name} 142 | case strings.HasPrefix(src.Text, sectionPrefix): 143 | return tokenSectionUnknown{src} 144 | 145 | case strings.HasPrefix(src.Text, commentPrefix): 146 | // Similarly, the following do some light processing of the 147 | // input so that this doesn't need to be repeated in several 148 | // portions of the parser. 149 | src.Text = strings.TrimPrefix(src.Text, commentPrefix) 150 | return tokenComment{src} 151 | case strings.HasPrefix(src.Text, wildcardPrefix): 152 | return tokenWildcard{src, strings.TrimPrefix(src.Text, wildcardPrefix)} 153 | case strings.HasPrefix(src.Text, exceptionPrefix): 154 | return tokenException{src, strings.TrimPrefix(src.Text, exceptionPrefix)} 155 | 156 | default: 157 | return tokenSuffix{src} 158 | } 159 | } 160 | 161 | // peek returns the next token of input, without consuming it. 162 | func (p *parser) peek() any { 163 | if p.peekBuf == nil { 164 | p.peekBuf = p.next() 165 | } 166 | return p.peekBuf 167 | } 168 | 169 | // The rest of this file is the parser itself. It follows the common 170 | // recursive descent structure. 171 | 172 | // blockEmitter returns a function that appends blocks to a given 173 | // output list, and also updates an output SourceRange to cover the 174 | // superset of all emitted blocks. 175 | // 176 | // This is a helper to make the functions that parse intermediate AST 177 | // nodes (which have to accumulate a list of children) more readable. 178 | func blockEmitter(out *[]Block, srcRange *SourceRange) func(...Block) { 179 | 180 | return func(bs ...Block) { 181 | for _, b := range bs { 182 | if b == nil { 183 | // Sub-parsers sometimes return nil to indicate the 184 | // thing they tried to parse was bad and they have 185 | // nothing to contribute to the output. 186 | continue 187 | } 188 | 189 | *out = append(*out, b) 190 | 191 | if srcRange == nil { 192 | continue 193 | } else if *srcRange == (SourceRange{}) { 194 | // Zero value, this is the first emitted block. 195 | *srcRange = b.SrcRange() 196 | } else { 197 | *srcRange = (*srcRange).merge(b.SrcRange()) 198 | } 199 | } 200 | } 201 | } 202 | 203 | // parseTopLevel parses the top level of a PSL file.
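//
// At this level the parser accepts, in any order: blank lines
// (skipped), comments (which may begin a suffix block), sections, and
// bare suffix entries. A stray section end or an unknown section
// marker is recorded as an error and skipped.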
204 | func (p *parser) parseTopLevel() *List { 205 | ret := &List{} 206 | emit := blockEmitter(&ret.Blocks, nil) 207 | 208 | for { 209 | switch tok := p.peek().(type) { 210 | case tokenEOF: 211 | return ret 212 | case tokenBlank: 213 | p.next() 214 | case tokenComment: 215 | emit(p.parseCommentOrSuffixBlock()) 216 | case tokenSectionStart: 217 | emit(p.parseSection()) 218 | case tokenSectionEnd: 219 | p.addError(ErrUnstartedSection{tok.SourceRange, tok.Name}) 220 | p.next() 221 | case tokenSectionUnknown: 222 | p.addError(ErrUnknownSectionMarker{tok.SourceRange}) 223 | p.next() 224 | case tokenSuffix, tokenWildcard, tokenException: 225 | emit(p.parseSuffixBlock(nil)) 226 | default: 227 | panic("unhandled token") 228 | } 229 | } 230 | } 231 | 232 | // parseSection parses the contents of a PSL file section. 233 | func (p *parser) parseSection() *Section { 234 | // Initialize with the start-of-section marker's data. 235 | start := p.next().(tokenSectionStart) 236 | ret := &Section{ 237 | blockInfo: blockInfo{ 238 | SourceRange: start.SourceRange, 239 | }, 240 | Name: start.Name, 241 | } 242 | emit := blockEmitter(&ret.Blocks, &ret.SourceRange) 243 | 244 | for { 245 | switch tok := p.peek().(type) { 246 | case tokenEOF: 247 | p.addError(ErrUnclosedSection{ret}) 248 | return ret 249 | case tokenBlank: 250 | p.next() 251 | case tokenComment: 252 | emit(p.parseCommentOrSuffixBlock()) 253 | case tokenSectionStart: 254 | // The PSL doesn't allow nested sections, so we pretend 255 | // like the inner section never existed and grab all its 256 | // blocks for ourselves. Still record an error for the 257 | // nested section though. 258 | inner := p.parseSection() 259 | emit(inner.Blocks...) 260 | p.addError(ErrNestedSection{inner.SourceRange, inner.Name, ret}) 261 | case tokenSectionEnd: 262 | p.next() 263 | if tok.Name != ret.Name { 264 | p.addError(ErrMismatchedSection{tok.SourceRange, tok.Name, ret}) 265 | } 266 | ret.SourceRange.LastLine = tok.SourceRange.LastLine 267 | return ret 268 | case tokenSectionUnknown: 269 | p.next() 270 | p.addError(ErrUnknownSectionMarker{tok.SourceRange}) 271 | case tokenSuffix, tokenWildcard, tokenException: 272 | emit(p.parseSuffixBlock(nil)) 273 | default: 274 | panic("unhandled token") 275 | } 276 | } 277 | } 278 | 279 | // parseCommentOrSuffixBlock parses a comment, then either returns it 280 | // as a lone comment or chains into suffix block parsing, depending on 281 | // what follows the comment. 282 | // 283 | // This is used to resolve an ambiguity in the PSL format when parsing 284 | // linearly: if we see a comment, that could be a standalone comment, 285 | // or it could be the beginning of a suffix block. In the latter case, 286 | // it's very important to attach the comment to the suffix block, 287 | // since it contains metadata about those suffixes. 288 | func (p *parser) parseCommentOrSuffixBlock() Block { 289 | comment := p.parseComment() 290 | switch p.peek().(type) { 291 | case tokenSuffix, tokenWildcard, tokenException: 292 | return p.parseSuffixBlock(comment) 293 | default: 294 | return comment 295 | } 296 | } 297 | 298 | // parseSuffixBlock parses a suffix block, starting with the provided 299 | // optional initial comment. 
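//
// A suffix block ends at the first blank line or at end of input;
// section markers inside the block are recorded as errors but do not
// terminate it.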
300 | func (p *parser) parseSuffixBlock(initialComment *Comment) *Suffixes { 301 | ret := &Suffixes{ 302 | Info: extractMaintainerInfo(initialComment), 303 | } 304 | emit := blockEmitter(&ret.Blocks, &ret.SourceRange) 305 | 306 | if initialComment != nil { 307 | emit(initialComment) 308 | } 309 | 310 | for { 311 | switch tok := p.peek().(type) { 312 | case tokenBlank: 313 | return ret 314 | case tokenComment: 315 | emit(p.parseComment()) 316 | case tokenSectionUnknown: 317 | p.next() 318 | p.addError(ErrUnknownSectionMarker{tok.SourceRange}) 319 | case tokenSectionStart: 320 | p.next() 321 | p.addError(ErrSectionInSuffixBlock{tok.SourceRange}) 322 | case tokenSectionEnd: 323 | p.next() 324 | p.addError(ErrSectionInSuffixBlock{tok.SourceRange}) 325 | case tokenSuffix: 326 | emit(p.parseSuffix()) 327 | case tokenWildcard: 328 | emit(p.parseWildcard()) 329 | case tokenException: 330 | // Note we don't emit here; parseException receives the 331 | // list of existing blocks and attaches the exception to 332 | // the corresponding wildcard entry. 333 | p.parseException(ret.Blocks) 334 | case tokenEOF: 335 | return ret 336 | default: 337 | panic("unhandled token") 338 | } 339 | } 340 | } 341 | 342 | // parseSuffix parses a basic public suffix entry (i.e. not a wildcard 343 | // or an exception). 344 | func (p *parser) parseSuffix() Block { 345 | tok := p.next().(tokenSuffix) 346 | 347 | domain, err := domain.Parse(tok.Text) 348 | if err != nil { 349 | p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Text, err}) 350 | return nil 351 | } 352 | 353 | return &Suffix{ 354 | blockInfo: blockInfo{ 355 | SourceRange: tok.SourceRange, 356 | }, 357 | Domain: domain, 358 | } 359 | } 360 | 361 | // parseWildcard parses a public suffix wildcard entry, of the form 362 | // "*.example.com". 363 | func (p *parser) parseWildcard() Block { 364 | tok := p.next().(tokenWildcard) 365 | 366 | domain, err := domain.Parse(tok.Suffix) 367 | if err != nil { 368 | p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, err}) 369 | return nil 370 | } 371 | 372 | return &Wildcard{ 373 | blockInfo: blockInfo{ 374 | SourceRange: tok.SourceRange, 375 | }, 376 | Domain: domain, 377 | } 378 | } 379 | 380 | // parseException parses a public suffix wildcard exception, of the 381 | // form "!foo.example.com". The parsed exception is attached to the 382 | // related Wildcard block in previous. If no such block exists, the 383 | // exception is dropped and a parse error recorded. 384 | func (p *parser) parseException(previous []Block) { 385 | tok := p.next().(tokenException) 386 | 387 | domain, err := domain.Parse(tok.Suffix) 388 | if err != nil { 389 | p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, err}) 390 | return 391 | } 392 | 393 | for _, block := range previous { 394 | w, ok := block.(*Wildcard) 395 | if !ok { 396 | continue 397 | } 398 | 399 | if rest, ok := domain.CutSuffix(w.Domain); ok && len(rest) == 1 { 400 | w.Exceptions = append(w.Exceptions, domain.Labels()[0]) 401 | return 402 | } 403 | } 404 | p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, fmt.Errorf("exception %q does not match any wildcard", tok.Suffix)}) 405 | } 406 | 407 | // parseComment parses a multiline comment block.
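//
// Consecutive comment lines are merged into a single Comment whose
// SourceRange spans all of them; the first non-comment token is left
// unconsumed for the caller.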
408 | func (p *parser) parseComment() *Comment { 409 | tok := p.next().(tokenComment) 410 | ret := &Comment{ 411 | blockInfo: blockInfo{ 412 | SourceRange: tok.SourceRange, 413 | }, 414 | Text: []string{tok.Text}, 415 | } 416 | for { 417 | if tok, ok := p.peek().(tokenComment); ok { 418 | p.next() 419 | ret.SourceRange = ret.SourceRange.merge(tok.SourceRange) 420 | ret.Text = append(ret.Text, tok.Text) 421 | } else { 422 | return ret 423 | } 424 | } 425 | } 426 | --------------------------------------------------------------------------------