├── linkcheck ├── test-fixtures │ └── sample-site │ │ ├── id-bad │ │ ├── b.html │ │ └── index.html │ │ ├── basic │ │ ├── basic-b.html │ │ └── index.html │ │ ├── id-ignore │ │ ├── b.html │ │ └── index.html │ │ ├── id-good │ │ ├── b.html │ │ └── index.html │ │ ├── external │ │ ├── bad.html │ │ ├── good.html │ │ └── excluded.html │ │ └── circular │ │ ├── circular-a.html │ │ ├── circular-b.html │ │ └── index.html ├── helpers.go ├── archive.go ├── linkcheck_test.go ├── datastructures.go └── linkcheck.go ├── main.go ├── .github └── workflows │ └── go.yml ├── go.mod ├── README.md ├── LICENSE.txt ├── xhtml └── xhtml.go └── go.sum /linkcheck/test-fixtures/sample-site/id-bad/b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | NO ID HERE 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/basic/basic-b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Basic link 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/id-ignore/b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/basic/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Basic link 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/id-bad/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Bad ID link 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/id-good/b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Good IDs 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/id-good/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Good ID link 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/external/bad.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Bad external link 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/id-ignore/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | App ID link 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/external/good.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Good external link 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/external/excluded.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Bad external link 4 | 5 | 6 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/circular/circular-a.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Self link 4 | Circular link 5 | 6 | 7 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/circular/circular-b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Self link 4 | Circular link 5 | 6 | 7 | -------------------------------------------------------------------------------- /linkcheck/test-fixtures/sample-site/circular/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Circular link1 4 | Circular link2 5 | 6 | 7 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/carlmjohnson/exitcode" 7 | "github.com/spotlightpa/linkrot/linkcheck" 8 | ) 9 | 10 | func main() { 11 | exitcode.Exit(linkcheck.CLI(os.Args[1:])) 12 | } 13 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: [ push, pull_request ] 4 | 5 | jobs: 6 | 7 | 
build: 8 | name: Build 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-go@v3 13 | with: 14 | go-version: '1.21' 15 | cache: true 16 | - name: Get dependencies 17 | run: go mod download 18 | - name: Test 19 | run: go test -race -v ./... 20 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/spotlightpa/linkrot 2 | 3 | // +heroku goVersion go1.21 4 | // +heroku install ./... 5 | 6 | go 1.21 7 | 8 | toolchain go1.21.0 9 | 10 | require ( 11 | github.com/carlmjohnson/be v0.23.1 12 | github.com/carlmjohnson/exitcode v0.20.2 13 | github.com/carlmjohnson/flagx v0.22.2 14 | github.com/carlmjohnson/flowmatic v0.23.4 15 | github.com/carlmjohnson/requests v0.23.4 16 | github.com/carlmjohnson/versioninfo v0.22.5 17 | github.com/getsentry/sentry-go v0.23.0 18 | golang.org/x/net v0.14.0 19 | golang.org/x/time v0.3.0 20 | ) 21 | 22 | require ( 23 | github.com/carlmjohnson/deque v0.23.1 // indirect 24 | golang.org/x/sys v0.11.0 // indirect 25 | golang.org/x/text v0.12.0 // indirect 26 | ) 27 | -------------------------------------------------------------------------------- /linkcheck/helpers.go: -------------------------------------------------------------------------------- 1 | package linkcheck 2 | 3 | import ( 4 | "net/url" 5 | "sort" 6 | ) 7 | 8 | func removeFragment(link string) string { 9 | u, _ := url.Parse(link) 10 | u.Fragment = "" 11 | return u.String() 12 | } 13 | 14 | func splitFragment(linkIn string) (link, frag string) { 15 | u, _ := url.Parse(linkIn) 16 | frag = u.Fragment 17 | u.Fragment = "" 18 | link = u.String() 19 | return 20 | } 21 | 22 | func sliceToSet(ss []string) map[string]bool { 23 | set := make(map[string]bool, len(ss)) 24 | for _, s := range ss { 25 | set[s] = true 26 | } 27 | return set 28 | } 29 | 30 | func setToSlice(set map[string]bool) []string { 31 | ss := make([]string, 0, len(set)) 32 | for s := range set { 33 | ss = append(ss, s) 34 | } 35 | sort.Strings(ss) 36 | return ss 37 | } 38 | -------------------------------------------------------------------------------- /linkcheck/archive.go: -------------------------------------------------------------------------------- 1 | package linkcheck 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "net/http" 7 | "os" 8 | "os/signal" 9 | "time" 10 | 11 | "github.com/carlmjohnson/requests" 12 | "golang.org/x/time/rate" 13 | ) 14 | 15 | func (c *crawler) archiveAll(pages crawledPages) error { 16 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 17 | defer cancel() 18 | ctx, stop := signal.NotifyContext(ctx, os.Interrupt) 19 | defer stop() 20 | 21 | // See https://archive.org/details/toomanyrequests_20191110 22 | l := rate.NewLimiter(rate.Every(time.Minute/15), 1) 23 | 24 | // queue good URLs 25 | queue := make([]string, 0, len(pages)) 26 | for u, pi := range pages { 27 | if pi.err == nil { 28 | queue = append(queue, u) 29 | } 30 | } 31 | 32 | count := 0 33 | var errs []error 34 | for _, page := range queue { 35 | if err := l.Wait(ctx); err != nil { 36 | break 37 | } 38 | if err := c.archive(ctx, page); err != nil { 39 | errs = append(errs, err) 40 | c.l.Error("archive", "error", err, "url", page) 41 | continue 42 | } 43 | 44 | count++ 45 | c.l.Info("archive", "n", count, "total", len(queue), "url", page) 46 | } 47 | return errors.Join(errs...) 
48 | } 49 | 50 | func (c *crawler) archive(ctx context.Context, page string) error { 51 | return requests. 52 | URL("https://web.archive.org"). 53 | Pathf("/save/%s", page). 54 | Head(). 55 | Client(&http.Client{ 56 | Timeout: 30 * time.Second, 57 | }). 58 | Fetch(ctx) 59 | } 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | linkrot 2 | ========= 3 | 4 | Linkrot takes a root URL and recurses down through the links it finds in the 5 | HTML pages, checking for broken links. Optionally, it can report broken links 6 | to Sentry. 7 | 8 | Usage 9 | ----- 10 | 11 | ``` 12 | $ linkrot -h 13 | Usage of linkrot (v0.21.0): 14 | 15 | linkrot [options] 16 | 17 | linkrot takes a root URL and recurses down through the links it finds 18 | in the HTML pages, checking for broken links (HTTP status != 200). 19 | 20 | Options may also be specified as env vars prefixed with "LINKROT_". 21 | 22 | Options: 23 | 24 | -crawlers int 25 | number of concurrent crawlers (default 8) 26 | -exclude URL prefix 27 | URL prefix to ignore; can repeat to exclude multiple URLs 28 | -sentry-dsn pseudo-URL 29 | Sentry DSN pseudo-URL 30 | -should-archive 31 | send links to archive.org 32 | -timeout duration 33 | timeout for requesting a URL (default 10s) 34 | -verbose 35 | verbose 36 | 37 | $ linkrot -verbose http://example.com 38 | linkrot 2019/07/23 10:40:54 starting 4 crawlers 39 | linkrot 2019/07/23 10:40:54 Got OK: http://example.com/ 40 | linkrot 2019/07/23 10:40:54 url http://example.com/ links to http://www.iana.org/domains/example 41 | linkrot 2019/07/23 10:40:55 Got OK: http://www.iana.org/domains/example 42 | ``` 43 | 44 | Installation 45 | ------------ 46 | 47 | Requires [Go](https://golang.org/) to be installed. 48 | 49 | ``` 50 | $ go install github.com/spotlightpa/linkrot@latest 51 | ``` 52 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2013 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /xhtml/xhtml.go: -------------------------------------------------------------------------------- 1 | // Package xhtml makes x/net/html easier 2 | package xhtml 3 | 4 | import ( 5 | "net/url" 6 | 7 | "golang.org/x/net/html" 8 | "golang.org/x/net/html/atom" 9 | ) 10 | 11 | const ( 12 | Done = false 13 | Continue = true 14 | ) 15 | 16 | func BreadFirst(n *html.Node, yield func(*html.Node) bool) bool { 17 | if yield(n) == Done { 18 | return Done 19 | } 20 | for c := n.FirstChild; c != nil; c = c.NextSibling { 21 | if BreadFirst(c, yield) == Done { 22 | return Done 23 | } 24 | } 25 | return Continue 26 | } 27 | 28 | func VisitAll(n *html.Node, callback func(*html.Node)) { 29 | BreadFirst(n, func(n *html.Node) bool { 30 | callback(n) 31 | return true 32 | }) 33 | } 34 | 35 | func IDs(doc *html.Node) (ids []string) { 36 | VisitAll(doc, func(n *html.Node) { 37 | ids = appendIDs(n, ids) 38 | }) 39 | 40 | return ids 41 | } 42 | 43 | func Links(pageurl *url.URL, doc *html.Node) (links []string) { 44 | VisitAll(doc, func(n *html.Node) { 45 | if link := linkFromAHref(pageurl, n); link != "" { 46 | links = append(links, link) 47 | } 48 | }) 49 | 50 | return links 51 | } 52 | 53 | func linkFromAHref(pageurl *url.URL, n *html.Node) (link string) { 54 | if n.DataAtom != atom.A { 55 | return 56 | } 57 | 58 | return resolveRef(pageurl, href(n)) 59 | } 60 | 61 | func appendIDs(n *html.Node, ids []string) []string { 62 | for _, attr := range n.Attr { 63 | if attr.Key == "id" { 64 | ids = append(ids, attr.Val) 65 | } 66 | // collect old fashioned anchors 67 | if n.DataAtom == atom.A && attr.Key == "name" { 68 | ids = append(ids, attr.Val) 69 | } 70 | } 71 | 72 | return ids 73 | } 74 | 75 | func href(n *html.Node) string { 76 | for _, attr := range n.Attr { 77 | if attr.Key == "href" { 78 | return attr.Val 79 | } 80 | } 81 | return "" 82 | } 83 | 84 | func resolveRef(baseurl *url.URL, ref string) string { 85 | u, err := baseurl.Parse(ref) 86 | if err != nil { 87 | return "" 88 | } 89 | return u.String() 90 | } 91 | -------------------------------------------------------------------------------- /linkcheck/linkcheck_test.go: -------------------------------------------------------------------------------- 1 | package linkcheck 2 | 3 | import ( 4 | "io" 5 | "log" 6 | "log/slog" 7 | "net/http" 8 | "net/http/httptest" 9 | "testing" 10 | 11 | "github.com/carlmjohnson/be" 12 | ) 13 | 14 | func TestRun(t *testing.T) { 15 | // Silence during test 16 | log.SetOutput(io.Discard) 17 | 18 | // Special for excluded path test 19 | excludePaths := []string{"https://example.com/excluded-path"} 20 | 21 | // Test server for our known sites 22 | ts := httptest.NewServer(http.FileServer(http.Dir("test-fixtures/sample-site"))) 23 | defer ts.Close() 24 | 25 | var testcases = []struct { 26 | name string 27 | base string 28 | crawlers int 29 | errLen int 30 | contains string 31 | }{ 32 | {"basic failure", ts.URL 
+ "/404", 1, 1, "unexpected status: 404"}, 33 | {"basic success", ts.URL + "/basic/", 1, 0, ""}, 34 | {"more crawlers failure", ts.URL + "/404", 5, 1, "unexpected status: 404"}, 35 | {"more crawlers success", ts.URL + "/basic/", 5, 0, ""}, 36 | {"circular success", ts.URL + "/circular/", 1, 0, ""}, 37 | {"good external link", ts.URL + "/external/good.html", 1, 0, ""}, 38 | {"bad external link", ts.URL + "/external/bad.html", 1, 1, "unexpected status: 404"}, 39 | {"excluded path", ts.URL + "external/excluded.html", 1, 0, ""}, 40 | {"good ID link", ts.URL + "/id-good/", 1, 0, ""}, 41 | {"bad ID link", ts.URL + "/id-bad/", 1, 1, "missing fragment"}, 42 | {"ignore ID link", ts.URL + "/id-ignore/", 1, 0, ""}, 43 | } 44 | 45 | for _, test := range testcases { 46 | test := test 47 | t.Run(test.name, func(t *testing.T) { 48 | c := crawler{ 49 | test.base, 50 | test.crawlers, 51 | excludePaths, 52 | slog.New(slog.NewTextHandler(io.Discard, nil)), 53 | ts.Client(), 54 | false, 55 | } 56 | 57 | pages, _ := c.crawl() 58 | errs := pages.toURLErrors(c.base) 59 | output := errs.String() 60 | 61 | be.DebugLog(t, "errors: %q", errs) 62 | be.Equal(t, test.errLen, len(errs)) 63 | 64 | if test.contains == "" { 65 | be.Zero(t, output) 66 | } else { 67 | be.In(t, test.contains, output) 68 | } 69 | }) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /linkcheck/datastructures.go: -------------------------------------------------------------------------------- 1 | package linkcheck 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "path" 7 | "strings" 8 | ) 9 | 10 | type pageInfo struct { 11 | ids map[string]bool 12 | links map[string]bool 13 | err error 14 | } 15 | 16 | type crawledPages map[string]*pageInfo 17 | 18 | func (cp crawledPages) toURLErrors(base string) urlErrors { 19 | if u, err := url.Parse(base); err == nil { 20 | u.Path = path.Dir(u.Path) 21 | base = u.String() 22 | } 23 | requestErrs := make(urlErrors) 24 | // Put all errors into errs 25 | for url, pi := range cp { 26 | if pi.err != nil { 27 | requestErrs[url] = &pageError{pi.err, nil, nil} 28 | } 29 | } 30 | // For each page, if one of its links is in errs, 31 | // add that to the back refs and check for its 32 | // link ids in frags 33 | fragErrs := make(urlErrors) 34 | for page, pi := range cp { 35 | // ignore pages off site 36 | if !strings.HasPrefix(page, base) { 37 | continue 38 | } 39 | for link := range pi.links { 40 | link, frag := splitFragment(link) 41 | if pe, ok := requestErrs[link]; ok { 42 | pe.refs = append(pe.refs, page) 43 | } 44 | // Ignore empty # and URLs that look like JS apps (#!, #/) 45 | if frag == "" || 46 | strings.HasPrefix(frag, "!") || 47 | strings.HasPrefix(frag, "/") || 48 | // Chrome Scroll to Text Fragment 49 | strings.HasPrefix(frag, ":~:") { 50 | continue 51 | } 52 | 53 | if target, ok := cp[link]; ok && target.ids[frag] { 54 | continue 55 | } 56 | if !strings.HasPrefix(link, base) { 57 | continue 58 | } 59 | // fragment was missing 60 | pe := fragErrs[link] 61 | if pe == nil { 62 | pe = &pageError{ErrMissingFragment, nil, make(map[string]bool)} 63 | fragErrs[link] = pe 64 | } 65 | pe.refs = append(pe.refs, page) 66 | pe.missingFragments[frag] = true 67 | } 68 | } 69 | // Merge errors 70 | for url, pe := range fragErrs { 71 | requestErrs[url] = pe 72 | } 73 | return requestErrs 74 | } 75 | 76 | type pageError struct { 77 | err error 78 | refs []string 79 | missingFragments map[string]bool 80 | } 81 | 82 | type urlErrors map[string]*pageError 83 | 84 | func (ue 
urlErrors) String() string { 85 | var buf strings.Builder 86 | for page, pe := range ue { 87 | fmt.Fprintf(&buf, "%q: %v\n", page, pe.err) 88 | if pe.err == ErrMissingFragment { 89 | fmt.Fprintf(&buf, "- ids: %s\n", 90 | strings.Join(setToSlice(pe.missingFragments), ", "), 91 | ) 92 | } 93 | fmt.Fprintf(&buf, " - refs: %s\n", strings.Join(pe.refs, ", ")) 94 | } 95 | return buf.String() 96 | } 97 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/carlmjohnson/be v0.23.1 h1:lmzkNRv25/mptDQ1ywXMvgQ6u6IZMYna/KHWAquM1II= 2 | github.com/carlmjohnson/be v0.23.1/go.mod h1:KAgPUh0HpzWYZZI+IABdo80wTgY43YhbdsiLYAaSI/Q= 3 | github.com/carlmjohnson/deque v0.23.1 h1:X2HOJM9xcglY03deMZ0oZ1V2xtbqYV7dJDnZiSZN4Ak= 4 | github.com/carlmjohnson/deque v0.23.1/go.mod h1:LF5NJjICBrEOPx84pxPL4nCimy5n9NQjxKi5cXkh+8U= 5 | github.com/carlmjohnson/exitcode v0.20.2 h1:vE6rmkCGNA4kO4m1qwWIa77PKlUBVg46cNjs22eAOXE= 6 | github.com/carlmjohnson/exitcode v0.20.2/go.mod h1:MZ6ThCDx517DQcrpYnnns1pLh8onjFl+B/AsrOrdmpc= 7 | github.com/carlmjohnson/flagx v0.22.2 h1:UXf7gL4Ffv5RIH/HKp8CGNzDyopgezFLrDO1m4F8jWc= 8 | github.com/carlmjohnson/flagx v0.22.2/go.mod h1:obobISvBnxgEXPLBITVXhRUOlSlzza1SGt34M64CPJc= 9 | github.com/carlmjohnson/flowmatic v0.23.4 h1:SfK6f+zKUlw4aga1ph+7/csqVeUAWnBxfqKN5gvQzzs= 10 | github.com/carlmjohnson/flowmatic v0.23.4/go.mod h1:Jpvyl591Dvkt9chYpnVupjxlKvqkZ9CtCmqL4wfQD7U= 11 | github.com/carlmjohnson/requests v0.23.4 h1:AxcvapfB9RPXLSyvAHk9YJoodQ43ZjzNHj6Ft3tQGdg= 12 | github.com/carlmjohnson/requests v0.23.4/go.mod h1:Qzp6tW4DQyainPP+tGwiJTzwxvElTIKm0B191TgTtOA= 13 | github.com/carlmjohnson/versioninfo v0.22.5 h1:O00sjOLUAFxYQjlN/bzYTuZiS0y6fWDQjMRvwtKgwwc= 14 | github.com/carlmjohnson/versioninfo v0.22.5/go.mod h1:QT9mph3wcVfISUKd0i9sZfVrPviHuSF+cUtLjm2WSf8= 15 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 16 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 17 | github.com/getsentry/sentry-go v0.23.0 h1:dn+QRCeJv4pPt9OjVXiMcGIBIefaTJPw/h0bZWO05nE= 18 | github.com/getsentry/sentry-go v0.23.0/go.mod h1:lc76E2QywIyW8WuBnwl8Lc4bkmQH4+w1gwTf25trprY= 19 | github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= 20 | github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= 21 | github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= 22 | github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 23 | github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= 24 | github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= 25 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 26 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 27 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 28 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 29 | github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= 30 | github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 31 | golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14= 32 | golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= 33 | golang.org/x/sys 
v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= 34 | golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 35 | golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc= 36 | golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 37 | golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= 38 | golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= 39 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 40 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 41 | -------------------------------------------------------------------------------- /linkcheck/linkcheck.go: -------------------------------------------------------------------------------- 1 | // Original copyright/license below. 2 | // 3 | // Copyright 2013 The Go Authors. All rights reserved. 4 | // Use of this source code is governed by a BSD-style 5 | // license that can be found in the LICENSE file. 6 | 7 | // Package linkcheck finds missing links in the given website. 8 | // It crawls a URL recursively and notes URLs and URL fragments 9 | // that it's seen and prints a report of missing links at the end. 10 | package linkcheck 11 | 12 | import ( 13 | "context" 14 | "errors" 15 | "flag" 16 | "fmt" 17 | "log" 18 | "log/slog" 19 | "net" 20 | "net/http" 21 | "net/url" 22 | "os" 23 | "os/signal" 24 | "runtime" 25 | "strings" 26 | "time" 27 | 28 | "github.com/carlmjohnson/exitcode" 29 | "github.com/carlmjohnson/flagx" 30 | "github.com/carlmjohnson/flowmatic" 31 | "github.com/carlmjohnson/requests" 32 | "github.com/carlmjohnson/versioninfo" 33 | sentry "github.com/getsentry/sentry-go" 34 | "github.com/spotlightpa/linkrot/xhtml" 35 | "golang.org/x/net/html" 36 | ) 37 | 38 | // Errors native to linkcheck 39 | var ( 40 | ErrCancelled = exitcode.Set(errors.New("scraping canceled by SIGINT"), 3) 41 | ErrBadLinks = exitcode.Set(errors.New("found bad links"), 4) 42 | ErrMissingFragment = errors.New("page missing fragments") 43 | ) 44 | 45 | const ( 46 | safariUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Safari/605.1.15" 47 | ) 48 | 49 | // CLI runs the linkrot executable, equivalent to calling it on the command line. 50 | func CLI(args []string) error { 51 | fl := flag.NewFlagSet("linkrot", flag.ContinueOnError) 52 | fl.Usage = func() { 53 | const usage = `Usage of linkrot %s %s: 54 | 55 | linkrot [options] 56 | 57 | linkrot takes a root URL and recurses down through the links it finds 58 | in the HTML pages, checking for broken links (HTTP status != 200). 59 | 60 | Options may also be specified as env vars prefixed with "LINKROT_". 61 | 62 | Options: 63 | 64 | ` 65 | fmt.Fprintf(os.Stderr, usage, versioninfo.Version, versioninfo.Revision) 66 | fl.PrintDefaults() 67 | } 68 | versioninfo.AddFlag(fl) 69 | verbose := fl.Bool("verbose", false, "verbose") 70 | crawlers := fl.Int("crawlers", runtime.NumCPU(), "number of concurrent crawlers") 71 | timeout := fl.Duration("timeout", 10*time.Second, "timeout for requesting a URL") 72 | var excludePaths []string 73 | fl.Func("exclude", "`URL prefix` to ignore; can repeat to exclude multiple URLs", func(s string) error { 74 | excludePaths = append(excludePaths, strings.Split(s, ",")...) 
75 | return nil 76 | }) 77 | dsn := fl.String("sentry-dsn", "", "Sentry DSN `pseudo-URL`") 78 | shouldArchive := fl.Bool("should-archive", false, "send links to archive.org") 79 | if err := fl.Parse(args); err != nil { 80 | return err 81 | } 82 | if err := flagx.ParseEnv(fl, "linkrot"); err != nil { 83 | return err 84 | } 85 | 86 | root := fl.Arg(0) 87 | if root == "" { 88 | root = "http://localhost:8000" 89 | } 90 | 91 | base, err := url.Parse(root) 92 | if err != nil { 93 | log.Printf("parsing root URL: %v", err) 94 | return err 95 | } 96 | 97 | if base.Path == "" { 98 | base.Path = "/" 99 | } 100 | 101 | if *crawlers < 1 { 102 | log.Printf("need at least one crawler") 103 | return fmt.Errorf("bad crawler count: %d", *crawlers) 104 | } 105 | var opt slog.HandlerOptions 106 | if *verbose { 107 | opt = slog.HandlerOptions{ 108 | Level: slog.LevelDebug, 109 | } 110 | } 111 | logger := slog.New(slog.NewTextHandler(os.Stderr, &opt)) 112 | 113 | cl := &http.Client{ 114 | Timeout: *timeout, 115 | Jar: requests.NewCookieJar(), 116 | } 117 | 118 | c := &crawler{ 119 | base.String(), 120 | *crawlers, 121 | excludePaths, 122 | logger, 123 | cl, 124 | *shouldArchive, 125 | } 126 | 127 | c.sentryInit(*dsn) 128 | 129 | return c.run() 130 | } 131 | 132 | type crawler struct { 133 | base string 134 | workers int 135 | excludePaths []string 136 | l *slog.Logger 137 | *http.Client 138 | shouldArchive bool 139 | } 140 | 141 | func (c *crawler) sentryInit(dsn string) { 142 | sentry.Init(sentry.ClientOptions{ 143 | Dsn: dsn, 144 | }) 145 | } 146 | 147 | func (c *crawler) run() error { 148 | pages, cancelled := c.crawl() 149 | urlerrs := pages.toURLErrors(c.base) 150 | c.reportToSentry(urlerrs) 151 | if c.shouldArchive { 152 | c.l.Info("start", "baseurl", c.base) 153 | if errs := c.archiveAll(pages); errs != nil { 154 | c.l.Error("error", "error", errs) 155 | } else { 156 | c.l.Info("done") 157 | } 158 | } 159 | 160 | var err error 161 | if cancelled { 162 | err = ErrCancelled 163 | } else if len(urlerrs) > 0 { 164 | err = ErrBadLinks 165 | } 166 | 167 | return err 168 | } 169 | 170 | func (c *crawler) crawl() (crawled crawledPages, cancelled bool) { 171 | c.l.Debug("crawl()", "workers", c.workers) 172 | // subscribe to SIGINT signals, so that we still output on early exit 173 | ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) 174 | defer cancel() 175 | 176 | task := func(url string) (*pageInfo, error) { 177 | c.l.Debug("doFetch", "url", url) 178 | links, ids, err := c.doFetch(ctx, url) 179 | if err != nil { 180 | c.l.Error("doFetch", "error", err, "url", url) 181 | return &pageInfo{err: err}, err 182 | } 183 | c.l.Debug("doFetch done", 184 | "url", url, 185 | "links", len(links), 186 | "ids", len(ids)) 187 | return &pageInfo{ 188 | links: sliceToSet(links), 189 | ids: sliceToSet(ids), 190 | }, nil 191 | } 192 | 193 | queued := make(map[string]bool) 194 | crawled = make(crawledPages) 195 | 196 | manager := func(url string, info *pageInfo, err error) ([]string, bool) { 197 | crawled[url] = info 198 | if err != nil { 199 | return nil, true 200 | } 201 | var addToQueue []string 202 | for link := range info.links { 203 | link = removeFragment(link) 204 | if !queued[link] { 205 | addToQueue = append(addToQueue, link) 206 | queued[link] = true 207 | } 208 | } 209 | return addToQueue, true 210 | } 211 | 212 | flowmatic.ManageTasks(c.workers, task, manager, c.base) 213 | 214 | cancelled = ctx.Err() != nil 215 | 216 | return crawled, cancelled 217 | } 218 | 219 | func (c *crawler) doFetch(ctx 
context.Context, pageurl string) (links, ids []string, err error) { 220 | method := http.MethodGet 221 | if c.isExternal(pageurl) { 222 | method = http.MethodHead 223 | } 224 | var doc html.Node 225 | for { 226 | err = requests. 227 | URL(pageurl). 228 | Method(method). 229 | Accept("text/html,application/xhtml+xml,application/xml,*/*"). 230 | Header("Accept-Language", "en-US,en;q=0.9"). 231 | UserAgent(safariUserAgent). 232 | Client(c.Client). 233 | CheckStatus(http.StatusOK). 234 | CheckContentType( 235 | "text/html", 236 | "application/xhtml+xml", 237 | "text/xml", 238 | "text/plain", 239 | ). 240 | CheckPeek(512, func(b []byte) error { 241 | if ct := http.DetectContentType(b); !strings.Contains(ct, "html") { 242 | return fmt.Errorf("content-type is %s", ct) 243 | } 244 | return nil 245 | }). 246 | AddValidator(func(res *http.Response) error { 247 | // If we've been 30X redirected, pageurl will not be response URL 248 | pageurl = res.Request.URL.String() 249 | return nil 250 | }). 251 | Handle(requests.ToHTML(&doc)). 252 | Fetch(ctx) 253 | if method == http.MethodGet || err == nil { 254 | break 255 | } 256 | method = http.MethodGet 257 | } 258 | switch { 259 | case err == nil: 260 | break 261 | case 262 | // report 401, 404, 410; ignore temporary status errors 263 | requests.HasStatusErr(err, 264 | http.StatusUnauthorized, 265 | http.StatusNotFound, 266 | http.StatusGone), 267 | // Report DNS errors 268 | errors.As(err, new(*net.DNSError)): 269 | 270 | return nil, nil, err 271 | 272 | default: 273 | // Ignore other errors 274 | c.l.Debug("doFetch ignore", "url", pageurl, "err", err) 275 | return nil, nil, nil 276 | } 277 | 278 | if !c.isExternal(pageurl) { 279 | ids = xhtml.IDs(&doc) 280 | // must be a good URL coz I fetched it 281 | u, _ := url.Parse(pageurl) 282 | allLinks := xhtml.Links(u, &doc) 283 | for _, link := range allLinks { 284 | if !c.isExcluded(link) { 285 | links = append(links, link) 286 | } 287 | } 288 | } 289 | 290 | return links, ids, nil 291 | } 292 | 293 | func (c *crawler) isExternal(link string) bool { 294 | return !strings.HasPrefix(link, c.base) 295 | } 296 | 297 | func (c *crawler) isExcluded(link string) bool { 298 | if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") { 299 | return true 300 | } 301 | 302 | for _, prefixPath := range c.excludePaths { 303 | if strings.HasPrefix(link, prefixPath) { 304 | return true 305 | } 306 | } 307 | return false 308 | } 309 | 310 | func (c *crawler) reportToSentry(errs urlErrors) { 311 | defer sentry.Flush(10 * time.Second) 312 | for url, pe := range errs { 313 | sentry.WithScope(func(scope *sentry.Scope) { 314 | event := sentry.NewEvent() 315 | scope.SetFingerprint([]string{url}) 316 | scope.SetTag("URL", url) 317 | errType := "request error" 318 | if pe.err == ErrMissingFragment { 319 | errType = "missing page IDs" 320 | frags := setToSlice(pe.missingFragments) 321 | scope.SetExtra("missing page IDs", frags) 322 | } 323 | scope.SetTag("failure type", errType) 324 | scope.SetExtra("affected-pages", pe.refs) 325 | event.Exception = []sentry.Exception{{ 326 | Type: url, 327 | Value: pe.err.Error(), 328 | }} 329 | sentry.CaptureEvent(event) 330 | }) 331 | } 332 | } 333 | --------------------------------------------------------------------------------
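
The xhtml package shown above is the repository's only exported helper, so the following is a minimal, self-contained sketch of how its Links and IDs functions might be exercised on their own; the sample markup, the example URLs, and the package name `main` here are invented purely for illustration and are not part of the repository.

```go
package main

import (
	"fmt"
	"log"
	"net/url"
	"strings"

	"golang.org/x/net/html"

	"github.com/spotlightpa/linkrot/xhtml"
)

func main() {
	// Hypothetical page body; the markup is illustrative only.
	const page = `<html><body>
  <h2 id="intro">Intro</h2>
  <a name="legacy" href="#intro">Jump to intro</a>
  <a href="/about">About</a>
  <a href="https://www.iana.org/domains/example">IANA</a>
</body></html>`

	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		log.Fatal(err)
	}

	// Links are resolved against the URL the page was fetched from.
	base, err := url.Parse("https://example.com/index.html")
	if err != nil {
		log.Fatal(err)
	}

	// xhtml.Links resolves every <a href> against the page URL.
	for _, link := range xhtml.Links(base, doc) {
		fmt.Println("link:", link)
	}

	// xhtml.IDs collects id attributes plus old-fashioned <a name> anchors.
	for _, id := range xhtml.IDs(doc) {
		fmt.Println("id:", id)
	}
}
```

This mirrors how the crawler uses the package: doFetch hands each parsed document to xhtml.Links and xhtml.IDs, and toURLErrors later checks each link's #fragment against the target page's collected IDs to flag missing anchors.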