├── linkcheck
│   ├── test-fixtures
│   │   └── sample-site
│   │       ├── id-bad
│   │       │   ├── b.html
│   │       │   └── index.html
│   │       ├── basic
│   │       │   ├── basic-b.html
│   │       │   └── index.html
│   │       ├── id-ignore
│   │       │   ├── b.html
│   │       │   └── index.html
│   │       ├── id-good
│   │       │   ├── b.html
│   │       │   └── index.html
│   │       ├── external
│   │       │   ├── bad.html
│   │       │   ├── good.html
│   │       │   └── excluded.html
│   │       └── circular
│   │           ├── circular-a.html
│   │           ├── circular-b.html
│   │           └── index.html
│   ├── helpers.go
│   ├── archive.go
│   ├── linkcheck_test.go
│   ├── datastructures.go
│   └── linkcheck.go
├── main.go
├── .github
│   └── workflows
│       └── go.yml
├── go.mod
├── README.md
├── LICENSE.txt
├── xhtml
│   └── xhtml.go
└── go.sum
/linkcheck/test-fixtures/sample-site/id-bad/b.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | NO ID HERE
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/basic/basic-b.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Basic link
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/id-ignore/b.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/basic/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Basic link
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/id-bad/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Bad ID link
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/id-good/b.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Good IDs
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/id-good/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Good ID link
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/external/bad.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Bad external link
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/id-ignore/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | App ID link
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/external/good.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Good external link
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/external/excluded.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Bad external link
4 |
5 |
6 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/circular/circular-a.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Self link
4 | Circular link
5 |
6 |
7 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/circular/circular-b.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Self link
4 | Circular link
5 |
6 |
7 |
--------------------------------------------------------------------------------
/linkcheck/test-fixtures/sample-site/circular/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Circular link1
4 | Circular link2
5 |
6 |
7 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/carlmjohnson/exitcode"
7 | "github.com/spotlightpa/linkrot/linkcheck"
8 | )
9 |
10 | func main() {
11 | exitcode.Exit(linkcheck.CLI(os.Args[1:]))
12 | }
13 |
--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
1 | name: Go
2 |
3 | on: [ push, pull_request ]
4 |
5 | jobs:
6 |
7 | build:
8 | name: Build
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v3
12 | - uses: actions/setup-go@v3
13 | with:
14 | go-version: '1.21'
15 | cache: true
16 | - name: Get dependencies
17 | run: go mod download
18 | - name: Test
19 | run: go test -race -v ./...
20 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/spotlightpa/linkrot
2 |
3 | // +heroku goVersion go1.21
4 | // +heroku install ./...
5 |
6 | go 1.21
7 |
8 | toolchain go1.21.0
9 |
10 | require (
11 | github.com/carlmjohnson/be v0.23.1
12 | github.com/carlmjohnson/exitcode v0.20.2
13 | github.com/carlmjohnson/flagx v0.22.2
14 | github.com/carlmjohnson/flowmatic v0.23.4
15 | github.com/carlmjohnson/requests v0.23.4
16 | github.com/carlmjohnson/versioninfo v0.22.5
17 | github.com/getsentry/sentry-go v0.23.0
18 | golang.org/x/net v0.14.0
19 | golang.org/x/time v0.3.0
20 | )
21 |
22 | require (
23 | github.com/carlmjohnson/deque v0.23.1 // indirect
24 | golang.org/x/sys v0.11.0 // indirect
25 | golang.org/x/text v0.12.0 // indirect
26 | )
27 |
--------------------------------------------------------------------------------
/linkcheck/helpers.go:
--------------------------------------------------------------------------------
1 | package linkcheck
2 |
3 | import (
4 | "net/url"
5 | "sort"
6 | )
7 |
8 | func removeFragment(link string) string {
9 | u, _ := url.Parse(link)
10 | u.Fragment = ""
11 | return u.String()
12 | }
13 |
14 | func splitFragment(linkIn string) (link, frag string) {
15 | u, _ := url.Parse(linkIn)
16 | frag = u.Fragment
17 | u.Fragment = ""
18 | link = u.String()
19 | return
20 | }
21 |
22 | func sliceToSet(ss []string) map[string]bool {
23 | set := make(map[string]bool, len(ss))
24 | for _, s := range ss {
25 | set[s] = true
26 | }
27 | return set
28 | }
29 |
30 | func setToSlice(set map[string]bool) []string {
31 | ss := make([]string, 0, len(set))
32 | for s := range set {
33 | ss = append(ss, s)
34 | }
35 | sort.Strings(ss)
36 | return ss
37 | }
38 |
--------------------------------------------------------------------------------
/linkcheck/archive.go:
--------------------------------------------------------------------------------
1 | package linkcheck
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "net/http"
7 | "os"
8 | "os/signal"
9 | "time"
10 |
11 | "github.com/carlmjohnson/requests"
12 | "golang.org/x/time/rate"
13 | )
14 |
15 | func (c *crawler) archiveAll(pages crawledPages) error {
16 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
17 | defer cancel()
18 | ctx, stop := signal.NotifyContext(ctx, os.Interrupt)
19 | defer stop()
20 |
21 | // See https://archive.org/details/toomanyrequests_20191110
22 | l := rate.NewLimiter(rate.Every(time.Minute/15), 1)
23 |
24 | // queue good URLs
25 | queue := make([]string, 0, len(pages))
26 | for u, pi := range pages {
27 | if pi.err == nil {
28 | queue = append(queue, u)
29 | }
30 | }
31 |
32 | count := 0
33 | var errs []error
34 | for _, page := range queue {
35 | if err := l.Wait(ctx); err != nil {
36 | break
37 | }
38 | if err := c.archive(ctx, page); err != nil {
39 | errs = append(errs, err)
40 | c.l.Error("archive", "error", err, "url", page)
41 | continue
42 | }
43 |
44 | count++
45 | c.l.Info("archive", "n", count, "total", len(queue), "url", page)
46 | }
47 | return errors.Join(errs...)
48 | }
49 |
50 | func (c *crawler) archive(ctx context.Context, page string) error {
51 | return requests.
52 | URL("https://web.archive.org").
53 | Pathf("/save/%s", page).
54 | Head().
55 | Client(&http.Client{
56 | Timeout: 30 * time.Second,
57 | }).
58 | Fetch(ctx)
59 | }
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | linkrot
2 | =========
3 |
4 | Linkrot takes a root URL and recurses down through the links it finds in the
5 | HTML pages, checking for broken links. Optionally, it can report broken links
6 | to Sentry.
7 |
8 | Usage
9 | -----
10 |
11 | ```
12 | $ linkrot -h
13 | Usage of linkrot (v0.21.0):
14 |
15 | linkrot [options]
16 |
17 | linkrot takes a root URL and recurses down through the links it finds
18 | in the HTML pages, checking for broken links (HTTP status != 200).
19 |
20 | Options may also be specified as env vars prefixed with "LINKROT_".
21 |
22 | Options:
23 |
24 | -crawlers int
25 | number of concurrent crawlers (default 8)
26 | -exclude URL prefix
27 | URL prefix to ignore; can repeat to exclude multiple URLs
28 | -sentry-dsn pseudo-URL
29 | Sentry DSN pseudo-URL
30 | -should-archive
31 | send links to archive.org
32 | -timeout duration
33 | timeout for requesting a URL (default 10s)
34 | -verbose
35 | verbose
36 |
37 | $ linkrot -verbose http://example.com
38 | linkrot 2019/07/23 10:40:54 starting 4 crawlers
39 | linkrot 2019/07/23 10:40:54 Got OK: http://example.com/
40 | linkrot 2019/07/23 10:40:54 url http://example.com/ links to http://www.iana.org/domains/example
41 | linkrot 2019/07/23 10:40:55 Got OK: http://www.iana.org/domains/example
42 | ```
43 |
44 | Installation
45 | ------------
46 |
47 | Requires [Go](https://golang.org/) to be installed.
48 |
49 | ```
50 | $ go install github.com/spotlightpa/linkrot@latest
51 | ```
52 |
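53 | Once installed, a typical run might look like this (the excluded prefix is
54 | just an illustration):
55 |
56 | ```
57 | $ linkrot -verbose -exclude https://example.com/drafts/ http://example.com
58 | ```
59 |
60 | Library use
61 | -----------
62 |
63 | The checker can also be embedded in another Go program. A minimal sketch,
64 | mirroring what this repository's main.go does:
65 |
66 | ```
67 | package main
68 |
69 | import (
70 | 	"os"
71 |
72 | 	"github.com/carlmjohnson/exitcode"
73 | 	"github.com/spotlightpa/linkrot/linkcheck"
74 | )
75 |
76 | func main() {
77 | 	// CLI parses the flags and env vars described above, crawls the root
78 | 	// URL given as the first argument, and returns an error if bad links
79 | 	// were found; exitcode.Exit maps that error to the process exit code.
80 | 	exitcode.Exit(linkcheck.CLI(os.Args[1:]))
81 | }
82 | ```
83 |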
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2013 The Go Authors. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | * Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 | * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 | * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/xhtml/xhtml.go:
--------------------------------------------------------------------------------
1 | // Package xhtml makes golang.org/x/net/html easier to use.
2 | package xhtml
3 |
4 | import (
5 | "net/url"
6 |
7 | "golang.org/x/net/html"
8 | "golang.org/x/net/html/atom"
9 | )
10 |
11 | const (
12 | Done = false
13 | Continue = true
14 | )
15 |
16 | func BreadFirst(n *html.Node, yield func(*html.Node) bool) bool {
17 | if yield(n) == Done {
18 | return Done
19 | }
20 | for c := n.FirstChild; c != nil; c = c.NextSibling {
21 | if BreadFirst(c, yield) == Done {
22 | return Done
23 | }
24 | }
25 | return Continue
26 | }
27 |
28 | func VisitAll(n *html.Node, callback func(*html.Node)) {
29 | BreadFirst(n, func(n *html.Node) bool {
30 | callback(n)
31 | return true
32 | })
33 | }
34 |
35 | func IDs(doc *html.Node) (ids []string) {
36 | VisitAll(doc, func(n *html.Node) {
37 | ids = appendIDs(n, ids)
38 | })
39 |
40 | return ids
41 | }
42 |
43 | func Links(pageurl *url.URL, doc *html.Node) (links []string) {
44 | VisitAll(doc, func(n *html.Node) {
45 | if link := linkFromAHref(pageurl, n); link != "" {
46 | links = append(links, link)
47 | }
48 | })
49 |
50 | return links
51 | }
52 |
53 | func linkFromAHref(pageurl *url.URL, n *html.Node) (link string) {
54 | if n.DataAtom != atom.A {
55 | return
56 | }
57 |
58 | return resolveRef(pageurl, href(n))
59 | }
60 |
61 | func appendIDs(n *html.Node, ids []string) []string {
62 | for _, attr := range n.Attr {
63 | if attr.Key == "id" {
64 | ids = append(ids, attr.Val)
65 | }
66 | // collect old-fashioned anchors
67 | if n.DataAtom == atom.A && attr.Key == "name" {
68 | ids = append(ids, attr.Val)
69 | }
70 | }
71 |
72 | return ids
73 | }
74 |
75 | func href(n *html.Node) string {
76 | for _, attr := range n.Attr {
77 | if attr.Key == "href" {
78 | return attr.Val
79 | }
80 | }
81 | return ""
82 | }
83 |
84 | func resolveRef(baseurl *url.URL, ref string) string {
85 | u, err := baseurl.Parse(ref)
86 | if err != nil {
87 | return ""
88 | }
89 | return u.String()
90 | }
91 |
--------------------------------------------------------------------------------
/linkcheck/linkcheck_test.go:
--------------------------------------------------------------------------------
1 | package linkcheck
2 |
3 | import (
4 | "io"
5 | "log"
6 | "log/slog"
7 | "net/http"
8 | "net/http/httptest"
9 | "testing"
10 |
11 | "github.com/carlmjohnson/be"
12 | )
13 |
14 | func TestRun(t *testing.T) {
15 | // Silence during test
16 | log.SetOutput(io.Discard)
17 |
18 | // Special for excluded path test
19 | excludePaths := []string{"https://example.com/excluded-path"}
20 |
21 | // Test server for our known sites
22 | ts := httptest.NewServer(http.FileServer(http.Dir("test-fixtures/sample-site")))
23 | defer ts.Close()
24 |
25 | var testcases = []struct {
26 | name string
27 | base string
28 | crawlers int
29 | errLen int
30 | contains string
31 | }{
32 | {"basic failure", ts.URL + "/404", 1, 1, "unexpected status: 404"},
33 | {"basic success", ts.URL + "/basic/", 1, 0, ""},
34 | {"more crawlers failure", ts.URL + "/404", 5, 1, "unexpected status: 404"},
35 | {"more crawlers success", ts.URL + "/basic/", 5, 0, ""},
36 | {"circular success", ts.URL + "/circular/", 1, 0, ""},
37 | {"good external link", ts.URL + "/external/good.html", 1, 0, ""},
38 | {"bad external link", ts.URL + "/external/bad.html", 1, 1, "unexpected status: 404"},
39 | {"excluded path", ts.URL + "external/excluded.html", 1, 0, ""},
40 | {"good ID link", ts.URL + "/id-good/", 1, 0, ""},
41 | {"bad ID link", ts.URL + "/id-bad/", 1, 1, "missing fragment"},
42 | {"ignore ID link", ts.URL + "/id-ignore/", 1, 0, ""},
43 | }
44 |
45 | for _, test := range testcases {
46 | test := test
47 | t.Run(test.name, func(t *testing.T) {
48 | c := crawler{
49 | test.base,
50 | test.crawlers,
51 | excludePaths,
52 | slog.New(slog.NewTextHandler(io.Discard, nil)),
53 | ts.Client(),
54 | false,
55 | }
56 |
57 | pages, _ := c.crawl()
58 | errs := pages.toURLErrors(c.base)
59 | output := errs.String()
60 |
61 | be.DebugLog(t, "errors: %q", errs)
62 | be.Equal(t, test.errLen, len(errs))
63 |
64 | if test.contains == "" {
65 | be.Zero(t, output)
66 | } else {
67 | be.In(t, test.contains, output)
68 | }
69 | })
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/linkcheck/datastructures.go:
--------------------------------------------------------------------------------
1 | package linkcheck
2 |
3 | import (
4 | "fmt"
5 | "net/url"
6 | "path"
7 | "strings"
8 | )
9 |
10 | type pageInfo struct {
11 | ids map[string]bool
12 | links map[string]bool
13 | err error
14 | }
15 |
16 | type crawledPages map[string]*pageInfo
17 |
18 | func (cp crawledPages) toURLErrors(base string) urlErrors {
19 | if u, err := url.Parse(base); err == nil {
20 | u.Path = path.Dir(u.Path)
21 | base = u.String()
22 | }
23 | requestErrs := make(urlErrors)
24 | // Put all errors into errs
25 | for url, pi := range cp {
26 | if pi.err != nil {
27 | requestErrs[url] = &pageError{pi.err, nil, nil}
28 | }
29 | }
30 | // For each page, if one of its links is in errs,
31 | // add that to the back refs and check for its
32 | // link ids in frags
33 | fragErrs := make(urlErrors)
34 | for page, pi := range cp {
35 | // ignore pages off site
36 | if !strings.HasPrefix(page, base) {
37 | continue
38 | }
39 | for link := range pi.links {
40 | link, frag := splitFragment(link)
41 | if pe, ok := requestErrs[link]; ok {
42 | pe.refs = append(pe.refs, page)
43 | }
44 | // Ignore empty # and URLs that look like JS apps (#!, #/)
45 | if frag == "" ||
46 | strings.HasPrefix(frag, "!") ||
47 | strings.HasPrefix(frag, "/") ||
48 | // Chrome Scroll to Text Fragment
49 | strings.HasPrefix(frag, ":~:") {
50 | continue
51 | }
52 |
53 | if target, ok := cp[link]; ok && target.ids[frag] {
54 | continue
55 | }
56 | if !strings.HasPrefix(link, base) {
57 | continue
58 | }
59 | // fragment was missing
60 | pe := fragErrs[link]
61 | if pe == nil {
62 | pe = &pageError{ErrMissingFragment, nil, make(map[string]bool)}
63 | fragErrs[link] = pe
64 | }
65 | pe.refs = append(pe.refs, page)
66 | pe.missingFragments[frag] = true
67 | }
68 | }
69 | // Merge errors
70 | for url, pe := range fragErrs {
71 | requestErrs[url] = pe
72 | }
73 | return requestErrs
74 | }
75 |
76 | type pageError struct {
77 | err error
78 | refs []string
79 | missingFragments map[string]bool
80 | }
81 |
82 | type urlErrors map[string]*pageError
83 |
84 | func (ue urlErrors) String() string {
85 | var buf strings.Builder
86 | for page, pe := range ue {
87 | fmt.Fprintf(&buf, "%q: %v\n", page, pe.err)
88 | if pe.err == ErrMissingFragment {
89 | fmt.Fprintf(&buf, "- ids: %s\n",
90 | strings.Join(setToSlice(pe.missingFragments), ", "),
91 | )
92 | }
93 | fmt.Fprintf(&buf, " - refs: %s\n", strings.Join(pe.refs, ", "))
94 | }
95 | return buf.String()
96 | }
97 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/carlmjohnson/be v0.23.1 h1:lmzkNRv25/mptDQ1ywXMvgQ6u6IZMYna/KHWAquM1II=
2 | github.com/carlmjohnson/be v0.23.1/go.mod h1:KAgPUh0HpzWYZZI+IABdo80wTgY43YhbdsiLYAaSI/Q=
3 | github.com/carlmjohnson/deque v0.23.1 h1:X2HOJM9xcglY03deMZ0oZ1V2xtbqYV7dJDnZiSZN4Ak=
4 | github.com/carlmjohnson/deque v0.23.1/go.mod h1:LF5NJjICBrEOPx84pxPL4nCimy5n9NQjxKi5cXkh+8U=
5 | github.com/carlmjohnson/exitcode v0.20.2 h1:vE6rmkCGNA4kO4m1qwWIa77PKlUBVg46cNjs22eAOXE=
6 | github.com/carlmjohnson/exitcode v0.20.2/go.mod h1:MZ6ThCDx517DQcrpYnnns1pLh8onjFl+B/AsrOrdmpc=
7 | github.com/carlmjohnson/flagx v0.22.2 h1:UXf7gL4Ffv5RIH/HKp8CGNzDyopgezFLrDO1m4F8jWc=
8 | github.com/carlmjohnson/flagx v0.22.2/go.mod h1:obobISvBnxgEXPLBITVXhRUOlSlzza1SGt34M64CPJc=
9 | github.com/carlmjohnson/flowmatic v0.23.4 h1:SfK6f+zKUlw4aga1ph+7/csqVeUAWnBxfqKN5gvQzzs=
10 | github.com/carlmjohnson/flowmatic v0.23.4/go.mod h1:Jpvyl591Dvkt9chYpnVupjxlKvqkZ9CtCmqL4wfQD7U=
11 | github.com/carlmjohnson/requests v0.23.4 h1:AxcvapfB9RPXLSyvAHk9YJoodQ43ZjzNHj6Ft3tQGdg=
12 | github.com/carlmjohnson/requests v0.23.4/go.mod h1:Qzp6tW4DQyainPP+tGwiJTzwxvElTIKm0B191TgTtOA=
13 | github.com/carlmjohnson/versioninfo v0.22.5 h1:O00sjOLUAFxYQjlN/bzYTuZiS0y6fWDQjMRvwtKgwwc=
14 | github.com/carlmjohnson/versioninfo v0.22.5/go.mod h1:QT9mph3wcVfISUKd0i9sZfVrPviHuSF+cUtLjm2WSf8=
15 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
16 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
17 | github.com/getsentry/sentry-go v0.23.0 h1:dn+QRCeJv4pPt9OjVXiMcGIBIefaTJPw/h0bZWO05nE=
18 | github.com/getsentry/sentry-go v0.23.0/go.mod h1:lc76E2QywIyW8WuBnwl8Lc4bkmQH4+w1gwTf25trprY=
19 | github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA=
20 | github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og=
21 | github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
22 | github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
23 | github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=
24 | github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8=
25 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
26 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
27 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
28 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
29 | github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
30 | github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
31 | golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14=
32 | golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI=
33 | golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM=
34 | golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
35 | golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc=
36 | golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
37 | golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
38 | golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
39 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
40 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
41 |
--------------------------------------------------------------------------------
/linkcheck/linkcheck.go:
--------------------------------------------------------------------------------
1 | // Original copyright/license below.
2 | //
3 | // Copyright 2013 The Go Authors. All rights reserved.
4 | // Use of this source code is governed by a BSD-style
5 | // license that can be found in the LICENSE file.
6 |
7 | // Package linkcheck finds missing links in the given website.
8 | // It crawls a URL recursively and notes URLs and URL fragments
9 | // that it's seen and prints a report of missing links at the end.
10 | package linkcheck
11 |
12 | import (
13 | "context"
14 | "errors"
15 | "flag"
16 | "fmt"
17 | "log"
18 | "log/slog"
19 | "net"
20 | "net/http"
21 | "net/url"
22 | "os"
23 | "os/signal"
24 | "runtime"
25 | "strings"
26 | "time"
27 |
28 | "github.com/carlmjohnson/exitcode"
29 | "github.com/carlmjohnson/flagx"
30 | "github.com/carlmjohnson/flowmatic"
31 | "github.com/carlmjohnson/requests"
32 | "github.com/carlmjohnson/versioninfo"
33 | sentry "github.com/getsentry/sentry-go"
34 | "github.com/spotlightpa/linkrot/xhtml"
35 | "golang.org/x/net/html"
36 | )
37 |
38 | // Errors native to linkcheck
39 | var (
40 | ErrCancelled = exitcode.Set(errors.New("scraping canceled by SIGINT"), 3)
41 | ErrBadLinks = exitcode.Set(errors.New("found bad links"), 4)
42 | ErrMissingFragment = errors.New("page missing fragments")
43 | )
44 |
45 | const (
46 | safariUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Safari/605.1.15"
47 | )
48 |
49 | // CLI runs the linkrot executable, equivalent to calling it on the command line.
50 | func CLI(args []string) error {
51 | fl := flag.NewFlagSet("linkrot", flag.ContinueOnError)
52 | fl.Usage = func() {
53 | const usage = `Usage of linkrot %s %s:
54 |
55 | linkrot [options]
56 |
57 | linkrot takes a root URL and recurses down through the links it finds
58 | in the HTML pages, checking for broken links (HTTP status != 200).
59 |
60 | Options may also be specified as env vars prefixed with "LINKROT_".
61 |
62 | Options:
63 |
64 | `
65 | fmt.Fprintf(os.Stderr, usage, versioninfo.Version, versioninfo.Revision)
66 | fl.PrintDefaults()
67 | }
68 | versioninfo.AddFlag(fl)
69 | verbose := fl.Bool("verbose", false, "verbose")
70 | crawlers := fl.Int("crawlers", runtime.NumCPU(), "number of concurrent crawlers")
71 | timeout := fl.Duration("timeout", 10*time.Second, "timeout for requesting a URL")
72 | var excludePaths []string
73 | fl.Func("exclude", "`URL prefix` to ignore; can repeat to exclude multiple URLs", func(s string) error {
74 | excludePaths = append(excludePaths, strings.Split(s, ",")...)
75 | return nil
76 | })
77 | dsn := fl.String("sentry-dsn", "", "Sentry DSN `pseudo-URL`")
78 | shouldArchive := fl.Bool("should-archive", false, "send links to archive.org")
79 | if err := fl.Parse(args); err != nil {
80 | return err
81 | }
82 | if err := flagx.ParseEnv(fl, "linkrot"); err != nil {
83 | return err
84 | }
85 |
86 | root := fl.Arg(0)
87 | if root == "" {
88 | root = "http://localhost:8000"
89 | }
90 |
91 | base, err := url.Parse(root)
92 | if err != nil {
93 | log.Printf("parsing root URL: %v", err)
94 | return err
95 | }
96 |
97 | if base.Path == "" {
98 | base.Path = "/"
99 | }
100 |
101 | if *crawlers < 1 {
102 | log.Printf("need at least one crawler")
103 | return fmt.Errorf("bad crawler count: %d", *crawlers)
104 | }
105 | var opt slog.HandlerOptions
106 | if *verbose {
107 | opt = slog.HandlerOptions{
108 | Level: slog.LevelDebug,
109 | }
110 | }
111 | logger := slog.New(slog.NewTextHandler(os.Stderr, &opt))
112 |
113 | cl := &http.Client{
114 | Timeout: *timeout,
115 | Jar: requests.NewCookieJar(),
116 | }
117 |
118 | c := &crawler{
119 | base.String(),
120 | *crawlers,
121 | excludePaths,
122 | logger,
123 | cl,
124 | *shouldArchive,
125 | }
126 |
127 | c.sentryInit(*dsn)
128 |
129 | return c.run()
130 | }
131 |
132 | type crawler struct {
133 | base string
134 | workers int
135 | excludePaths []string
136 | l *slog.Logger
137 | *http.Client
138 | shouldArchive bool
139 | }
140 |
141 | func (c *crawler) sentryInit(dsn string) {
142 | sentry.Init(sentry.ClientOptions{
143 | Dsn: dsn,
144 | })
145 | }
146 |
147 | func (c *crawler) run() error {
148 | pages, cancelled := c.crawl()
149 | urlerrs := pages.toURLErrors(c.base)
150 | c.reportToSentry(urlerrs)
151 | if c.shouldArchive {
152 | c.l.Info("start", "baseurl", c.base)
153 | if errs := c.archiveAll(pages); errs != nil {
154 | c.l.Error("error", "error", errs)
155 | } else {
156 | c.l.Info("done")
157 | }
158 | }
159 |
160 | var err error
161 | if cancelled {
162 | err = ErrCancelled
163 | } else if len(urlerrs) > 0 {
164 | err = ErrBadLinks
165 | }
166 |
167 | return err
168 | }
169 |
170 | func (c *crawler) crawl() (crawled crawledPages, cancelled bool) {
171 | c.l.Debug("crawl()", "workers", c.workers)
172 | // subscribe to SIGINT signals, so that we still output on early exit
173 | ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
174 | defer cancel()
175 |
176 | task := func(url string) (*pageInfo, error) {
177 | c.l.Debug("doFetch", "url", url)
178 | links, ids, err := c.doFetch(ctx, url)
179 | if err != nil {
180 | c.l.Error("doFetch", "error", err, "url", url)
181 | return &pageInfo{err: err}, err
182 | }
183 | c.l.Debug("doFetch done",
184 | "url", url,
185 | "links", len(links),
186 | "ids", len(ids))
187 | return &pageInfo{
188 | links: sliceToSet(links),
189 | ids: sliceToSet(ids),
190 | }, nil
191 | }
192 |
193 | queued := make(map[string]bool)
194 | crawled = make(crawledPages)
195 |
196 | manager := func(url string, info *pageInfo, err error) ([]string, bool) {
197 | crawled[url] = info
198 | if err != nil {
199 | return nil, true
200 | }
201 | var addToQueue []string
202 | for link := range info.links {
203 | link = removeFragment(link)
204 | if !queued[link] {
205 | addToQueue = append(addToQueue, link)
206 | queued[link] = true
207 | }
208 | }
209 | return addToQueue, true
210 | }
211 |
212 | flowmatic.ManageTasks(c.workers, task, manager, c.base)
213 |
214 | cancelled = ctx.Err() != nil
215 |
216 | return crawled, cancelled
217 | }
218 |
219 | func (c *crawler) doFetch(ctx context.Context, pageurl string) (links, ids []string, err error) {
220 | method := http.MethodGet
221 | if c.isExternal(pageurl) {
222 | method = http.MethodHead
223 | }
224 | var doc html.Node
225 | for {
226 | err = requests.
227 | URL(pageurl).
228 | Method(method).
229 | Accept("text/html,application/xhtml+xml,application/xml,*/*").
230 | Header("Accept-Language", "en-US,en;q=0.9").
231 | UserAgent(safariUserAgent).
232 | Client(c.Client).
233 | CheckStatus(http.StatusOK).
234 | CheckContentType(
235 | "text/html",
236 | "application/xhtml+xml",
237 | "text/xml",
238 | "text/plain",
239 | ).
240 | CheckPeek(512, func(b []byte) error {
241 | if ct := http.DetectContentType(b); !strings.Contains(ct, "html") {
242 | return fmt.Errorf("content-type is %s", ct)
243 | }
244 | return nil
245 | }).
246 | AddValidator(func(res *http.Response) error {
247 | // If we've been 30X redirected, pageurl will not be response URL
248 | pageurl = res.Request.URL.String()
249 | return nil
250 | }).
251 | Handle(requests.ToHTML(&doc)).
252 | Fetch(ctx)
253 | if method == http.MethodGet || err == nil {
254 | break
255 | }
256 | method = http.MethodGet
257 | }
258 | switch {
259 | case err == nil:
260 | break
261 | case
262 | // report 401, 404, 410; ignore temporary status errors
263 | requests.HasStatusErr(err,
264 | http.StatusUnauthorized,
265 | http.StatusNotFound,
266 | http.StatusGone),
267 | // Report DNS errors
268 | errors.As(err, new(*net.DNSError)):
269 |
270 | return nil, nil, err
271 |
272 | default:
273 | // Ignore other errors
274 | c.l.Debug("doFetch ignore", "url", pageurl, "err", err)
275 | return nil, nil, nil
276 | }
277 |
278 | if !c.isExternal(pageurl) {
279 | ids = xhtml.IDs(&doc)
280 | // must be a good URL because we just fetched it
281 | u, _ := url.Parse(pageurl)
282 | allLinks := xhtml.Links(u, &doc)
283 | for _, link := range allLinks {
284 | if !c.isExcluded(link) {
285 | links = append(links, link)
286 | }
287 | }
288 | }
289 |
290 | return links, ids, nil
291 | }
292 |
293 | func (c *crawler) isExternal(link string) bool {
294 | return !strings.HasPrefix(link, c.base)
295 | }
296 |
297 | func (c *crawler) isExcluded(link string) bool {
298 | if !strings.HasPrefix(link, "http://") && !strings.HasPrefix(link, "https://") {
299 | return true
300 | }
301 |
302 | for _, prefixPath := range c.excludePaths {
303 | if strings.HasPrefix(link, prefixPath) {
304 | return true
305 | }
306 | }
307 | return false
308 | }
309 |
310 | func (c *crawler) reportToSentry(errs urlErrors) {
311 | defer sentry.Flush(10 * time.Second)
312 | for url, pe := range errs {
313 | sentry.WithScope(func(scope *sentry.Scope) {
314 | event := sentry.NewEvent()
315 | scope.SetFingerprint([]string{url})
316 | scope.SetTag("URL", url)
317 | errType := "request error"
318 | if pe.err == ErrMissingFragment {
319 | errType = "missing page IDs"
320 | frags := setToSlice(pe.missingFragments)
321 | scope.SetExtra("missing page IDs", frags)
322 | }
323 | scope.SetTag("failure type", errType)
324 | scope.SetExtra("affected-pages", pe.refs)
325 | event.Exception = []sentry.Exception{{
326 | Type: url,
327 | Value: pe.err.Error(),
328 | }}
329 | sentry.CaptureEvent(event)
330 | })
331 | }
332 | }
333 |
--------------------------------------------------------------------------------