├── .github └── workflows │ ├── audit.yml │ └── ci.yml ├── LICENSE ├── README.md ├── cmd └── weaver │ └── main.go ├── go.mod ├── go.sum ├── weaver.go ├── weaver.png └── weaver_test.go /.github/workflows/audit.yml: -------------------------------------------------------------------------------- 1 | name: Security audit 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | 7 | jobs: 8 | security_audit: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: golang/govulncheck-action@v1 12 | with: 13 | go-version-input: 'stable' 14 | check-latest: true 15 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # Based on https://github.com/mvdan/github-actions-golang 2 | on: [pull_request, workflow_dispatch] 3 | name: CI 4 | jobs: 5 | test: 6 | strategy: 7 | matrix: 8 | go-version: ['stable'] 9 | os: [ubuntu-latest, macos-latest, windows-latest] 10 | runs-on: ${{ matrix.os }} 11 | steps: 12 | - uses: actions/setup-go@v4 13 | with: 14 | go-version: ${{ matrix.go-version }} 15 | - uses: actions/checkout@v3 16 | - run: go test ./... 17 | 18 | gocritic: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/setup-go@v4 22 | - uses: actions/checkout@v3 23 | - run: | 24 | go install github.com/go-critic/go-critic/cmd/gocritic@latest 25 | gocritic check . 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 John Arundel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](https://pkg.go.dev/github.com/bitfield/weaver) 2 | [](https://goreportcard.com/report/github.com/bitfield/weaver) 3 |  4 |  5 | 6 | # Weaver 7 | 8 |  9 | 10 | `weaver` is a command-line tool for checking links on websites. 11 | 12 | > *Old stories would tell how Weavers would kill each other over aesthetic disagreements, such as whether it was prettier to destroy an army of a thousand men or to leave it be, or whether a particular dandelion should or should not be plucked. For a Weaver, to think was to think aesthetically. To act—to Weave—was to bring about more pleasing patterns. They did not eat physical food: they seemed to subsist on the appreciation of beauty.*\ 13 | —China Miéville, [“Perdido Street Station”](https://amzn.to/4603LLS) 14 | 15 | 16 | Here's how to install it: 17 | 18 | ```sh 19 | go install github.com/bitfield/weaver/cmd/weaver@latest 20 | ``` 21 | 22 | To run it: 23 | 24 | ```sh 25 | weaver https://example.com 26 | ``` 27 | ``` 28 | Links: 2 (2 OK, 0 errors, 0 warnings) [1s] 29 | ``` 30 | 31 | ## Verbose mode 32 | 33 | To see more information about what's going on, use the `-v` flag: 34 | 35 | ```sh 36 | weaver -v https://example.com 37 | ``` 38 | ``` 39 | [OKAY] https://example.com (200 OK) (referrer: START) 40 | [OKAY] https://www.iana.org/domains/example (200 OK) (referrer: https://example.com) 41 | 42 | Links: 2 (2 OK, 0 errors, 0 warnings) [800ms] 43 | ``` 44 | 45 | ## How it works 46 | 47 | The program checks the status of the specified URL. If the server responds with an HTML page, the program will parse this page for links, and check each new link for its status. 48 | 49 | If the link points to the same domain as the original URL, it is also parsed for further links, and so on recursively until all links on the site have been visited. 50 | 51 | Any broken links will be reported, together with the referring page: 52 | 53 | ``` 54 | [DEAD] https://example.com/bogus (404 Not Found) (referrer: https://example.com/) 55 | ``` 56 | 57 | ## Rate limiting 58 | 59 | The program attempts to continuously adapt its request rate to suit the server. On receiving a `429 Too Many Requests` response, it will reduce the current request rate. After a while with no further 429 responses, it will steadily increase the rate until it trips the rate limit once again. 60 | 61 | Even without receiving any 429 responses, the program limits itself to a maximum of 5 requests per second, to be respectful of server resources. 62 | -------------------------------------------------------------------------------- /cmd/weaver/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/bitfield/weaver" 7 | ) 8 | 9 | func main() { 10 | os.Exit(weaver.Main()) 11 | } 12 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/bitfield/weaver 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.24.0 6 | 7 | require ( 8 | github.com/antchfx/htmlquery v1.3.1 9 | github.com/fatih/color v1.16.0 10 | github.com/google/go-cmp v0.6.0 11 | golang.org/x/time v0.5.0 12 | ) 13 | 14 | require ( 15 | github.com/antchfx/xpath v1.3.0 // indirect 16 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 17 | github.com/mattn/go-colorable v0.1.13 // indirect 18 | github.com/mattn/go-isatty v0.0.20 // indirect 19 | golang.org/x/net v0.38.0 // indirect 20 | golang.org/x/sys v0.31.0 // indirect 21 | golang.org/x/text v0.23.0 // indirect 22 | ) 23 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E= 2 | github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8= 3 | github.com/antchfx/htmlquery v1.3.1 h1:wm0LxjLMsZhRHfQKKZscDf2COyH4vDYA3wyH+qZ+Ylc= 4 | github.com/antchfx/htmlquery v1.3.1/go.mod h1:PTj+f1V2zksPlwNt7uVvZPsxpKNa7mlVliCRxLX6Nx8= 5 | github.com/antchfx/xpath v1.2.3 h1:CCZWOzv5bAqjVv0offZ2LVgVYFbeldKQVuLNbViZdes= 6 | github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= 7 | github.com/antchfx/xpath v1.3.0 h1:nTMlzGAK3IJ0bPpME2urTuFL76o4A96iYvoKFHRXJgc= 8 | github.com/antchfx/xpath v1.3.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= 9 | github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= 10 | github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= 11 | github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= 12 | github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= 13 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= 14 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 15 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 16 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 17 | github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= 18 | github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= 19 | github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= 20 | github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= 21 | github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= 22 | github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= 23 | github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 24 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 25 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 26 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 27 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 28 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 29 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 30 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 31 | golang.org/x/net v0.5.0 h1:GyT4nK/YDHSqa1c4753ouYCDajOYKTja9Xb/OHtgvSw= 32 | golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= 33 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 34 | golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= 35 | golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= 36 | golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= 37 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= 38 | golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= 39 | golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= 40 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 41 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 42 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 43 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 44 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 45 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 46 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 47 | golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 48 | golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 49 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 50 | golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= 51 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 52 | golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= 53 | golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 54 | golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= 55 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 56 | golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= 57 | golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 58 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 59 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 60 | golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= 61 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 62 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 63 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 64 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 65 | golang.org/x/text v0.6.0 h1:3XmdazWV+ubf7QgHSTWeykHOci5oeekaGJBLkrkaw4k= 66 | golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 67 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 68 | golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= 69 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 70 | golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= 71 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 72 | golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= 73 | golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= 74 | golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= 75 | golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= 76 | golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= 77 | golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 78 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 79 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 80 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 81 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 82 | -------------------------------------------------------------------------------- /weaver.go: -------------------------------------------------------------------------------- 1 | package weaver 2 | 3 | import ( 4 | "context" 5 | "crypto/tls" 6 | "errors" 7 | "flag" 8 | "fmt" 9 | "io" 10 | "net/http" 11 | "net/url" 12 | "os" 13 | "os/signal" 14 | "strings" 15 | "time" 16 | 17 | "github.com/antchfx/htmlquery" 18 | "github.com/fatih/color" 19 | "golang.org/x/time/rate" 20 | ) 21 | 22 | const ( 23 | maxRate rate.Limit = 5 24 | fakeUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" 25 | ) 26 | 27 | type Checker struct { 28 | Verbose bool 29 | Output io.Writer 30 | BaseURL *url.URL 31 | HTTPClient *http.Client 32 | Limiter *AdaptiveRateLimiter 33 | results []Result 34 | visited map[string]bool 35 | } 36 | 37 | func NewChecker() *Checker { 38 | return &Checker{ 39 | Verbose: false, 40 | Output: os.Stdout, 41 | HTTPClient: &http.Client{ 42 | Timeout: 5 * time.Second, 43 | }, 44 | Limiter: NewAdaptiveRateLimiter(), 45 | visited: map[string]bool{}, 46 | } 47 | } 48 | 49 | func (c *Checker) Check(ctx context.Context, site string) { 50 | base, err := url.Parse(site) 51 | if err != nil { 52 | c.RecordResult(site, "START", err, nil) 53 | return 54 | } 55 | c.BaseURL = base 56 | if !strings.HasSuffix(site, "/") { 57 | site += "/" 58 | } 59 | c.visited[site] = true 60 | c.Crawl(ctx, base, "START") 61 | } 62 | 63 | func (c *Checker) Crawl(ctx context.Context, page *url.URL, referrer string) { 64 | c.Limiter.Wait(ctx) 65 | req, err := http.NewRequest("GET", page.String(), nil) 66 | if err != nil { 67 | c.RecordResult(page.String(), referrer, err, nil) 68 | return 69 | } 70 | req.Header.Set("User-Agent", fakeUserAgent) 71 | resp, err := c.HTTPClient.Do(req) 72 | if err != nil { 73 | c.RecordResult(page.String(), referrer, err, resp) 74 | return 75 | } 76 | defer resp.Body.Close() 77 | if resp.StatusCode == http.StatusTooManyRequests { 78 | c.Limiter.ReduceLimit() 79 | if c.Verbose { 80 | fmt.Fprintf(c.Output, "[INFO] reducing rate limit to %.2fr/s\n", c.Limiter.Limit()) 81 | } 82 | c.Crawl(ctx, page, referrer) 83 | return 84 | } 85 | if c.Limiter.GraduallyIncreaseRateLimit() && c.Verbose { 86 | fmt.Fprintf(c.Output, "[INFO] increasing rate limit to %.2fr/s\n", c.Limiter.Limit()) 87 | } 88 | c.RecordResult(page.String(), referrer, err, resp) 89 | if page.Host != c.BaseURL.Host { 90 | return // skip parsing offsite pages 91 | } 92 | doc, err := htmlquery.Parse(resp.Body) 93 | if err != nil { 94 | return // skip invalid HTML 95 | } 96 | list := htmlquery.Find(doc, "//a/@href") 97 | for _, anchor := range list { 98 | link := htmlquery.SelectAttr(anchor, "href") 99 | u, err := url.Parse(link) 100 | if err != nil { 101 | c.RecordResult(link, page.String(), err, nil) 102 | return 103 | } 104 | if u.Scheme == "mailto" { 105 | continue 106 | } 107 | target := page.ResolveReference(u) 108 | if !c.visited[target.String()] { 109 | c.visited[target.String()] = true 110 | c.Crawl(ctx, target, page.String()) 111 | } 112 | } 113 | } 114 | 115 | func (c *Checker) RecordResult(link, referrer string, err error, resp *http.Response) { 116 | res := Result{ 117 | Status: StatusError, 118 | Link: link, 119 | Referrer: referrer, 120 | } 121 | if err != nil { 122 | res.Message = err.Error() 123 | var e *tls.CertificateVerificationError 124 | if errors.As(err, &e) { 125 | res.Status = StatusWarning 126 | } 127 | fmt.Fprintln(c.Output, res) 128 | c.results = append(c.results, res) 129 | return 130 | } 131 | res.Message = resp.Status 132 | switch resp.StatusCode { 133 | case http.StatusOK: 134 | res.Status = StatusOK 135 | case http.StatusNotFound, 136 | http.StatusNotAcceptable, 137 | http.StatusGone, 138 | http.StatusUnauthorized, 139 | http.StatusBadRequest, 140 | http.StatusForbidden: 141 | res.Status = StatusError 142 | default: 143 | res.Status = StatusWarning 144 | } 145 | if res.Status == StatusError || res.Status == StatusWarning || c.Verbose { 146 | fmt.Fprintln(c.Output, res) 147 | } 148 | c.results = append(c.results, res) 149 | } 150 | 151 | func (c *Checker) Results() []Result { 152 | return c.results 153 | } 154 | 155 | type Result struct { 156 | Link string 157 | Status Status 158 | Message string 159 | Referrer string 160 | } 161 | 162 | func (r Result) String() string { 163 | return fmt.Sprintf("[%s] %s (%s) — referrer: %s", 164 | r.Status, 165 | r.Link, 166 | r.Message, 167 | r.Referrer, 168 | ) 169 | } 170 | 171 | type Status string 172 | 173 | func (s Status) String() string { 174 | msg := string(s) 175 | switch s { 176 | case StatusOK, StatusSkipped: 177 | return color.GreenString(msg) 178 | case StatusWarning: 179 | return color.YellowString(msg) 180 | case StatusError: 181 | return color.RedString(msg) 182 | default: 183 | return msg 184 | } 185 | } 186 | 187 | const ( 188 | StatusOK Status = "OKAY" 189 | StatusWarning Status = "WARN" 190 | StatusError Status = "DEAD" 191 | StatusSkipped Status = "SKIP" 192 | ) 193 | 194 | var usage = `Usage: weaver [-v] URL 195 | 196 | Checks the website at URL, following all links and reporting any broken links or errors. 197 | 198 | In verbose mode (-v), reports all links found.` 199 | 200 | func Main() int { 201 | verbose := flag.Bool("v", false, "verbose output") 202 | flag.Parse() 203 | if len(flag.Args()) == 0 { 204 | fmt.Println(usage) 205 | return 0 206 | } 207 | site := flag.Args()[0] 208 | ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) 209 | defer cancel() 210 | c := NewChecker() 211 | c.Verbose = *verbose 212 | start := time.Now() 213 | go func() { 214 | c.Check(ctx, site) 215 | cancel() 216 | }() 217 | <-ctx.Done() 218 | results := c.Results() 219 | ok, errors, warnings := 0, 0, 0 220 | if len(results) > 0 { 221 | for _, link := range results { 222 | switch link.Status { 223 | case StatusOK, StatusSkipped: 224 | ok++ 225 | case StatusError: 226 | errors++ 227 | case StatusWarning: 228 | warnings++ 229 | } 230 | } 231 | } 232 | fmt.Printf("\nLinks: %d (%d OK, %d errors, %d warnings) [%s]\n", 233 | len(results), ok, errors, warnings, 234 | time.Since(start).Round(100*time.Millisecond), 235 | ) 236 | return 0 237 | } 238 | 239 | type AdaptiveRateLimiter struct { 240 | limiter *rate.Limiter 241 | limitLastUpdated time.Time 242 | } 243 | 244 | func NewAdaptiveRateLimiter() *AdaptiveRateLimiter { 245 | return &AdaptiveRateLimiter{ 246 | limiter: rate.NewLimiter(maxRate, 1), 247 | limitLastUpdated: time.Now(), 248 | } 249 | } 250 | 251 | func (a *AdaptiveRateLimiter) Wait(ctx context.Context) { 252 | a.limiter.Wait(ctx) 253 | } 254 | 255 | func (a *AdaptiveRateLimiter) GraduallyIncreaseRateLimit() (increased bool) { 256 | curLimit := a.limiter.Limit() 257 | if curLimit >= maxRate { 258 | return false 259 | } 260 | if time.Since(a.limitLastUpdated) <= 10*time.Second { 261 | return false 262 | } 263 | curLimit *= 1.5 264 | if curLimit > maxRate { 265 | curLimit = maxRate 266 | } 267 | a.limiter.SetLimit(curLimit) 268 | a.limitLastUpdated = time.Now() 269 | return true 270 | } 271 | 272 | func (a *AdaptiveRateLimiter) ReduceLimit() { 273 | curLimit := a.limiter.Limit() 274 | a.limiter.SetLimit(curLimit / 2) 275 | a.limitLastUpdated = time.Now() 276 | } 277 | 278 | func (a AdaptiveRateLimiter) Limit() rate.Limit { 279 | return a.limiter.Limit() 280 | } 281 | 282 | func (a AdaptiveRateLimiter) SetLimit(r rate.Limit) { 283 | a.limiter.SetLimit(r) 284 | } 285 | -------------------------------------------------------------------------------- /weaver.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bitfield/weaver/290ed6573ea32e6354b4fd683139ac987e0ba5d6/weaver.png -------------------------------------------------------------------------------- /weaver_test.go: -------------------------------------------------------------------------------- 1 | package weaver_test 2 | 3 | import ( 4 | "context" 5 | "io" 6 | "log" 7 | "net/http" 8 | "net/http/httptest" 9 | "testing" 10 | "testing/fstest" 11 | 12 | "github.com/bitfield/weaver" 13 | "github.com/google/go-cmp/cmp" 14 | "golang.org/x/time/rate" 15 | ) 16 | 17 | func TestCrawlReturnsExpectedResults(t *testing.T) { 18 | t.Parallel() 19 | ts := httptest.NewTLSServer( 20 | http.FileServerFS(testFS), 21 | ) 22 | defer ts.Close() 23 | c := weaver.NewChecker() 24 | c.HTTPClient = ts.Client() 25 | c.Output = io.Discard 26 | c.Limiter.SetLimit(rate.Inf) 27 | c.Check(context.Background(), ts.URL) 28 | want := []weaver.Result{ 29 | { 30 | Link: ts.URL, 31 | Status: weaver.StatusOK, 32 | Message: "200 OK", 33 | Referrer: "START", 34 | }, 35 | { 36 | Link: ts.URL + "/go/sucks.html", 37 | Status: weaver.StatusOK, 38 | Message: "200 OK", 39 | Referrer: ts.URL, 40 | }, 41 | { 42 | Link: ts.URL + "/bogus", 43 | Status: weaver.StatusError, 44 | Message: "404 Not Found", 45 | Referrer: ts.URL + "/go/sucks.html", 46 | }, 47 | { 48 | Link: ts.URL + "/go/post.html", 49 | Status: weaver.StatusOK, 50 | Message: "200 OK", 51 | Referrer: ts.URL + "/go/sucks.html", 52 | }, 53 | { 54 | Link: ts.URL + "/rust_rules.html", 55 | Status: weaver.StatusError, 56 | Message: "404 Not Found", 57 | Referrer: ts.URL, 58 | }, 59 | { 60 | Link: ts.URL + "/invalid_links.html", 61 | Status: weaver.StatusOK, 62 | Message: "200 OK", 63 | Referrer: ts.URL, 64 | }, 65 | { 66 | Link: "httq://invalid_scheme.html", 67 | Status: weaver.StatusError, 68 | Message: `Get "httq://invalid_scheme.html": unsupported protocol scheme "httq"`, 69 | Referrer: ts.URL + "/invalid_links.html", 70 | }, 71 | { 72 | Link: "http:// /", 73 | Status: weaver.StatusError, 74 | Message: `parse "http:// /": invalid character " " in host name`, 75 | Referrer: ts.URL + "/invalid_links.html", 76 | }, 77 | } 78 | got := c.Results() 79 | if !cmp.Equal(want, got) { 80 | t.Error(cmp.Diff(want, got)) 81 | } 82 | } 83 | 84 | func TestReduceLimit_SetsCorrectLimit(t *testing.T) { 85 | t.Parallel() 86 | a := weaver.NewAdaptiveRateLimiter() 87 | a.SetLimit(4) 88 | a.ReduceLimit() 89 | want := rate.Limit(2) 90 | got := a.Limit() 91 | if want != got { 92 | t.Errorf("want %.2f, got %.2f", want, got) 93 | } 94 | } 95 | 96 | func TestCertVerifyFailuresAreRecordedAsWarnings(t *testing.T) { 97 | t.Parallel() 98 | ts := httptest.NewTLSServer(nil) 99 | defer ts.Close() 100 | ts.Config.ErrorLog = log.New(io.Discard, "", 0) 101 | c := weaver.NewChecker() 102 | c.Output = io.Discard 103 | c.Check(context.Background(), ts.URL) 104 | got := c.Results() 105 | if len(got) != 1 { 106 | t.Fatalf("unexpected result set %v", got) 107 | } 108 | res := got[0] 109 | if res.Link != ts.URL { 110 | t.Errorf("want URL %q, got %q", ts.URL, res.Link) 111 | } 112 | if res.Status != weaver.StatusWarning { 113 | t.Errorf("want status %q, got %q", weaver.StatusWarning, res.Status) 114 | } 115 | } 116 | 117 | var testFS = fstest.MapFS{ 118 | "go/sucks.html": { 119 | Data: []byte(`