├── .github └── FUNDING.yml ├── .gitignore ├── go.mod ├── .travis.yml ├── example ├── short │ └── main.go ├── shortauto │ └── main.go └── full │ └── main.go ├── misc └── git │ └── pre-commit ├── LICENSE ├── go.sum ├── iq_slice.go ├── cmd.go ├── handler_test.go ├── doc.go ├── README.md ├── handler.go ├── cmd_test.go ├── fetch.go └── fetch_test.go /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [mna] 2 | custom: ["https://www.buymeacoffee.com/mna"] 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sublime-* 2 | .DS_Store 3 | *.swp 4 | *.swo 5 | #*.*# 6 | tags 7 | fetchbot.test 8 | example/full/full 9 | example/short/short 10 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/PuerkitoBio/fetchbot 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.6.1 7 | github.com/temoto/robotstxt v1.1.1 8 | ) 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: go 3 | 4 | go: 5 | - 1.3.x 6 | - 1.4.x 7 | - 1.5.x 8 | - 1.6.x 9 | - 1.7.x 10 | - 1.8.x 11 | - 1.9.x 12 | - "1.10.x" 13 | - "1.11.x" 14 | - "1.12.x" 15 | - "1.13.x" 16 | - tip 17 | -------------------------------------------------------------------------------- /example/short/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | 7 | "github.com/PuerkitoBio/fetchbot" 8 | ) 9 | 10 | func main() { 11 | f := fetchbot.New(fetchbot.HandlerFunc(handler)) 12 | queue := f.Start() 13 | 
queue.SendStringHead("http://google.com", "http://golang.org", "http://golang.org/doc") 14 | queue.Close() 15 | } 16 | 17 | func handler(ctx *fetchbot.Context, res *http.Response, err error) { 18 | if err != nil { 19 | fmt.Printf("error: %s\n", err) 20 | return 21 | } 22 | fmt.Printf("[%d] %s %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL()) 23 | } 24 | -------------------------------------------------------------------------------- /example/shortauto/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "time" 7 | 8 | "github.com/PuerkitoBio/fetchbot" 9 | ) 10 | 11 | func main() { 12 | f := fetchbot.New(fetchbot.HandlerFunc(handler)) 13 | f.AutoClose = true 14 | f.WorkerIdleTTL = time.Second 15 | queue := f.Start() 16 | queue.SendStringHead("http://google.com", "http://golang.org", "http://golang.org/doc") 17 | queue.Block() 18 | } 19 | 20 | func handler(ctx *fetchbot.Context, res *http.Response, err error) { 21 | if err != nil { 22 | fmt.Printf("error: %s\n", err) 23 | return 24 | } 25 | fmt.Printf("[%d] %s %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL()) 26 | } 27 | -------------------------------------------------------------------------------- /misc/git/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2012 The Go Authors. All rights reserved. 3 | # Use of this source code is governed by a BSD-style 4 | # license that can be found in the LICENSE file. 5 | 6 | # git gofmt pre-commit hook 7 | # 8 | # To use, store as .git/hooks/pre-commit inside your repository and make sure 9 | # it has execute permissions. 10 | # 11 | # This script does not handle file names that contain spaces. 12 | 13 | # golint is purely informational, it doesn't fail with exit code != 0 if it finds something, 14 | # because it may find a lot of false positives. Just print out its result for information. 
15 | echo "> lint" 16 | golint . 17 | echo "< lint" 18 | echo 19 | 20 | # go vet returns 1 if an error was found. Exit the hook with this exit code. 21 | echo "> vet" 22 | go vet ./... 23 | vetres=$? 24 | echo "< vet" 25 | echo 26 | 27 | # Check for gofmt problems and report if any. 28 | gofiles=$(git diff --cached --name-only --diff-filter=ACM | grep '.go$') 29 | [ -z "$gofiles" ] && echo "EXIT $vetres" && exit $vetres 30 | 31 | unformatted=$(gofmt -l $gofiles) 32 | [ -z "$unformatted" ] && echo "EXIT $vetres" && exit $vetres 33 | 34 | # Some files are not gofmt'd. Print message and fail. 35 | 36 | echo "> fmt" 37 | echo >&2 "Go files must be formatted with gofmt. Please run:" 38 | for fn in $unformatted; do 39 | echo >&2 " gofmt -w $PWD/$fn" 40 | done 41 | echo "< fmt" 42 | echo 43 | 44 | echo "EXIT 1" 45 | exit 1 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Martin Angers & Contributors 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | * Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 13 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= 2 | github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= 3 | github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk= 4 | github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= 5 | github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= 6 | github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 7 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 8 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 9 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 10 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 11 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 12 | github.com/stretchr/testify 
v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 13 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 14 | github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA= 15 | github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= 16 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 17 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 18 | golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= 19 | golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 20 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 21 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 22 | -------------------------------------------------------------------------------- /iq_slice.go: -------------------------------------------------------------------------------- 1 | /* 2 | https://github.com/kylelemons/iq 3 | 4 | Copyright 2010 Kyle Lemons 5 | Copyright 2011 Google, Inc. (for changes on or after Feb. 22, 2011) 6 | 7 | The accompanying software is licensed under the Common Development and 8 | Distribution License, Version 1.0 (CDDL-1.0, the "License"); you may not use 9 | any part of this software except in compliance with the License. 10 | 11 | You may obtain a copy of the License at 12 | http://opensource.org/licenses/CDDL-1.0 13 | More information about the CDDL can be found at 14 | http://hub.opensolaris.org/bin/view/Main/licensing_faq 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 18 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the 19 | License for the specific language governing permissions and limitations under 20 | the License. 21 | */ 22 | 23 | package fetchbot 24 | 25 | // sliceIQ creates an infinite buffered channel taking input on 26 | // in and sending output to next. SliceIQ should be run in its 27 | // own goroutine. 28 | func sliceIQ(in <-chan Command, next chan<- Command) { 29 | defer close(next) 30 | 31 | // pending events (this is the "infinite" part) 32 | pending := []Command{} 33 | 34 | recv: 35 | for { 36 | // Ensure that pending always has values so the select can 37 | // multiplex between the receiver and sender properly 38 | if len(pending) == 0 { 39 | v, ok := <-in 40 | if !ok { 41 | // in is closed, flush values 42 | break 43 | } 44 | 45 | // We now have something to send 46 | pending = append(pending, v) 47 | } 48 | 49 | select { 50 | // Queue incoming values 51 | case v, ok := <-in: 52 | if !ok { 53 | // in is closed, flush values 54 | break recv 55 | } 56 | pending = append(pending, v) 57 | 58 | // Send queued values 59 | case next <- pending[0]: 60 | pending[0] = nil 61 | pending = pending[1:] 62 | } 63 | } 64 | 65 | // After in is closed, we may still have events to send 66 | for _, v := range pending { 67 | next <- v 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /cmd.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Martin Angers and Contributors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package fetchbot 6 | 7 | import ( 8 | "io" 9 | "net/http" 10 | "net/url" 11 | ) 12 | 13 | // Command interface defines the methods required by the Fetcher to request 14 | // a resource. 
15 | type Command interface { 16 | URL() *url.URL 17 | Method() string 18 | } 19 | 20 | // BasicAuthProvider interface gets the credentials to use to perform the request 21 | // with Basic Authentication. 22 | type BasicAuthProvider interface { 23 | BasicAuth() (user string, pwd string) 24 | } 25 | 26 | // ReaderProvider interface gets the Reader to use as the Body of the request. It has 27 | // higher priority than the ValuesProvider interface, so that if both interfaces are implemented, 28 | // the ReaderProvider is used. 29 | type ReaderProvider interface { 30 | Reader() io.Reader 31 | } 32 | 33 | // ValuesProvider interface gets the values to send as the Body of the request. It has 34 | // lower priority than the ReaderProvider interface, so that if both interfaces are implemented, 35 | // the ReaderProvider is used. If the request has no explicit Content-Type set, it will be automatically 36 | // set to "application/x-www-form-urlencoded". 37 | type ValuesProvider interface { 38 | Values() url.Values 39 | } 40 | 41 | // CookiesProvider interface gets the cookies to send with the request. 42 | type CookiesProvider interface { 43 | Cookies() []*http.Cookie 44 | } 45 | 46 | // HeaderProvider interface gets the headers to set on the request. If an Authorization 47 | // header is set, it will be overridden by the BasicAuthProvider, if implemented. 48 | type HeaderProvider interface { 49 | Header() http.Header 50 | } 51 | 52 | // Cmd defines a basic Command implementation. 53 | type Cmd struct { 54 | U *url.URL 55 | M string 56 | } 57 | 58 | // URL returns the resource targeted by this command. 59 | func (c *Cmd) URL() *url.URL { 60 | return c.U 61 | } 62 | 63 | // Method returns the HTTP verb to use to process this command (i.e. "GET", "HEAD", etc.). 64 | func (c *Cmd) Method() string { 65 | return c.M 66 | } 67 | 68 | // HandlerCmd is a basic Command with its own Handler function that is called 69 | // to handle the HTTP response. 
70 | type HandlerCmd struct { 71 | *Cmd 72 | HandlerFunc 73 | } 74 | 75 | // NewHandlerCmd creates a HandlerCmd for the provided request and callback 76 | // handler function. 77 | func NewHandlerCmd(method, rawURL string, fn func(*Context, *http.Response, error)) (*HandlerCmd, error) { 78 | parsedURL, err := url.Parse(rawURL) 79 | if err != nil { 80 | return nil, err 81 | } 82 | return &HandlerCmd{&Cmd{parsedURL, method}, HandlerFunc(fn)}, nil 83 | } 84 | 85 | // robotCommand is a "sentinel type" used to distinguish the automatically enqueued robots.txt 86 | // command from the user-enqueued commands. 87 | type robotCommand struct { 88 | *Cmd 89 | } 90 | -------------------------------------------------------------------------------- /handler_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Martin Angers and Contributors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package fetchbot 6 | 7 | import ( 8 | "fmt" 9 | "net/http" 10 | "net/http/httptest" 11 | "net/url" 12 | "strings" 13 | "sync" 14 | "testing" 15 | ) 16 | 17 | type traceCmd struct { 18 | *Cmd 19 | Trace string 20 | } 21 | 22 | // Avoid data races 23 | var mu sync.Mutex 24 | 25 | func setTrace(l string) Handler { 26 | return HandlerFunc(func(ctx *Context, res *http.Response, err error) { 27 | if err != nil && testing.Verbose() { 28 | fmt.Println(err) 29 | } 30 | mu.Lock() 31 | defer mu.Unlock() 32 | ctx.Cmd.(*traceCmd).Trace = l 33 | }) 34 | } 35 | 36 | func TestMux(t *testing.T) { 37 | // Start the servers 38 | srv1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 39 | switch r.URL.Path { 40 | case "/robots.txt": 41 | w.Write([]byte(` 42 | User-agent: * 43 | Disallow: /deny 44 | `)) 45 | return 46 | case "/204": 47 | w.Header().Set("Content-Type", "text/plain") 48 | w.WriteHeader(204) 49 | w.Write(nil) 50 | return 51 | case "/4xx": 52 | w.Header().Set("Content-Type", "text/plain") 53 | w.WriteHeader(404) 54 | w.Write(nil) 55 | case "/json": 56 | w.Header().Set("Content-Type", "application/json") 57 | default: 58 | w.Header().Set("Content-Type", "text/html; charset=utf-8") 59 | } 60 | w.Write([]byte("1")) 61 | })) 62 | defer srv1.Close() 63 | srv2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 64 | w.Write([]byte("ok")) 65 | })) 66 | defer srv2.Close() 67 | srvu, err := url.Parse(srv1.URL) 68 | if err != nil { 69 | t.Fatal(err) 70 | } 71 | srv1Host := srvu.Host 72 | srvu, err = url.Parse(srv2.URL) 73 | if err != nil { 74 | t.Fatal(err) 75 | } 76 | srv2Host := srvu.Host 77 | 78 | // List of test cases 79 | cases := []struct { 80 | url string 81 | trace string 82 | }{ 83 | 0: {srv2.URL + "/none", "d"}, // no specific handler, use default 84 | 1: {srv1.URL + "/json", "j"}, // json-specific handler 85 | 2: {srv1.URL + "/deny", "a"}, // ErrDisallowed, use any errors handler 86 | 3: 
{srv1.URL + "/a", "g"}, // GET text/html 87 | 4: {srv1.URL + "/204", "s"}, // status 204 88 | 5: {srv1.URL + "/4xx", "r"}, // status range 4xx 89 | 6: {srv1.URL + "/b", "p"}, // path-specific handler 90 | 7: {srv1.URL + "/baba", "q"}, // path-specific handler 91 | 8: {srv1.URL + "/b/c", "p"}, // path-specific handler 92 | 9: {srv2.URL + "/zz", "r"}, // custom predicate 93 | } 94 | // Start the fetcher 95 | mux := NewMux() 96 | mux.HandleError(ErrEmptyHost, setTrace("e")) 97 | mux.HandleErrors(setTrace("a")) 98 | mux.DefaultHandler = setTrace("d") 99 | mux.Response().ContentType("application/json").Handler(setTrace("j")) 100 | mux.Response().Host(srv1Host).ContentType("text/html").Method("GET").Handler(setTrace("g")) 101 | mux.Response().ContentType("text/html").Method("HEAD").Handler(setTrace("h")) 102 | mux.Response().Host(srv1Host).Status(204).Handler(setTrace("s")) 103 | mux.Response().Host(srv1Host).StatusRange(400, 499).Handler(setTrace("r")) 104 | mux.Response().Path("/b").Handler(setTrace("p")) 105 | mux.Response().Path("/ba").Handler(setTrace("q")) 106 | mux.Response().Custom(func(res *http.Response) bool { 107 | return strings.Contains(res.Request.URL.Path, "zz") 108 | }).Handler(setTrace("r")) 109 | mux.Response().Host(srv2Host).Path("/b").Handler(setTrace("z")) 110 | f := New(mux) 111 | f.CrawlDelay = 0 112 | for i, c := range cases { 113 | parsed, err := url.Parse(c.url) 114 | if err != nil { 115 | t.Errorf("%d: error parsing url: %s", i, err) 116 | continue 117 | } 118 | cmd := &traceCmd{&Cmd{U: parsed, M: "GET"}, ""} 119 | 120 | q := f.Start() 121 | if err := q.Send(cmd); err != nil { 122 | t.Fatal(err) 123 | } 124 | q.Close() 125 | 126 | // make sure the call is out of the mux's handler 127 | mux.mu.Lock() 128 | mux.mu.Unlock() 129 | 130 | mu.Lock() 131 | if cmd.Trace != c.trace { 132 | t.Errorf("%d: expected trace '%s', got '%s'", i, c.trace, cmd.Trace) 133 | } 134 | mu.Unlock() 135 | } 136 | } 137 | 
-------------------------------------------------------------------------------- /example/full/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "net/http" 9 | "net/url" 10 | "runtime" 11 | "strings" 12 | "sync" 13 | "time" 14 | 15 | "github.com/PuerkitoBio/fetchbot" 16 | "github.com/PuerkitoBio/goquery" 17 | ) 18 | 19 | var ( 20 | // Protect access to dup 21 | mu sync.Mutex 22 | // Duplicates table 23 | dup = map[string]bool{} 24 | 25 | // Command-line flags 26 | seed = flag.String("seed", "http://golang.org", "seed URL") 27 | cancelAfter = flag.Duration("cancelafter", 0, "automatically cancel the fetchbot after a given time") 28 | cancelAtURL = flag.String("cancelat", "", "automatically cancel the fetchbot at a given URL") 29 | stopAfter = flag.Duration("stopafter", 0, "automatically stop the fetchbot after a given time") 30 | stopAtURL = flag.String("stopat", "", "automatically stop the fetchbot at a given URL") 31 | memStats = flag.Duration("memstats", 0, "display memory statistics at a given interval") 32 | ) 33 | 34 | func main() { 35 | flag.Parse() 36 | 37 | // Parse the provided seed 38 | u, err := url.Parse(*seed) 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | 43 | // Create the muxer 44 | mux := fetchbot.NewMux() 45 | 46 | // Handle all errors the same 47 | mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) { 48 | fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err) 49 | })) 50 | 51 | // Handle GET requests for html responses, to parse the body and enqueue all links as HEAD 52 | // requests. 
53 | mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc( 54 | func(ctx *fetchbot.Context, res *http.Response, err error) { 55 | // Process the body to find the links 56 | doc, err := goquery.NewDocumentFromResponse(res) 57 | if err != nil { 58 | fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err) 59 | return 60 | } 61 | // Enqueue all links as HEAD requests 62 | enqueueLinks(ctx, doc) 63 | })) 64 | 65 | // Handle HEAD requests for html responses coming from the source host - we don't want 66 | // to crawl links from other hosts. 67 | mux.Response().Method("HEAD").Host(u.Host).ContentType("text/html").Handler(fetchbot.HandlerFunc( 68 | func(ctx *fetchbot.Context, res *http.Response, err error) { 69 | if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil { 70 | fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err) 71 | } 72 | })) 73 | 74 | // Create the Fetcher, handle the logging first, then dispatch to the Muxer 75 | h := logHandler(mux) 76 | if *stopAtURL != "" || *cancelAtURL != "" { 77 | stopURL := *stopAtURL 78 | if *cancelAtURL != "" { 79 | stopURL = *cancelAtURL 80 | } 81 | h = stopHandler(stopURL, *cancelAtURL != "", logHandler(mux)) 82 | } 83 | f := fetchbot.New(h) 84 | 85 | // First mem stat print must be right after creating the fetchbot 86 | if *memStats > 0 { 87 | // Print starting stats 88 | printMemStats(nil) 89 | // Run at regular intervals 90 | runMemStats(f, *memStats) 91 | // On exit, print ending stats after a GC 92 | defer func() { 93 | runtime.GC() 94 | printMemStats(nil) 95 | }() 96 | } 97 | 98 | // Start processing 99 | q := f.Start() 100 | 101 | // if a stop or cancel is requested after some duration, launch the goroutine 102 | // that will stop or cancel. 
103 | if *stopAfter > 0 || *cancelAfter > 0 { 104 | after := *stopAfter 105 | stopFunc := q.Close 106 | if *cancelAfter != 0 { 107 | after = *cancelAfter 108 | stopFunc = q.Cancel 109 | } 110 | 111 | go func() { 112 | c := time.After(after) 113 | <-c 114 | stopFunc() 115 | }() 116 | } 117 | 118 | // Enqueue the seed, which is the first entry in the dup map 119 | dup[*seed] = true 120 | _, err = q.SendStringGet(*seed) 121 | if err != nil { 122 | fmt.Printf("[ERR] GET %s - %s\n", *seed, err) 123 | } 124 | q.Block() 125 | } 126 | 127 | func runMemStats(f *fetchbot.Fetcher, tick time.Duration) { 128 | var mu sync.Mutex 129 | var di *fetchbot.DebugInfo 130 | 131 | // Start goroutine to collect fetchbot debug info 132 | go func() { 133 | for v := range f.Debug() { 134 | mu.Lock() 135 | di = v 136 | mu.Unlock() 137 | } 138 | }() 139 | // Start ticker goroutine to print mem stats at regular intervals 140 | go func() { 141 | c := time.Tick(tick) 142 | for _ = range c { 143 | mu.Lock() 144 | printMemStats(di) 145 | mu.Unlock() 146 | } 147 | }() 148 | } 149 | 150 | func printMemStats(di *fetchbot.DebugInfo) { 151 | var mem runtime.MemStats 152 | runtime.ReadMemStats(&mem) 153 | buf := bytes.NewBuffer(nil) 154 | buf.WriteString(strings.Repeat("=", 72) + "\n") 155 | buf.WriteString("Memory Profile:\n") 156 | buf.WriteString(fmt.Sprintf("\tAlloc: %d Kb\n", mem.Alloc/1024)) 157 | buf.WriteString(fmt.Sprintf("\tTotalAlloc: %d Kb\n", mem.TotalAlloc/1024)) 158 | buf.WriteString(fmt.Sprintf("\tNumGC: %d\n", mem.NumGC)) 159 | buf.WriteString(fmt.Sprintf("\tGoroutines: %d\n", runtime.NumGoroutine())) 160 | if di != nil { 161 | buf.WriteString(fmt.Sprintf("\tNumHosts: %d\n", di.NumHosts)) 162 | } 163 | buf.WriteString(strings.Repeat("=", 72)) 164 | fmt.Println(buf.String()) 165 | } 166 | 167 | // stopHandler stops the fetcher if the stopurl is reached. Otherwise it dispatches 168 | // the call to the wrapped Handler. 
169 | func stopHandler(stopurl string, cancel bool, wrapped fetchbot.Handler) fetchbot.Handler { 170 | return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) { 171 | if ctx.Cmd.URL().String() == stopurl { 172 | fmt.Printf(">>>>> STOP URL %s\n", ctx.Cmd.URL()) 173 | // generally not a good idea to stop/block from a handler goroutine 174 | // so do it in a separate goroutine 175 | go func() { 176 | if cancel { 177 | ctx.Q.Cancel() 178 | } else { 179 | ctx.Q.Close() 180 | } 181 | }() 182 | return 183 | } 184 | wrapped.Handle(ctx, res, err) 185 | }) 186 | } 187 | 188 | // logHandler prints the fetch information and dispatches the call to the wrapped Handler. 189 | func logHandler(wrapped fetchbot.Handler) fetchbot.Handler { 190 | return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) { 191 | if err == nil { 192 | fmt.Printf("[%d] %s %s - %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL(), res.Header.Get("Content-Type")) 193 | } 194 | wrapped.Handle(ctx, res, err) 195 | }) 196 | } 197 | 198 | func enqueueLinks(ctx *fetchbot.Context, doc *goquery.Document) { 199 | mu.Lock() 200 | doc.Find("a[href]").Each(func(i int, s *goquery.Selection) { 201 | val, _ := s.Attr("href") 202 | // Resolve address 203 | u, err := ctx.Cmd.URL().Parse(val) 204 | if err != nil { 205 | fmt.Printf("error: resolve URL %s - %s\n", val, err) 206 | return 207 | } 208 | if !dup[u.String()] { 209 | if _, err := ctx.Q.SendStringHead(u.String()); err != nil { 210 | fmt.Printf("error: enqueue head %s - %s\n", u, err) 211 | } else { 212 | dup[u.String()] = true 213 | } 214 | } 215 | }) 216 | mu.Unlock() 217 | } 218 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Martin Angers and Contributors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | /* 6 | Package fetchbot provides a simple and flexible web crawler that follows the robots.txt 7 | policies and crawl delays. 8 | 9 | It is very much a rewrite of gocrawl (https://github.com/PuerkitoBio/gocrawl) with a 10 | simpler API, less features built-in, but at the same time more flexibility. As for Go 11 | itself, sometimes less is more! 12 | 13 | Installation 14 | 15 | To install, simply run in a terminal: 16 | 17 | go get github.com/PuerkitoBio/fetchbot 18 | 19 | The package has a single external dependency, robotstxt 20 | (https://github.com/temoto/robotstxt). It also integrates code from the iq package 21 | (https://github.com/kylelemons/iq). 22 | 23 | The API documentation is available on godoc.org 24 | (http://godoc.org/github.com/PuerkitoBio/fetchbot). 25 | 26 | Usage 27 | 28 | The following example (taken from /example/short/main.go) shows how to create and 29 | start a Fetcher, one way to send commands, and how to stop the fetcher once all 30 | commands have been handled. 31 | 32 | package main 33 | 34 | import ( 35 | "fmt" 36 | "net/http" 37 | 38 | "github.com/PuerkitoBio/fetchbot" 39 | ) 40 | 41 | func main() { 42 | f := fetchbot.New(fetchbot.HandlerFunc(handler)) 43 | queue := f.Start() 44 | queue.SendStringHead("http://google.com", "http://golang.org", "http://golang.org/doc") 45 | queue.Close() 46 | } 47 | 48 | func handler(ctx *fetchbot.Context, res *http.Response, err error) { 49 | if err != nil { 50 | fmt.Printf("error: %s\n", err) 51 | return 52 | } 53 | fmt.Printf("[%d] %s %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL()) 54 | } 55 | 56 | A more complex and complete example can be found in the repository, at /example/full/. 57 | 58 | Fetcher 59 | 60 | Basically, a Fetcher is an instance of a web crawler, independent of other Fetchers. 
61 | It receives Commands via the Queue, executes the requests, and calls a Handler to 62 | process the responses. A Command is an interface that tells the Fetcher which URL to 63 | fetch, and which HTTP method to use (i.e. "GET", "HEAD", ...). 64 | 65 | A call to Fetcher.Start() returns the Queue associated with this Fetcher. This is the 66 | thread-safe object that can be used to send commands, or to stop the crawler. 67 | 68 | Both the Command and the Handler are interfaces, and may be implemented in various ways. 69 | They are defined like so: 70 | 71 | type Command interface { 72 | URL() *url.URL 73 | Method() string 74 | } 75 | type Handler interface { 76 | Handle(*Context, *http.Response, error) 77 | } 78 | 79 | A Context is a struct that holds the Command and the Queue, so that the Handler always 80 | knows which Command initiated this call, and has a handle to the Queue. 81 | 82 | A Handler is similar to the net/http Handler, and middleware-style combinations can 83 | be built on top of it. A HandlerFunc type is provided so that simple functions 84 | with the right signature can be used as Handlers (like net/http.HandlerFunc), and there 85 | is also a multiplexer Mux that can be used to dispatch calls to different Handlers 86 | based on some criteria. 87 | 88 | Command-related Interfaces 89 | 90 | The Fetcher recognizes a number of interfaces that the Command may implement, for 91 | more advanced needs. 92 | 93 | * BasicAuthProvider: Implement this interface to specify the basic authentication 94 | credentials to set on the request. 95 | 96 | * CookiesProvider: If the Command implements this interface, the provided Cookies 97 | will be set on the request. 98 | 99 | * HeaderProvider: Implement this interface to specify the headers to set on the 100 | request. 101 | 102 | * ReaderProvider: Implement this interface to set the body of the request, via 103 | an io.Reader. 
104 | 105 | * ValuesProvider: Implement this interface to set the body of the request, as 106 | form-encoded values. If the Content-Type is not specifically set via a HeaderProvider, 107 | it is set to "application/x-www-form-urlencoded". ReaderProvider and ValuesProvider 108 | should be mutually exclusive as they both set the body of the request. If both are 109 | implemented, the ReaderProvider interface is used. 110 | 111 | * Handler: Implement this interface if the Command's response should be handled 112 | by a specific callback function. By default, the response is handled by the Fetcher's 113 | Handler, but if the Command implements this, this handler function takes precedence 114 | and the Fetcher's Handler is ignored. 115 | 116 | Since the Command is an interface, it can be a custom struct that holds additional 117 | information, such as an ID for the URL (e.g. from a database), or a depth counter 118 | so that the crawling stops at a certain depth, etc. For basic commands that don't 119 | require additional information, the package provides the Cmd struct that implements 120 | the Command interface. This is the Command implementation used when using the 121 | various Queue.SendString\* methods. 122 | 123 | There is also a convenience HandlerCmd struct for the commands that should be handled 124 | by a specific callback function. It is a Command with a Handler interface implementation. 125 | 126 | Fetcher Options 127 | 128 | The Fetcher has a number of fields that provide further customization: 129 | 130 | * HttpClient : By default, the Fetcher uses the net/http default Client to make requests. A 131 | different client can be set on the Fetcher.HttpClient field. 132 | 133 | * CrawlDelay : That value is used only if there is no delay specified 134 | by the robots.txt of a given host. 135 | 136 | * UserAgent : Sets the user agent string to use for the requests and to validate 137 | against the robots.txt entries. 
138 | 139 | * WorkerIdleTTL : Sets the duration that a worker goroutine can wait without receiving 140 | new commands to fetch. If the idle time-to-live is reached, the worker goroutine 141 | is stopped and its resources are released. This can be especially useful for 142 | long-running crawlers. 143 | 144 | * AutoClose : If true, closes the queue automatically once the number of active hosts 145 | reach 0. 146 | 147 | * DisablePoliteness : If true, ignores the robots.txt policies of the hosts. 148 | 149 | What fetchbot doesn't do - especially compared to gocrawl - is that it doesn't 150 | keep track of already visited URLs, and it doesn't normalize the URLs. This is outside 151 | the scope of this package - all commands sent on the Queue will be fetched. 152 | Normalization can easily be done (e.g. using 153 | https://github.com/PuerkitoBio/purell) before sending the Command to the Fetcher. 154 | How to keep track of visited URLs depends on the use-case of the specific crawler, 155 | but for an example, see /example/full/main.go. 156 | 157 | License 158 | 159 | The BSD 3-Clause license (http://opensource.org/licenses/BSD-3-Clause), the same as 160 | the Go language. The iq_slice.go file is under the CDDL-1.0 license (details in 161 | the source file). 162 | */ 163 | package fetchbot 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fetchbot [![build status](https://secure.travis-ci.org/PuerkitoBio/fetchbot.svg)](http://travis-ci.org/PuerkitoBio/fetchbot) [![Go Reference](https://pkg.go.dev/badge/github.com/PuerkitoBio/fetchbot.svg)](https://pkg.go.dev/github.com/PuerkitoBio/fetchbot) 2 | 3 | Package fetchbot provides a simple and flexible web crawler that follows the robots.txt 4 | policies and crawl delays. 
5 | 6 | It is very much a rewrite of [gocrawl](https://github.com/PuerkitoBio/gocrawl) with a 7 | simpler API, less features built-in, but at the same time more flexibility. As for Go 8 | itself, sometimes less is more! 9 | 10 | ## Installation 11 | 12 | To install, simply run in a terminal: 13 | 14 | go get github.com/PuerkitoBio/fetchbot 15 | 16 | The package has a single external dependency, [robotstxt](https://github.com/temoto/robotstxt). It also integrates code from the [iq package](https://github.com/kylelemons/iq). 17 | 18 | The [API documentation is available on godoc.org](http://godoc.org/github.com/PuerkitoBio/fetchbot). 19 | 20 | ## Changes 21 | 22 | * 2019-09-11 (v1.2.0): update robotstxt dependency (import path/repo URL has changed, issue #31, thanks to [@michael-stevens][michael-stevens] for raising the issue). 23 | * 2017-09-04 (v1.1.1): fix a goroutine leak when cancelling a Queue (issue #26, thanks to [@ryu-koui][ryu] for raising the issue). 24 | * 2017-07-06 (v1.1.0): add `Queue.Done` to get the done channel on the queue, allowing to wait in a `select` statement (thanks to [@DennisDenuto][denuto]). 25 | * 2015-07-25 (v1.0.0) : add `Cancel` method on the `Queue`, to close and drain without requesting any pending commands, unlike `Close` that waits for all pending commands to be processed (thanks to [@buro9][buro9] for the feature request). 26 | * 2015-07-24 : add `HandlerCmd` and call the Command's `Handler` function if it implements the `Handler` interface, bypassing the `Fetcher`'s handler. Support a `Custom` matcher on the `Mux`, using a predicate. (thanks to [@mmcdole][mmcdole] for the feature requests). 27 | * 2015-06-18 : add `Scheme` criteria on the muxer (thanks to [@buro9][buro9]). 28 | * 2015-06-10 : add `DisablePoliteness` field on the `Fetcher` to optionally bypass robots.txt checks (thanks to [@oli-g][oli]). 29 | * 2014-07-04 : change the type of Fetcher.HttpClient from `*http.Client` to the `Doer` interface. 
Low chance of breaking existing code, but it's a possibility if someone used the fetcher's client to run other requests (e.g. `f.HttpClient.Get(...)`). 30 | 31 | ## Usage 32 | 33 | The following example (taken from /example/short/main.go) shows how to create and 34 | start a Fetcher, one way to send commands, and how to stop the fetcher once all 35 | commands have been handled. 36 | 37 | ```go 38 | package main 39 | 40 | import ( 41 | "fmt" 42 | "net/http" 43 | 44 | "github.com/PuerkitoBio/fetchbot" 45 | ) 46 | 47 | func main() { 48 | f := fetchbot.New(fetchbot.HandlerFunc(handler)) 49 | queue := f.Start() 50 | queue.SendStringHead("http://google.com", "http://golang.org", "http://golang.org/doc") 51 | queue.Close() 52 | } 53 | 54 | func handler(ctx *fetchbot.Context, res *http.Response, err error) { 55 | if err != nil { 56 | fmt.Printf("error: %s\n", err) 57 | return 58 | } 59 | fmt.Printf("[%d] %s %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL()) 60 | } 61 | ``` 62 | 63 | A more complex and complete example can be found in the repository, at /example/full/. 64 | 65 | ### Fetcher 66 | 67 | Basically, a **Fetcher** is an instance of a web crawler, independent of other Fetchers. 68 | It receives Commands via the **Queue**, executes the requests, and calls a **Handler** to 69 | process the responses. A **Command** is an interface that tells the Fetcher which URL to 70 | fetch, and which HTTP method to use (i.e. "GET", "HEAD", ...). 71 | 72 | A call to Fetcher.Start() returns the Queue associated with this Fetcher. This is the 73 | thread-safe object that can be used to send commands, or to stop the crawler. 74 | 75 | Both the Command and the Handler are interfaces, and may be implemented in various ways. 
76 | They are defined like so: 77 | 78 | ```go 79 | type Command interface { 80 | URL() *url.URL 81 | Method() string 82 | } 83 | type Handler interface { 84 | Handle(*Context, *http.Response, error) 85 | } 86 | ``` 87 | 88 | A **Context** is a struct that holds the Command and the Queue, so that the Handler always 89 | knows which Command initiated this call, and has a handle to the Queue. 90 | 91 | A Handler is similar to the net/http Handler, and middleware-style combinations can 92 | be built on top of it. A HandlerFunc type is provided so that simple functions 93 | with the right signature can be used as Handlers (like net/http.HandlerFunc), and there 94 | is also a multiplexer Mux that can be used to dispatch calls to different Handlers 95 | based on some criteria. 96 | 97 | ### Command-related Interfaces 98 | 99 | The Fetcher recognizes a number of interfaces that the Command may implement, for 100 | more advanced needs. 101 | 102 | * `BasicAuthProvider`: Implement this interface to specify the basic authentication 103 | credentials to set on the request. 104 | 105 | * `CookiesProvider`: If the Command implements this interface, the provided Cookies 106 | will be set on the request. 107 | 108 | * `HeaderProvider`: Implement this interface to specify the headers to set on the 109 | request. 110 | 111 | * `ReaderProvider`: Implement this interface to set the body of the request, via 112 | an `io.Reader`. 113 | 114 | * `ValuesProvider`: Implement this interface to set the body of the request, as 115 | form-encoded values. If the Content-Type is not specifically set via a `HeaderProvider`, 116 | it is set to "application/x-www-form-urlencoded". `ReaderProvider` and `ValuesProvider` 117 | should be mutually exclusive as they both set the body of the request. If both are 118 | implemented, the `ReaderProvider` interface is used. 119 | 120 | * `Handler`: Implement this interface if the Command's response should be handled 121 | by a specific callback function. 
By default, the response is handled by the Fetcher's 122 | Handler, but if the Command implements this, this handler function takes precedence 123 | and the Fetcher's Handler is ignored. 124 | 125 | Since the Command is an interface, it can be a custom struct that holds additional 126 | information, such as an ID for the URL (e.g. from a database), or a depth counter 127 | so that the crawling stops at a certain depth, etc. For basic commands that don't 128 | require additional information, the package provides the Cmd struct that implements 129 | the Command interface. This is the Command implementation used when using the 130 | various Queue.SendString\* methods. 131 | 132 | There is also a convenience `HandlerCmd` struct for the commands that should be handled 133 | by a specific callback function. It is a Command with a Handler interface implementation. 134 | 135 | ### Fetcher Options 136 | 137 | The Fetcher has a number of fields that provide further customization: 138 | 139 | * HttpClient : By default, the Fetcher uses the net/http default Client to make requests. A 140 | different client can be set on the Fetcher.HttpClient field. 141 | 142 | * CrawlDelay : That value is used only if there is no delay specified 143 | by the robots.txt of a given host. 144 | 145 | * UserAgent : Sets the user agent string to use for the requests and to validate 146 | against the robots.txt entries. 147 | 148 | * WorkerIdleTTL : Sets the duration that a worker goroutine can wait without receiving 149 | new commands to fetch. If the idle time-to-live is reached, the worker goroutine 150 | is stopped and its resources are released. This can be especially useful for 151 | long-running crawlers. 152 | 153 | * AutoClose : If true, closes the queue automatically once the number of active hosts 154 | reach 0. 155 | 156 | * DisablePoliteness : If true, ignores the robots.txt policies of the hosts. 
157 | 158 | What fetchbot doesn't do - especially compared to gocrawl - is that it doesn't 159 | keep track of already visited URLs, and it doesn't normalize the URLs. This is outside 160 | the scope of this package - all commands sent on the Queue will be fetched. 161 | Normalization can easily be done (e.g. using [purell](https://github.com/PuerkitoBio/purell)) before sending the Command to the Fetcher. 162 | How to keep track of visited URLs depends on the use-case of the specific crawler, 163 | but for an example, see /example/full/main.go. 164 | 165 | ## License 166 | 167 | The [BSD 3-Clause license](http://opensource.org/licenses/BSD-3-Clause), the same as 168 | the Go language. The iq package source code is under the CDDL-1.0 license (details in 169 | the source file). 170 | 171 | [oli]: https://github.com/oli-g 172 | [buro9]: https://github.com/buro9 173 | [mmcdole]: https://github.com/mmcdole 174 | [denuto]: https://github.com/DennisDenuto 175 | [ryu]: https://github.com/ryu-koui 176 | [michael-stevens]: https://github.com/michael-stevens 177 | 178 | -------------------------------------------------------------------------------- /handler.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Martin Angers and Contributors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package fetchbot 6 | 7 | import ( 8 | "net/http" 9 | "strings" 10 | "sync" 11 | ) 12 | 13 | // Context is a Command's fetch context, passed to the Handler. It gives access to the 14 | // original Command and the associated Queue. 15 | type Context struct { 16 | Cmd Command 17 | Q *Queue 18 | } 19 | 20 | // The Handler interface is used to process the Fetcher's requests. It is similar to the 21 | // net/http.Handler interface. 
type Handler interface {
	Handle(*Context, *http.Response, error)
}

// A HandlerFunc is a function signature that implements the Handler interface. A function
// with this signature can thus be used as a Handler.
type HandlerFunc func(*Context, *http.Response, error)

// Handle is the Handler interface implementation for the HandlerFunc type.
// It forwards the context, response and error unchanged to the function itself.
func (h HandlerFunc) Handle(ctx *Context, res *http.Response, err error) {
	h(ctx, res, err)
}

// Mux is a simple multiplexer for the Handler interface, similar to net/http.ServeMux.
// It is itself a Handler, and dispatches the calls to the matching Handlers.
//
// For error Handlers, if there is a Handler registered for the same error value,
// it will be called. Otherwise, if there is a Handler registered for any error,
// this Handler will be called.
//
// For Response Handlers, a match with a path criteria has higher priority than other
// matches, and the longer path match will get called.
//
// If multiple Response handlers with the same path length (or no path criteria)
// match a response, the actual handler called is undefined, but one and only one
// will be called.
//
// In any case, if no Handler matches, the DefaultHandler is called, and it
// defaults to a no-op.
type Mux struct {
	// DefaultHandler is called when no registered Handler matches. NewMux
	// initializes it to a no-op, but callers may replace it.
	DefaultHandler Handler

	// mu protects concurrent access to the errm and res maps below.
	mu sync.RWMutex
	// errm maps an error value to its Handler; the nil key is the catch-all
	// entry used for errors without a specific Handler.
	errm map[error]Handler
	res map[*ResponseMatcher]bool // a set of entries
}

// NewMux returns an initialized Mux.
func NewMux() *Mux {
	return &Mux{
		// Default handler is a no-op
		DefaultHandler: HandlerFunc(func(ctx *Context, res *http.Response, err error) {}),
		errm:           make(map[error]Handler),
		res:            make(map[*ResponseMatcher]bool),
	}
}

// Handle is the Handler interface implementation for Mux. It dispatches the calls
// to the matching Handler.
71 | func (mux *Mux) Handle(ctx *Context, res *http.Response, err error) { 72 | mux.mu.RLock() 73 | defer mux.mu.RUnlock() 74 | if err != nil { 75 | // Find a matching error handler 76 | if h, ok := mux.errm[err]; ok { 77 | h.Handle(ctx, res, err) 78 | return 79 | } 80 | if h, ok := mux.errm[nil]; ok { 81 | h.Handle(ctx, res, err) 82 | return 83 | } 84 | } else { 85 | // Find a matching response handler 86 | var h Handler 87 | var n = -1 88 | for r := range mux.res { 89 | if ok, cnt := r.match(res); ok { 90 | if cnt > n { 91 | h, n = r.h, cnt 92 | } 93 | } 94 | } 95 | if h != nil { 96 | h.Handle(ctx, res, err) 97 | return 98 | } 99 | } 100 | mux.DefaultHandler.Handle(ctx, res, err) 101 | } 102 | 103 | // HandleError registers a Handler for a specific error value. Multiple calls 104 | // with the same error value override previous calls. As a special case, a nil 105 | // error value registers a Handler for any error that doesn't have a specific 106 | // Handler. 107 | func (mux *Mux) HandleError(err error, h Handler) { 108 | mux.mu.Lock() 109 | defer mux.mu.Unlock() 110 | mux.errm[err] = h 111 | } 112 | 113 | // HandleErrors registers a Handler for any error that doesn't have a specific 114 | // Handler. 115 | func (mux *Mux) HandleErrors(h Handler) { 116 | mux.HandleError(nil, h) 117 | } 118 | 119 | // Response initializes an entry for a Response Handler based on various criteria. 120 | // The Response Handler is not registered until Handle is called. 121 | func (mux *Mux) Response() *ResponseMatcher { 122 | return &ResponseMatcher{mux: mux} 123 | } 124 | 125 | // A ResponseMatcher holds the criteria for a response Handler. 
126 | type ResponseMatcher struct { 127 | method string 128 | contentType string 129 | minStatus int 130 | maxStatus int 131 | scheme string 132 | host string 133 | path string 134 | predicate func(*http.Response) bool 135 | h Handler 136 | mux *Mux 137 | } 138 | 139 | // match indicates if the response Handler matches the provided response, and if so, 140 | // and if a path criteria is specified, it also indicates the length of the path match. 141 | func (r *ResponseMatcher) match(res *http.Response) (bool, int) { 142 | if r.method != "" { 143 | if r.method != res.Request.Method { 144 | return false, 0 145 | } 146 | } 147 | if r.contentType != "" { 148 | if r.contentType != getContentType(res.Header.Get("Content-Type")) { 149 | return false, 0 150 | } 151 | } 152 | if r.minStatus != 0 || r.maxStatus != 0 { 153 | if res.StatusCode < r.minStatus || res.StatusCode > r.maxStatus { 154 | return false, 0 155 | } 156 | } 157 | if r.scheme != "" { 158 | if res.Request.URL.Scheme != r.scheme { 159 | return false, 0 160 | } 161 | } 162 | if r.host != "" { 163 | if res.Request.URL.Host != r.host { 164 | return false, 0 165 | } 166 | } 167 | if r.predicate != nil { 168 | if !r.predicate(res) { 169 | return false, 0 170 | } 171 | } 172 | if r.path != "" { 173 | if strings.HasPrefix(res.Request.URL.Path, r.path) { 174 | return true, len(r.path) 175 | } 176 | return false, 0 177 | } 178 | return true, 0 179 | } 180 | 181 | // Returns the content type stripped of any additional parameters (following the ;). 182 | func getContentType(val string) string { 183 | args := strings.SplitN(val, ";", 2) 184 | if len(args) > 0 { 185 | return strings.TrimSpace(args[0]) 186 | } 187 | return val 188 | } 189 | 190 | // Method sets a method criteria for the Response Handler. Its Handler will only be called 191 | // if it has this HTTP method (i.e. "GET", "HEAD", ...). 
192 | func (r *ResponseMatcher) Method(m string) *ResponseMatcher { 193 | r.mux.mu.Lock() 194 | defer r.mux.mu.Unlock() 195 | r.method = m 196 | return r 197 | } 198 | 199 | // ContentType sets a criteria based on the Content-Type header for the Response Handler. 200 | // Its Handler will only be called if it has this content type, ignoring any additional 201 | // parameter on the Header value (following the semicolon, i.e. "text/html; charset=utf-8"). 202 | func (r *ResponseMatcher) ContentType(ct string) *ResponseMatcher { 203 | r.mux.mu.Lock() 204 | defer r.mux.mu.Unlock() 205 | r.contentType = ct 206 | return r 207 | } 208 | 209 | // Status sets a criteria based on the Status code of the response for the Response Handler. 210 | // Its Handler will only be called if the response has this status code. 211 | func (r *ResponseMatcher) Status(code int) *ResponseMatcher { 212 | r.mux.mu.Lock() 213 | defer r.mux.mu.Unlock() 214 | r.minStatus = code 215 | r.maxStatus = code 216 | return r 217 | } 218 | 219 | // StatusRange sets a criteria based on the Status code of the response for the Response Handler. 220 | // Its Handler will only be called if the response has a status code between the min and max. 221 | // If min is greater than max, the values are switched. 222 | func (r *ResponseMatcher) StatusRange(min, max int) *ResponseMatcher { 223 | if min > max { 224 | min, max = max, min 225 | } 226 | r.mux.mu.Lock() 227 | defer r.mux.mu.Unlock() 228 | r.minStatus = min 229 | r.maxStatus = max 230 | return r 231 | } 232 | 233 | // Scheme sets a criteria based on the scheme of the URL for the Response Handler. Its Handler 234 | // will only be called if the scheme of the URL matches exactly the specified scheme. 235 | func (r *ResponseMatcher) Scheme(scheme string) *ResponseMatcher { 236 | r.mux.mu.Lock() 237 | defer r.mux.mu.Unlock() 238 | r.scheme = scheme 239 | return r 240 | } 241 | 242 | // Host sets a criteria based on the host of the URL for the Response Handler. 
Its Handler
// will only be called if the host of the URL matches exactly the specified host.
func (r *ResponseMatcher) Host(host string) *ResponseMatcher {
	r.mux.mu.Lock()
	defer r.mux.mu.Unlock()
	r.host = host
	return r
}

// Path sets a criteria based on the path of the URL for the Response Handler. Its Handler
// will only be called if the path of the URL starts with this path. Longer matches
// have priority over shorter ones.
func (r *ResponseMatcher) Path(p string) *ResponseMatcher {
	r.mux.mu.Lock()
	defer r.mux.mu.Unlock()
	r.path = p
	return r
}

// Custom sets a criteria based on a function that receives the HTTP response
// and returns true if the matcher should be used to handle this response,
// false otherwise.
func (r *ResponseMatcher) Custom(predicate func(*http.Response) bool) *ResponseMatcher {
	r.mux.mu.Lock()
	defer r.mux.mu.Unlock()
	r.predicate = predicate
	return r
}

// Handler sets the Handler to be called when this Response Handler is the match for
// a given response. It registers the Response Handler in its parent Mux.
func (r *ResponseMatcher) Handler(h Handler) *ResponseMatcher {
	r.mux.mu.Lock()
	defer r.mux.mu.Unlock()
	r.h = h
	// The res map acts as a set, so assigning unconditionally is equivalent
	// to the previous "check then insert" and avoids the redundant read.
	r.mux.res[r] = true
	return r
}
--------------------------------------------------------------------------------
/cmd_test.go:
--------------------------------------------------------------------------------
// Copyright 2014 Martin Angers and Contributors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4 | 5 | package fetchbot 6 | 7 | import ( 8 | "encoding/base64" 9 | "fmt" 10 | "io" 11 | "io/ioutil" 12 | "net/http" 13 | "net/http/httptest" 14 | "net/url" 15 | "sort" 16 | "strings" 17 | "sync/atomic" 18 | "testing" 19 | ) 20 | 21 | type basicAuthCmd struct { 22 | *Cmd 23 | user, pwd string 24 | } 25 | 26 | func (ba *basicAuthCmd) BasicAuth() (string, string) { 27 | return ba.user, ba.pwd 28 | } 29 | 30 | func TestBasicAuth(t *testing.T) { 31 | creds := base64.StdEncoding.EncodeToString([]byte("me:you")) 32 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { 33 | auth := req.Header.Get("Authorization") 34 | if auth != "Basic "+creds { 35 | w.Header().Set("WWW-Authenticate", "Basic realm=\"Authorization Required\"") 36 | http.Error(w, "Unauthorized", http.StatusUnauthorized) 37 | return 38 | } 39 | w.Write([]byte("ok")) 40 | })) 41 | defer srv.Close() 42 | cases := []struct { 43 | cmd Command 44 | status int 45 | }{ 46 | 0: { 47 | &basicAuthCmd{&Cmd{U: mustParse(t, srv.URL+"/a"), M: "GET"}, "me", "you"}, 48 | http.StatusOK, 49 | }, 50 | 1: { 51 | &Cmd{U: mustParse(t, srv.URL+"/b"), M: "GET"}, 52 | http.StatusUnauthorized, 53 | }, 54 | 2: { 55 | &basicAuthCmd{&Cmd{U: mustParse(t, srv.URL+"/c"), M: "GET"}, "some", "other"}, 56 | http.StatusUnauthorized, 57 | }, 58 | 3: { 59 | &readerCmd{&Cmd{U: mustParse(t, srv.URL+"/d"), M: "POST"}, 60 | strings.NewReader("a")}, 61 | http.StatusUnauthorized, 62 | }, 63 | 4: { 64 | &valuesCmd{&Cmd{U: mustParse(t, srv.URL+"/e"), M: "POST"}, 65 | url.Values{"k": {"v"}}}, 66 | http.StatusUnauthorized, 67 | }, 68 | } 69 | sh := &spyHandler{} 70 | f := New(sh) 71 | f.CrawlDelay = 0 72 | q := f.Start() 73 | for i, c := range cases { 74 | if err := q.Send(c.cmd); err != nil { 75 | t.Errorf("%d: error sending command: %s", i, err) 76 | } 77 | } 78 | q.Close() 79 | var urls []string 80 | for i, c := range cases { 81 | urls = append(urls, c.cmd.URL().String()) 82 | if st := 
sh.StatusFor(c.cmd.URL().String()); st != c.status { 83 | t.Errorf("%d: expected status %d, got %d", i, c.status, st) 84 | } 85 | } 86 | if !sh.CalledWithExactly(urls...) { 87 | t.Error("expected handler to be called for all cases") 88 | } 89 | if cnt := sh.Errors(); cnt > 0 { 90 | t.Errorf("expected no error, got %d", cnt) 91 | } 92 | } 93 | 94 | type readerCmd struct { 95 | *Cmd 96 | r io.Reader 97 | } 98 | 99 | func (rc *readerCmd) Reader() io.Reader { 100 | return rc.r 101 | } 102 | 103 | type valuesCmd struct { 104 | *Cmd 105 | vals url.Values 106 | } 107 | 108 | func (vc *valuesCmd) Values() url.Values { 109 | return vc.vals 110 | } 111 | 112 | type cookiesCmd struct { 113 | *Cmd 114 | cooks []*http.Cookie 115 | } 116 | 117 | func (cc *cookiesCmd) Cookies() []*http.Cookie { 118 | return cc.cooks 119 | } 120 | 121 | func TestBody(t *testing.T) { 122 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { 123 | cooks := req.Cookies() 124 | if len(cooks) == 0 { 125 | b, err := ioutil.ReadAll(req.Body) 126 | if err != nil { 127 | w.WriteHeader(http.StatusInternalServerError) 128 | w.Write([]byte(err.Error())) 129 | return 130 | } 131 | w.Write(b) 132 | } else { 133 | for i, c := range cooks { 134 | if i > 0 { 135 | w.Write([]byte{'&'}) 136 | } 137 | w.Write([]byte(c.Name)) 138 | } 139 | } 140 | })) 141 | defer srv.Close() 142 | cases := []struct { 143 | cmd Command 144 | body string 145 | }{ 146 | 0: { 147 | &readerCmd{&Cmd{U: mustParse(t, srv.URL+"/a"), M: "POST"}, 148 | strings.NewReader("a")}, 149 | "a", 150 | }, 151 | 1: { 152 | &valuesCmd{&Cmd{U: mustParse(t, srv.URL+"/b"), M: "POST"}, 153 | url.Values{"k": {"v"}}}, 154 | "k=v", 155 | }, 156 | 2: { 157 | &Cmd{U: mustParse(t, srv.URL+"/c"), M: "POST"}, 158 | "", 159 | }, 160 | 3: { 161 | &basicAuthCmd{&Cmd{U: mustParse(t, srv.URL+"/d"), M: "POST"}, "me", "you"}, 162 | "", 163 | }, 164 | 4: { 165 | &cookiesCmd{&Cmd{U: mustParse(t, srv.URL+"/e"), M: "GET"}, 166 | 
[]*http.Cookie{&http.Cookie{Name: "e"}}}, 167 | "e", 168 | }, 169 | 5: { 170 | &cookiesCmd{&Cmd{U: mustParse(t, srv.URL+"/f"), M: "GET"}, 171 | []*http.Cookie{&http.Cookie{Name: "f1"}, &http.Cookie{Name: "f2"}}}, 172 | "f1&f2", 173 | }, 174 | } 175 | sh := &spyHandler{} 176 | f := New(sh) 177 | f.CrawlDelay = 0 178 | q := f.Start() 179 | for i, c := range cases { 180 | if err := q.Send(c.cmd); err != nil { 181 | t.Errorf("%d: error sending command: %s", i, err) 182 | } 183 | } 184 | q.Close() 185 | var urls []string 186 | for i, c := range cases { 187 | urls = append(urls, c.cmd.URL().String()) 188 | if b := sh.BodyFor(c.cmd.URL().String()); b != c.body { 189 | t.Errorf("%d: expected body '%s', got '%s'", i, c.body, b) 190 | } 191 | } 192 | if !sh.CalledWithExactly(urls...) { 193 | t.Error("expected handler to be called for all cases") 194 | } 195 | if cnt := sh.Errors(); cnt > 0 { 196 | t.Errorf("expected no error, got %d", cnt) 197 | } 198 | } 199 | 200 | type headerCmd struct { 201 | *Cmd 202 | hdr http.Header 203 | } 204 | 205 | func (hc *headerCmd) Header() http.Header { 206 | return hc.hdr 207 | } 208 | 209 | func TestHeader(t *testing.T) { 210 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { 211 | // Write headers in lexical order so that result is predictable 212 | keys := make([]string, 0, len(req.Header)) 213 | for k := range req.Header { 214 | if len(k) == 1 { 215 | keys = append(keys, k) 216 | } 217 | } 218 | sort.Strings(keys) 219 | for _, k := range keys { 220 | w.Write([]byte(fmt.Sprintf("%s:%s\n", k, req.Header[k][0]))) 221 | } 222 | })) 223 | defer srv.Close() 224 | cases := []struct { 225 | cmd Command 226 | body string 227 | }{ 228 | 0: { 229 | &headerCmd{&Cmd{U: mustParse(t, srv.URL+"/a"), M: "GET"}, 230 | http.Header{"A": {"a"}}}, 231 | "A:a\n", 232 | }, 233 | 1: { 234 | &Cmd{U: mustParse(t, srv.URL+"/b"), M: "GET"}, 235 | "", 236 | }, 237 | 2: { 238 | &headerCmd{&Cmd{U: mustParse(t, srv.URL+"/c"), 
M: "GET"}, 239 | http.Header{"C": {"c"}, "D": {"d"}}}, 240 | "C:c\nD:d\n", 241 | }, 242 | } 243 | sh := &spyHandler{} 244 | f := New(sh) 245 | f.CrawlDelay = 0 246 | q := f.Start() 247 | for i, c := range cases { 248 | if err := q.Send(c.cmd); err != nil { 249 | t.Errorf("%d: error sending command: %s", i, err) 250 | } 251 | } 252 | q.Close() 253 | var urls []string 254 | for i, c := range cases { 255 | urls = append(urls, c.cmd.URL().String()) 256 | if b := sh.BodyFor(c.cmd.URL().String()); b != c.body { 257 | t.Errorf("%d: expected body '%s', got '%s'", i, c.body, b) 258 | } 259 | } 260 | if !sh.CalledWithExactly(urls...) { 261 | t.Error("expected handler to be called for all cases") 262 | } 263 | if cnt := sh.Errors(); cnt > 0 { 264 | t.Errorf("expected no error, got %d", cnt) 265 | } 266 | } 267 | 268 | type fullCmd struct { 269 | *Cmd 270 | user, pwd string 271 | r io.Reader 272 | vals url.Values 273 | cooks []*http.Cookie 274 | hdr http.Header 275 | } 276 | 277 | func (f *fullCmd) BasicAuth() (string, string) { 278 | return f.user, f.pwd 279 | } 280 | 281 | func (f *fullCmd) Reader() io.Reader { 282 | return f.r 283 | } 284 | 285 | func (f *fullCmd) Values() url.Values { 286 | return f.vals 287 | } 288 | 289 | func (f *fullCmd) Cookies() []*http.Cookie { 290 | return f.cooks 291 | } 292 | 293 | func (f *fullCmd) Header() http.Header { 294 | return f.hdr 295 | } 296 | 297 | func TestFullCmd(t *testing.T) { 298 | creds := base64.StdEncoding.EncodeToString([]byte("me:you")) 299 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { 300 | // Basic auth 301 | auth := req.Header.Get("Authorization") 302 | if auth != "Basic "+creds { 303 | w.Header().Set("WWW-Authenticate", "Basic realm=\"Authorization Required\"") 304 | http.Error(w, "Unauthorized", http.StatusUnauthorized) 305 | return 306 | } 307 | // Cookies 308 | for i, c := range req.Cookies() { 309 | if i > 0 { 310 | w.Write([]byte{'&'}) 311 | } 312 | 
w.Write([]byte(c.Name)) 313 | } 314 | // Header 315 | for k, v := range req.Header { 316 | if len(k) == 1 { 317 | w.Write([]byte(fmt.Sprintf("%s:%s\n", k, v[0]))) 318 | } 319 | } 320 | // Body 321 | b, err := ioutil.ReadAll(req.Body) 322 | if err != nil { 323 | t.Fatal(err) 324 | } 325 | w.Write(b) 326 | })) 327 | defer srv.Close() 328 | 329 | sh := &spyHandler{} 330 | f := New(sh) 331 | f.CrawlDelay = 0 332 | q := f.Start() 333 | cmd := &fullCmd{ 334 | &Cmd{U: mustParse(t, srv.URL+"/a"), M: "POST"}, 335 | "me", "you", 336 | strings.NewReader("body"), 337 | url.Values{"ignored": {"val"}}, 338 | []*http.Cookie{&http.Cookie{Name: "a"}}, 339 | http.Header{"A": {"a"}}, 340 | } 341 | if err := q.Send(cmd); err != nil { 342 | t.Fatal(err) 343 | } 344 | q.Close() 345 | // Assert 200 status 346 | if st := sh.StatusFor(cmd.URL().String()); st != 200 { 347 | t.Errorf("expected status %d, got %d", 200, st) 348 | } 349 | // Assert body (Cookies + Header) 350 | exp := "aA:a\nbody" 351 | if b := sh.BodyFor(cmd.URL().String()); b != exp { 352 | t.Errorf("expected body '%s', got '%s'", exp, b) 353 | } 354 | } 355 | 356 | func TestHandlerCmd(t *testing.T) { 357 | var result int32 358 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {})) 359 | defer srv.Close() 360 | 361 | cases := []struct { 362 | cmd Command 363 | want int32 364 | }{ 365 | 0: { 366 | mustCmd(NewHandlerCmd("GET", srv.URL+"/a", func(ctx *Context, res *http.Response, err error) { 367 | atomic.AddInt32(&result, 1) 368 | })), 1, 369 | }, 370 | 1: { 371 | &Cmd{U: mustParse(t, srv.URL+"/b"), M: "GET"}, -1, 372 | }, 373 | } 374 | 375 | f := New(HandlerFunc(func(ctx *Context, res *http.Response, err error) { 376 | atomic.AddInt32(&result, -1) 377 | })) 378 | f.CrawlDelay = 0 379 | 380 | for i, c := range cases { 381 | result = 0 382 | q := f.Start() 383 | if err := q.Send(c.cmd); err != nil { 384 | t.Errorf("%d: error sending command: %s", i, err) 385 | } 386 | q.Close() 387 | 388 
| if result != c.want { 389 | t.Errorf("%d: want %d, got %d", i, c.want, result) 390 | } 391 | } 392 | } 393 | 394 | func mustCmd(cmd Command, err error) Command { 395 | if err != nil { 396 | panic(err) 397 | } 398 | return cmd 399 | } 400 | 401 | func mustParse(t *testing.T, raw string) *url.URL { 402 | parsed, err := url.Parse(raw) 403 | if err != nil { 404 | t.Fatal(err) 405 | } 406 | return parsed 407 | } 408 | -------------------------------------------------------------------------------- /fetch.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Martin Angers and Contributors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package fetchbot 6 | 7 | import ( 8 | "errors" 9 | "fmt" 10 | "io" 11 | "io/ioutil" 12 | "net/http" 13 | "net/url" 14 | "os" 15 | "strings" 16 | "sync" 17 | "time" 18 | 19 | "github.com/temoto/robotstxt" 20 | ) 21 | 22 | var ( 23 | // ErrEmptyHost is returned if a command to be enqueued has an URL with an empty host. 24 | ErrEmptyHost = errors.New("fetchbot: invalid empty host") 25 | 26 | // ErrDisallowed is returned when the requested URL is disallowed by the robots.txt 27 | // policy. 28 | ErrDisallowed = errors.New("fetchbot: disallowed by robots.txt") 29 | 30 | // ErrQueueClosed is returned when a Send call is made on a closed Queue. 31 | ErrQueueClosed = errors.New("fetchbot: send on a closed queue") 32 | ) 33 | 34 | // Parse the robots.txt relative path a single time at startup, this can't 35 | // return an error. 36 | var robotsTxtParsedPath, _ = url.Parse("/robots.txt") 37 | 38 | const ( 39 | // DefaultCrawlDelay represents the delay to use if there is no robots.txt 40 | // specified delay. 41 | DefaultCrawlDelay = 5 * time.Second 42 | 43 | // DefaultUserAgent is the default user agent string. 
44 | DefaultUserAgent = "Fetchbot (https://github.com/PuerkitoBio/fetchbot)" 45 | 46 | // DefaultWorkerIdleTTL is the default time-to-live of an idle host worker goroutine. 47 | // If no URL is sent for a given host within this duration, this host's goroutine 48 | // is disposed of. 49 | DefaultWorkerIdleTTL = 30 * time.Second 50 | ) 51 | 52 | // Doer defines the method required to use a type as HttpClient. 53 | // The net/*http.Client type satisfies this interface. 54 | type Doer interface { 55 | Do(*http.Request) (*http.Response, error) 56 | } 57 | 58 | // A Fetcher defines the parameters for running a web crawler. 59 | type Fetcher struct { 60 | // The Handler to be called for each request. All successfully enqueued requests 61 | // produce a Handler call. 62 | Handler Handler 63 | 64 | // DisablePoliteness disables fetching and using the robots.txt policies of 65 | // hosts. 66 | DisablePoliteness bool 67 | 68 | // Default delay to use between requests to a same host if there is no robots.txt 69 | // crawl delay or if DisablePoliteness is true. 70 | CrawlDelay time.Duration 71 | 72 | // The *http.Client to use for the requests. If nil, defaults to the net/http 73 | // package's default client. Should be HTTPClient to comply with go lint, but 74 | // this is a breaking change, won't fix. 75 | HttpClient Doer 76 | 77 | // The user-agent string to use for robots.txt validation and URL fetching. 78 | UserAgent string 79 | 80 | // The time a host-dedicated worker goroutine can stay idle, with no Command to enqueue, 81 | // before it is stopped and cleared from memory. 82 | WorkerIdleTTL time.Duration 83 | 84 | // AutoClose makes the fetcher close its queue automatically once the number 85 | // of hosts reach 0. A host is removed once it has been idle for WorkerIdleTTL 86 | // duration. 87 | AutoClose bool 88 | 89 | // q holds the Queue to send data to the fetcher and optionally close (stop) it. 90 | q *Queue 91 | // dbg is a channel used to push debug information. 
92 | dbgmu sync.Mutex 93 | dbg chan *DebugInfo 94 | debugging bool 95 | 96 | // hosts maps the host names to its dedicated requests channel, and mu protects 97 | // concurrent access to the hosts field. 98 | mu sync.Mutex 99 | hosts map[string]chan Command 100 | } 101 | 102 | // The DebugInfo holds information to introspect the Fetcher's state. 103 | type DebugInfo struct { 104 | NumHosts int 105 | } 106 | 107 | // New returns an initialized Fetcher. 108 | func New(h Handler) *Fetcher { 109 | return &Fetcher{ 110 | Handler: h, 111 | CrawlDelay: DefaultCrawlDelay, 112 | HttpClient: http.DefaultClient, 113 | UserAgent: DefaultUserAgent, 114 | WorkerIdleTTL: DefaultWorkerIdleTTL, 115 | dbg: make(chan *DebugInfo, 1), 116 | } 117 | } 118 | 119 | // Queue offers methods to send Commands to the Fetcher, and to Stop the crawling process. 120 | // It is safe to use from concurrent goroutines. 121 | type Queue struct { 122 | ch chan Command 123 | 124 | // signal channels 125 | closed, cancelled, done chan struct{} 126 | 127 | wg sync.WaitGroup 128 | } 129 | 130 | // Close closes the Queue so that no more Commands can be sent. It blocks until 131 | // the Fetcher drains all pending commands. After the call, the Fetcher is stopped. 132 | // Attempts to enqueue new URLs after Close has been called will always result in 133 | // a ErrQueueClosed error. 134 | func (q *Queue) Close() error { 135 | // Make sure it is not already closed, as this is a run-time panic 136 | select { 137 | case <-q.closed: 138 | // Already closed, no-op 139 | return nil 140 | default: 141 | // Close the signal-channel 142 | close(q.closed) 143 | // Send a nil Command to make sure the processQueue method sees the close signal. 144 | q.ch <- nil 145 | // Wait for the Fetcher to drain. 
146 | q.wg.Wait() 147 | // Unblock any callers waiting on q.Block 148 | close(q.done) 149 | return nil 150 | } 151 | } 152 | 153 | // Block blocks the current goroutine until the Queue is closed and all pending 154 | // commands are drained. 155 | func (q *Queue) Block() { 156 | <-q.done 157 | } 158 | 159 | // Done returns a channel that is closed when the Queue is closed (either 160 | // via Close or Cancel). Multiple calls always return the same channel. 161 | func (q *Queue) Done() <-chan struct{} { 162 | return q.done 163 | } 164 | 165 | // Cancel closes the Queue and drains the pending commands without processing 166 | // them, allowing for a fast "stop immediately"-ish operation. 167 | func (q *Queue) Cancel() error { 168 | select { 169 | case <-q.cancelled: 170 | // already cancelled, no-op 171 | return nil 172 | default: 173 | // mark the queue as cancelled 174 | close(q.cancelled) 175 | // Close the Queue, that will wait for pending commands to drain 176 | // will unblock any callers waiting on q.Block 177 | return q.Close() 178 | } 179 | } 180 | 181 | // Send enqueues a Command into the Fetcher. If the Queue has been closed, it 182 | // returns ErrQueueClosed. The Command's URL must have a Host. 183 | func (q *Queue) Send(c Command) error { 184 | if c == nil { 185 | return ErrEmptyHost 186 | } 187 | if u := c.URL(); u == nil || u.Host == "" { 188 | return ErrEmptyHost 189 | } 190 | select { 191 | case <-q.closed: 192 | return ErrQueueClosed 193 | default: 194 | q.ch <- c 195 | } 196 | return nil 197 | } 198 | 199 | // SendString enqueues a method and some URL strings into the Fetcher. It returns an error 200 | // if the URL string cannot be parsed, or if the Queue has been closed. 201 | // The first return value is the number of URLs successfully enqueued. 
202 | func (q *Queue) SendString(method string, rawurl ...string) (int, error) { 203 | return q.sendWithMethod(method, rawurl) 204 | } 205 | 206 | // SendStringHead enqueues the URL strings to be fetched with a HEAD method. 207 | // It returns an error if the URL string cannot be parsed, or if the Queue has been closed. 208 | // The first return value is the number of URLs successfully enqueued. 209 | func (q *Queue) SendStringHead(rawurl ...string) (int, error) { 210 | return q.sendWithMethod("HEAD", rawurl) 211 | } 212 | 213 | // SendStringGet enqueues the URL strings to be fetched with a GET method. 214 | // It returns an error if the URL string cannot be parsed, or if the Queue has been closed. 215 | // The first return value is the number of URLs successfully enqueued. 216 | func (q *Queue) SendStringGet(rawurl ...string) (int, error) { 217 | return q.sendWithMethod("GET", rawurl) 218 | } 219 | 220 | // Parses the URL strings and enqueues them as *Cmd. It returns the number of URLs 221 | // successfully enqueued, and an error if the URL string cannot be parsed or 222 | // the Queue has been closed. 223 | func (q *Queue) sendWithMethod(method string, rawurl []string) (int, error) { 224 | for i, v := range rawurl { 225 | parsed, err := url.Parse(v) 226 | if err != nil { 227 | return i, err 228 | } 229 | if err := q.Send(&Cmd{U: parsed, M: method}); err != nil { 230 | return i, err 231 | } 232 | } 233 | return len(rawurl), nil 234 | } 235 | 236 | // Start starts the Fetcher, and returns the Queue to use to send Commands to be fetched. 237 | func (f *Fetcher) Start() *Queue { 238 | f.hosts = make(map[string]chan Command) 239 | 240 | f.q = &Queue{ 241 | ch: make(chan Command, 1), 242 | closed: make(chan struct{}), 243 | cancelled: make(chan struct{}), 244 | done: make(chan struct{}), 245 | } 246 | 247 | // Start the one and only queue processing goroutine. 
248 | f.q.wg.Add(1) 249 | go f.processQueue() 250 | 251 | return f.q 252 | } 253 | 254 | // Debug returns the channel to use to receive the debugging information. It is not intended 255 | // to be used by package users. 256 | func (f *Fetcher) Debug() <-chan *DebugInfo { 257 | f.dbgmu.Lock() 258 | defer f.dbgmu.Unlock() 259 | f.debugging = true 260 | return f.dbg 261 | } 262 | 263 | // processQueue runs the queue in its own goroutine. 264 | func (f *Fetcher) processQueue() { 265 | loop: 266 | for v := range f.q.ch { 267 | if v == nil { 268 | // Special case, when the Queue is closed, a nil command is sent, use this 269 | // indicator to check for the closed signal, instead of looking on every loop. 270 | select { 271 | case <-f.q.closed: 272 | // Close signal, exit loop 273 | break loop 274 | default: 275 | // Keep going 276 | } 277 | } 278 | select { 279 | case <-f.q.cancelled: 280 | // queue got cancelled, drain 281 | continue 282 | default: 283 | // go on 284 | } 285 | 286 | // Get the URL to enqueue 287 | u := v.URL() 288 | 289 | // Check if a channel is already started for this host 290 | f.mu.Lock() 291 | in, ok := f.hosts[u.Host] 292 | if !ok { 293 | // Start a new channel and goroutine for this host. 294 | 295 | var rob *url.URL 296 | if !f.DisablePoliteness { 297 | // Must send the robots.txt request. 298 | rob = u.ResolveReference(robotsTxtParsedPath) 299 | } 300 | 301 | // Create the infinite queue: the in channel to send on, and the out channel 302 | // to read from in the host's goroutine, and add to the hosts map 303 | var out chan Command 304 | in, out = make(chan Command, 1), make(chan Command, 1) 305 | f.hosts[u.Host] = in 306 | f.mu.Unlock() 307 | f.q.wg.Add(1) 308 | // Start the infinite queue goroutine for this host 309 | go sliceIQ(in, out) 310 | // Start the working goroutine for this host 311 | go f.processChan(out, u.Host) 312 | 313 | if !f.DisablePoliteness { 314 | // Enqueue the robots.txt request first. 
315 | in <- robotCommand{&Cmd{U: rob, M: "GET"}} 316 | } 317 | } else { 318 | f.mu.Unlock() 319 | } 320 | // Send the request 321 | in <- v 322 | 323 | // Send debug info, but do not block if full 324 | f.dbgmu.Lock() 325 | if f.debugging { 326 | f.mu.Lock() 327 | select { 328 | case f.dbg <- &DebugInfo{len(f.hosts)}: 329 | default: 330 | } 331 | f.mu.Unlock() 332 | } 333 | f.dbgmu.Unlock() 334 | } 335 | 336 | // Close all host channels now that it is impossible to send on those. Those are the `in` 337 | // channels of the infinite queue. It will then drain any pending events, triggering the 338 | // handlers for each in the worker goro, and then the infinite queue goro will terminate 339 | // and close the `out` channel, which in turn will terminate the worker goro. 340 | f.mu.Lock() 341 | for _, ch := range f.hosts { 342 | close(ch) 343 | } 344 | f.hosts = make(map[string]chan Command) 345 | f.mu.Unlock() 346 | 347 | f.q.wg.Done() 348 | } 349 | 350 | // Goroutine for a host's worker, processing requests for all its URLs. 351 | func (f *Fetcher) processChan(ch <-chan Command, hostKey string) { 352 | var ( 353 | agent *robotstxt.Group 354 | wait <-chan time.Time 355 | ttl <-chan time.Time 356 | delay = f.CrawlDelay 357 | ) 358 | 359 | loop: 360 | for { 361 | select { 362 | case <-f.q.cancelled: 363 | break loop 364 | case v, ok := <-ch: 365 | if !ok { 366 | // Terminate this goroutine, channel is closed 367 | break loop 368 | } 369 | 370 | // Wait for the prescribed delay 371 | if wait != nil { 372 | <-wait 373 | } 374 | 375 | // was it cancelled during the wait? 
check again 376 | select { 377 | case <-f.q.cancelled: 378 | break loop 379 | default: 380 | // go on 381 | } 382 | 383 | switch r, ok := v.(robotCommand); { 384 | case ok: 385 | // This is the robots.txt request 386 | agent = f.getRobotAgent(r) 387 | // Initialize the crawl delay 388 | if agent != nil && agent.CrawlDelay > 0 { 389 | delay = agent.CrawlDelay 390 | } 391 | wait = time.After(delay) 392 | 393 | case agent == nil || agent.Test(v.URL().Path): 394 | // Path allowed, process the request 395 | res, err := f.doRequest(v) 396 | f.visit(v, res, err) 397 | // No delay on error - the remote host was not reached 398 | if err == nil { 399 | wait = time.After(delay) 400 | } else { 401 | wait = nil 402 | } 403 | 404 | default: 405 | // Path disallowed by robots.txt 406 | f.visit(v, nil, ErrDisallowed) 407 | wait = nil 408 | } 409 | // Every time a command is received, reset the ttl channel 410 | ttl = time.After(f.WorkerIdleTTL) 411 | 412 | case <-ttl: 413 | // Worker has been idle for WorkerIdleTTL, terminate it 414 | f.mu.Lock() 415 | inch, ok := f.hosts[hostKey] 416 | delete(f.hosts, hostKey) 417 | 418 | // Close the queue if AutoClose is set and there are no more hosts. 419 | if f.AutoClose && len(f.hosts) == 0 { 420 | go f.q.Close() 421 | } 422 | f.mu.Unlock() 423 | if ok { 424 | close(inch) 425 | } 426 | break loop 427 | } 428 | } 429 | 430 | // need to drain ch until it is closed, to prevent the producer goroutine 431 | // from leaking. 432 | for _ = range ch { 433 | } 434 | 435 | f.q.wg.Done() 436 | } 437 | 438 | // Get the robots.txt User-Agent-specific group. 439 | func (f *Fetcher) getRobotAgent(r robotCommand) *robotstxt.Group { 440 | res, err := f.doRequest(r) 441 | if err != nil { 442 | // TODO: Ignore robots.txt request error? 
443 | fmt.Fprintf(os.Stderr, "fetchbot: error fetching robots.txt: %s\n", err) 444 | return nil 445 | } 446 | if res.Body != nil { 447 | defer res.Body.Close() 448 | } 449 | robData, err := robotstxt.FromResponse(res) 450 | if err != nil { 451 | // TODO : Ignore robots.txt parse error? 452 | fmt.Fprintf(os.Stderr, "fetchbot: error parsing robots.txt: %s\n", err) 453 | return nil 454 | } 455 | return robData.FindGroup(f.UserAgent) 456 | } 457 | 458 | // Call the Handler for this Command. Closes the response's body. 459 | func (f *Fetcher) visit(cmd Command, res *http.Response, err error) { 460 | if res != nil && res.Body != nil { 461 | defer res.Body.Close() 462 | } 463 | // if the Command implements Handler, call that handler, otherwise 464 | // dispatch to the Fetcher's Handler. 465 | if h, ok := cmd.(Handler); ok { 466 | h.Handle(&Context{Cmd: cmd, Q: f.q}, res, err) 467 | return 468 | } 469 | f.Handler.Handle(&Context{Cmd: cmd, Q: f.q}, res, err) 470 | } 471 | 472 | // Prepare and execute the request for this Command. 473 | func (f *Fetcher) doRequest(cmd Command) (*http.Response, error) { 474 | req, err := http.NewRequest(cmd.Method(), cmd.URL().String(), nil) 475 | if err != nil { 476 | return nil, err 477 | } 478 | // If the Command implements some other recognized interfaces, set 479 | // the request accordingly (see cmd.go for the list of interfaces). 480 | // First, the Header values. 481 | if hd, ok := cmd.(HeaderProvider); ok { 482 | for k, v := range hd.Header() { 483 | req.Header[k] = v 484 | } 485 | } 486 | // BasicAuth has higher priority than an Authorization header set by 487 | // a HeaderProvider. 488 | if ba, ok := cmd.(BasicAuthProvider); ok { 489 | req.SetBasicAuth(ba.BasicAuth()) 490 | } 491 | // Cookies are added to the request, even if some cookies were set 492 | // by a HeaderProvider. 
493 | if ck, ok := cmd.(CookiesProvider); ok { 494 | for _, c := range ck.Cookies() { 495 | req.AddCookie(c) 496 | } 497 | } 498 | // For the body of the request, ReaderProvider has higher priority 499 | // than ValuesProvider. 500 | if rd, ok := cmd.(ReaderProvider); ok { 501 | rdr := rd.Reader() 502 | rc, ok := rdr.(io.ReadCloser) 503 | if !ok { 504 | rc = ioutil.NopCloser(rdr) 505 | } 506 | req.Body = rc 507 | } else if val, ok := cmd.(ValuesProvider); ok { 508 | v := val.Values() 509 | req.Body = ioutil.NopCloser(strings.NewReader(v.Encode())) 510 | if req.Header.Get("Content-Type") == "" { 511 | req.Header.Set("Content-Type", "application/x-www-form-urlencoded") 512 | } 513 | } 514 | // If there was no User-Agent implicitly set by the HeaderProvider, 515 | // set it to the default value. 516 | if req.Header.Get("User-Agent") == "" { 517 | req.Header.Set("User-Agent", f.UserAgent) 518 | } 519 | // Do the request. 520 | res, err := f.HttpClient.Do(req) 521 | if err != nil { 522 | return nil, err 523 | } 524 | return res, nil 525 | } 526 | -------------------------------------------------------------------------------- /fetch_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Martin Angers and Contributors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package fetchbot 6 | 7 | import ( 8 | "fmt" 9 | "io/ioutil" 10 | "net/http" 11 | "net/http/httptest" 12 | "net/url" 13 | "runtime" 14 | "strconv" 15 | "sync" 16 | "testing" 17 | "time" 18 | ) 19 | 20 | type spyHandler struct { 21 | mu sync.Mutex 22 | cmds []Command 23 | errs []error 24 | res []*http.Response 25 | bodies []string 26 | fn Handler 27 | } 28 | 29 | func (sh *spyHandler) Handle(ctx *Context, res *http.Response, err error) { 30 | sh.mu.Lock() 31 | sh.cmds = append(sh.cmds, ctx.Cmd) 32 | sh.errs = append(sh.errs, err) 33 | sh.res = append(sh.res, res) 34 | if res == nil { 35 | sh.bodies = append(sh.bodies, "") 36 | } else { 37 | b, err := ioutil.ReadAll(res.Body) 38 | if err != nil { 39 | sh.bodies = append(sh.bodies, "") 40 | } 41 | sh.bodies = append(sh.bodies, string(b)) 42 | } 43 | sh.mu.Unlock() 44 | if sh.fn != nil { 45 | sh.fn.Handle(ctx, res, err) 46 | } 47 | } 48 | 49 | func (sh *spyHandler) Errors() int { 50 | sh.mu.Lock() 51 | defer sh.mu.Unlock() 52 | cnt := 0 53 | for _, e := range sh.errs { 54 | if e != nil { 55 | cnt++ 56 | } 57 | } 58 | return cnt 59 | } 60 | 61 | func (sh *spyHandler) CommandFor(rawurl string) Command { 62 | sh.mu.Lock() 63 | defer sh.mu.Unlock() 64 | for _, c := range sh.cmds { 65 | if c.URL().String() == rawurl { 66 | return c 67 | } 68 | } 69 | return nil 70 | } 71 | 72 | func (sh *spyHandler) ErrorFor(rawurl string) error { 73 | sh.mu.Lock() 74 | defer sh.mu.Unlock() 75 | ix := -1 76 | for i, c := range sh.cmds { 77 | if c.URL().String() == rawurl { 78 | ix = i 79 | break 80 | } 81 | } 82 | if ix >= 0 { 83 | return sh.errs[ix] 84 | } 85 | return nil 86 | } 87 | 88 | func (sh *spyHandler) StatusFor(rawurl string) int { 89 | sh.mu.Lock() 90 | defer sh.mu.Unlock() 91 | ix := -1 92 | for i, c := range sh.cmds { 93 | if c.URL().String() == rawurl { 94 | ix = i 95 | break 96 | } 97 | } 98 | if ix >= 0 && sh.res[ix] != nil { 99 | return sh.res[ix].StatusCode 100 | } 101 | return -1 102 | } 103 | 104 | func (sh 
*spyHandler) BodyFor(rawurl string) string { 105 | sh.mu.Lock() 106 | defer sh.mu.Unlock() 107 | ix := -1 108 | for i, c := range sh.cmds { 109 | if c.URL().String() == rawurl { 110 | ix = i 111 | break 112 | } 113 | } 114 | if ix >= 0 { 115 | return sh.bodies[ix] 116 | } 117 | return "" 118 | } 119 | 120 | func (sh *spyHandler) CalledWithExactly(rawurl ...string) bool { 121 | sh.mu.Lock() 122 | defer sh.mu.Unlock() 123 | if len(sh.cmds) != len(rawurl) { 124 | return false 125 | } 126 | for _, u := range rawurl { 127 | ok := false 128 | for _, c := range sh.cmds { 129 | if u == c.URL().String() { 130 | ok = true 131 | break 132 | } 133 | } 134 | if !ok { 135 | return false 136 | } 137 | } 138 | return true 139 | } 140 | 141 | var nopHandler = HandlerFunc(func(ctx *Context, res *http.Response, err error) {}) 142 | 143 | // Test that an initialized Fetcher has the right defaults. 144 | func TestNew(t *testing.T) { 145 | f := New(nopHandler) 146 | if f.CrawlDelay != DefaultCrawlDelay { 147 | t.Errorf("expected CrawlDelay to be %s, got %s", DefaultCrawlDelay, f.CrawlDelay) 148 | } 149 | if f.HttpClient != http.DefaultClient { 150 | t.Errorf("expected HttpClient to be %v (default net/http client), got %v", http.DefaultClient, f.HttpClient) 151 | } 152 | if f.UserAgent != DefaultUserAgent { 153 | t.Errorf("expected UserAgent to be %s, got %s", DefaultUserAgent, f.UserAgent) 154 | } 155 | if f.WorkerIdleTTL != DefaultWorkerIdleTTL { 156 | t.Errorf("expected WorkerIdleTTL to be %s, got %s", DefaultWorkerIdleTTL, f.WorkerIdleTTL) 157 | } 158 | } 159 | 160 | func TestQueueClosed(t *testing.T) { 161 | f := New(nil) 162 | q := f.Start() 163 | q.Close() 164 | _, err := q.SendStringGet("http://host/a") 165 | if err != ErrQueueClosed { 166 | t.Errorf("expected error %s, got %v", ErrQueueClosed, err) 167 | } 168 | // Test that closing a closed Queue doesn't panic 169 | q.Close() 170 | } 171 | 172 | func TestBlock(t *testing.T) { 173 | // Start a test server 174 | srv := 
httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 175 | w.Write([]byte("ok")) 176 | })) 177 | defer srv.Close() 178 | 179 | // Define the raw URLs to enqueue 180 | cases := []string{srv.URL + "/a", srv.URL + "/b", srv.URL + "/c"} 181 | 182 | // Start the Fetcher 183 | sh := &spyHandler{} 184 | f := New(sh) 185 | f.CrawlDelay = 0 186 | q := f.Start() 187 | _, err := q.SendStringGet(cases...) 188 | if err != nil { 189 | t.Fatal(err) 190 | } 191 | var mu sync.Mutex 192 | ok := false 193 | go func() { 194 | q.Block() 195 | mu.Lock() 196 | ok = true 197 | mu.Unlock() 198 | }() 199 | time.Sleep(100 * time.Millisecond) 200 | q.Close() 201 | time.Sleep(100 * time.Millisecond) 202 | // Assert that the handler got called with all cases 203 | if ok := sh.CalledWithExactly(cases...); !ok { 204 | t.Error("expected handler to be called with all cases") 205 | } 206 | // Expect 0 error 207 | if cnt := sh.Errors(); cnt != 0 { 208 | t.Errorf("expected no error, got %d", cnt) 209 | } 210 | // Expect ok to be true 211 | mu.Lock() 212 | if !ok { 213 | t.Error("expected flag to be set to true after Block release, got false") 214 | } 215 | mu.Unlock() 216 | } 217 | 218 | func TestSendVariadic(t *testing.T) { 219 | // Start a test server 220 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 221 | w.Write([]byte("ok")) 222 | })) 223 | defer srv.Close() 224 | 225 | // Define the raw URLs to enqueue 226 | cases := []string{srv.URL + "/a", srv.URL + "/b", "/nohost", ":"} 227 | handled := cases[:len(cases)-2] 228 | 229 | // Start the Fetcher 230 | sh := &spyHandler{} 231 | f := New(sh) 232 | f.CrawlDelay = 0 233 | q := f.Start() 234 | n, err := q.SendStringGet(cases...) 
235 | if n != 2 { 236 | t.Errorf("expected %d URLs enqueued, got %d", 2, n) 237 | } 238 | if err != ErrEmptyHost { 239 | t.Errorf("expected %v, got %v", ErrEmptyHost, err) 240 | } 241 | // Stop to wait for all commands to be processed 242 | q.Close() 243 | // Assert that the handler got called with the right values 244 | if ok := sh.CalledWithExactly(handled...); !ok { 245 | t.Error("expected handler to be called with valid cases") 246 | } 247 | // Expect no error 248 | if cnt := sh.Errors(); cnt != 0 { 249 | t.Errorf("expected no error, got %d", cnt) 250 | } 251 | } 252 | 253 | func TestUserAgent(t *testing.T) { 254 | // Start a test server 255 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 256 | w.Write([]byte("ok")) 257 | })) 258 | defer srv.Close() 259 | 260 | // Define the raw URLs to enqueue 261 | cases := []string{srv.URL + "/a"} 262 | 263 | // Start the Fetcher 264 | f := New(nil) 265 | sh := &spyHandler{fn: HandlerFunc(func(ctx *Context, res *http.Response, err error) { 266 | if f.UserAgent != res.Request.UserAgent() { 267 | t.Errorf("expected user agent %s, got %s", f.UserAgent, res.Request.UserAgent()) 268 | } 269 | })} 270 | f.Handler = sh 271 | f.CrawlDelay = 0 272 | f.UserAgent = "test" 273 | q := f.Start() 274 | q.SendStringGet(cases...) 
275 | // Stop to wait for all commands to be processed 276 | q.Close() 277 | // Assert that the handler got called with the right values 278 | if ok := sh.CalledWithExactly(cases...); !ok { 279 | t.Error("expected handler to be called with all cases") 280 | } 281 | // Assert that there was no error 282 | if cnt := sh.Errors(); cnt > 0 { 283 | t.Errorf("expected no errors, got %d", cnt) 284 | } 285 | } 286 | 287 | func TestSendString(t *testing.T) { 288 | // Start a test server 289 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 290 | w.Write([]byte("ok")) 291 | })) 292 | defer srv.Close() 293 | 294 | // Define the raw URLs to enqueue 295 | cases := []string{srv.URL + "/a", srv.URL + "/b", srv.URL + "/c"} 296 | 297 | // Start the Fetcher 298 | sh := &spyHandler{} 299 | f := New(sh) 300 | f.CrawlDelay = 0 301 | q := f.Start() 302 | for _, c := range cases { 303 | _, err := q.SendString("GET", c) 304 | if err != nil { 305 | t.Fatal(err) 306 | } 307 | } 308 | // Stop to wait for all commands to be processed 309 | q.Close() 310 | // Assert that the handler got called with the right values 311 | if ok := sh.CalledWithExactly(cases...); !ok { 312 | t.Error("expected handler to be called with all cases") 313 | } 314 | // Assert that there was no error 315 | if cnt := sh.Errors(); cnt > 0 { 316 | t.Errorf("expected no errors, got %d", cnt) 317 | } 318 | } 319 | 320 | func TestFetchDisallowed(t *testing.T) { 321 | // Start 2 test servers 322 | srvDisAll := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 323 | if r.URL.Path == "/robots.txt" { 324 | w.Write([]byte(` 325 | User-agent: * 326 | Disallow: / 327 | `)) 328 | return 329 | } 330 | w.Write([]byte("ok")) 331 | })) 332 | defer srvDisAll.Close() 333 | srvAllSome := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 334 | if r.URL.Path == "/robots.txt" { 335 | w.Write([]byte(` 336 | User-agent: Googlebot 337 | 
Disallow: / 338 | 339 | User-agent: Fetchbot 340 | Disallow: /a 341 | `)) 342 | return 343 | } 344 | w.Write([]byte("ok")) 345 | })) 346 | defer srvAllSome.Close() 347 | 348 | // Define the raw URLs to enqueue 349 | cases := []string{srvDisAll.URL + "/a", srvDisAll.URL + "/b", srvAllSome.URL + "/a", srvAllSome.URL + "/b"} 350 | 351 | // Start the Fetcher 352 | sh := &spyHandler{} 353 | f := New(sh) 354 | f.CrawlDelay = 0 355 | q := f.Start() 356 | for _, c := range cases { 357 | _, err := q.SendString("GET", c) 358 | if err != nil { 359 | t.Fatal(err) 360 | } 361 | } 362 | // Stop to wait for all commands to be processed 363 | q.Close() 364 | // Assert that the handler got called with the right values 365 | if ok := sh.CalledWithExactly(cases...); !ok { 366 | t.Error("expected handler to be called with all cases") 367 | } 368 | // Assert that there was the correct number of expected errors 369 | if cnt := sh.Errors(); cnt != 3 { 370 | t.Errorf("expected 3 errors, got %d", cnt) 371 | } 372 | for i := 0; i < 3; i++ { 373 | if err := sh.ErrorFor(cases[i]); err != ErrDisallowed { 374 | t.Errorf("expected error %s for %s, got %v", ErrDisallowed, cases[i], err) 375 | } 376 | } 377 | } 378 | 379 | func TestCrawlDelay(t *testing.T) { 380 | // Start a test server 381 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 382 | if r.URL.Path == "/robots.txt" { 383 | w.Write([]byte(` 384 | User-agent: Fetchbot 385 | Crawl-delay: 1 386 | `)) 387 | return 388 | } 389 | w.Write([]byte("ok")) 390 | })) 391 | defer srv.Close() 392 | 393 | // Define the raw URLs to enqueue 394 | cases := []string{srv.URL + "/a", srv.URL + "/b"} 395 | 396 | // Start the Fetcher 397 | sh := &spyHandler{} 398 | f := New(sh) 399 | f.CrawlDelay = 0 400 | start := time.Now() 401 | q := f.Start() 402 | _, err := q.SendStringGet(cases...) 
403 | if err != nil { 404 | t.Fatal(err) 405 | } 406 | // Stop to wait for all commands to be processed 407 | q.Close() 408 | delay := time.Now().Sub(start) 409 | // Assert that the handler got called with the right values 410 | if ok := sh.CalledWithExactly(cases...); !ok { 411 | t.Error("expected handler to be called with all cases") 412 | } 413 | // Assert that there was no error 414 | if cnt := sh.Errors(); cnt > 0 { 415 | t.Errorf("expected no errors, got %d", cnt) 416 | } 417 | // Assert that the total elapsed time is around 2 seconds 418 | if delay < 2*time.Second || delay > (2*time.Second+100*time.Millisecond) { 419 | t.Errorf("expected delay to be around 2s, got %s", delay) 420 | } 421 | } 422 | 423 | func TestManyCrawlDelays(t *testing.T) { 424 | // Skip if -short flag is set 425 | if testing.Short() { 426 | t.SkipNow() 427 | } 428 | // Start two test servers 429 | srv1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 430 | if r.URL.Path == "/robots.txt" { 431 | w.Write([]byte(` 432 | User-agent: Fetchbot 433 | Crawl-delay: 1 434 | `)) 435 | return 436 | } 437 | w.Write([]byte("ok")) 438 | })) 439 | defer srv1.Close() 440 | srv2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 441 | w.Write([]byte("ok")) 442 | })) 443 | defer srv2.Close() 444 | 445 | // Define the raw URLs to enqueue 446 | cases := []string{srv1.URL + "/a", srv1.URL + "/b", srv2.URL + "/a", srv2.URL + "/b"} 447 | 448 | // Start the Fetcher 449 | sh := &spyHandler{} 450 | f := New(sh) 451 | f.CrawlDelay = 2 * time.Second 452 | start := time.Now() 453 | q := f.Start() 454 | _, err := q.SendStringGet(cases...) 
455 | if err != nil { 456 | t.Fatal(err) 457 | } 458 | // Stop to wait for all commands to be processed 459 | q.Close() 460 | delay := time.Now().Sub(start) 461 | // Assert that the handler got called with the right values 462 | if ok := sh.CalledWithExactly(cases...); !ok { 463 | t.Error("expected handler to be called with all cases") 464 | } 465 | // Assert that there was no error 466 | if cnt := sh.Errors(); cnt > 0 { 467 | t.Errorf("expected no errors, got %d", cnt) 468 | } 469 | // Assert that the total elapsed time is around 4 seconds 470 | if delay < 4*time.Second || delay > (4*time.Second+100*time.Millisecond) { 471 | t.Errorf("expected delay to be around 4s, got %s", delay) 472 | } 473 | } 474 | 475 | // Custom Command for TestCustomCommand 476 | type IDCmd struct { 477 | *Cmd 478 | ID int 479 | } 480 | 481 | func TestCustomCommand(t *testing.T) { 482 | // Start a test server 483 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 484 | w.Write([]byte("ok")) 485 | })) 486 | defer srv.Close() 487 | 488 | // Define the raw URLs to enqueue 489 | cases := []string{srv.URL + "/a", srv.URL + "/b"} 490 | 491 | // Start the Fetcher 492 | sh := &spyHandler{} 493 | f := New(sh) 494 | f.CrawlDelay = 0 495 | q := f.Start() 496 | for i, c := range cases { 497 | parsed, err := url.Parse(c) 498 | if err != nil { 499 | t.Fatal(err) 500 | } 501 | q.Send(&IDCmd{&Cmd{U: parsed, M: "GET"}, i}) 502 | } 503 | // Stop to wait for all commands to be processed 504 | q.Close() 505 | // Assert that the handler got called with the right values 506 | if ok := sh.CalledWithExactly(cases...); !ok { 507 | t.Error("expected handler to be called with all cases") 508 | } 509 | // Assert that there was no error 510 | if cnt := sh.Errors(); cnt > 0 { 511 | t.Errorf("expected no errors, got %d", cnt) 512 | } 513 | // Assert that all commands got passed with the correct custom information 514 | for i, c := range cases { 515 | cmd := sh.CommandFor(c) 516 | 
if idc, ok := cmd.(*IDCmd); !ok { 517 | t.Errorf("expected command for %s to be an *IDCmd, got %T", c, cmd) 518 | } else if idc.ID != i { 519 | t.Errorf("expected command ID for %s to be %d, got %d", c, i, idc.ID) 520 | } 521 | } 522 | } 523 | 524 | func TestFreeIdleHost(t *testing.T) { 525 | // Start 2 test servers 526 | srv1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 527 | w.Write([]byte("ok")) 528 | })) 529 | defer srv1.Close() 530 | srv2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 531 | w.Write([]byte("ok")) 532 | })) 533 | defer srv2.Close() 534 | 535 | // Define the raw URLs to enqueue 536 | cases := []string{srv1.URL + "/a", srv2.URL + "/a"} 537 | 538 | // Start the Fetcher 539 | sh := &spyHandler{} 540 | f := New(sh) 541 | f.CrawlDelay = 0 542 | f.WorkerIdleTTL = 100 * time.Millisecond 543 | q := f.Start() 544 | for i, c := range cases { 545 | if i == 1 { 546 | // srv1 should now be removed 547 | f.mu.Lock() 548 | if _, ok := f.hosts[srv1.URL[len("http://"):]]; ok { 549 | t.Error("expected server srv1 to be removed from hosts") 550 | } 551 | f.mu.Unlock() 552 | } 553 | _, err := q.SendStringGet(c) 554 | if err != nil { 555 | t.Fatal(err) 556 | } 557 | time.Sleep(110 * time.Millisecond) 558 | } 559 | q.Close() 560 | // Assert that the handler got called with the right values 561 | if ok := sh.CalledWithExactly(cases...); !ok { 562 | t.Error("expected handler to be called with all cases") 563 | } 564 | // Assert that there was no error 565 | if cnt := sh.Errors(); cnt > 0 { 566 | t.Errorf("expected no errors, got %d", cnt) 567 | } 568 | } 569 | 570 | func TestRemoveHosts(t *testing.T) { 571 | // Start 2 test servers 572 | srv1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 573 | w.Write([]byte("ok")) 574 | })) 575 | defer srv1.Close() 576 | srv2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 577 | 
w.Write([]byte("ok")) 578 | })) 579 | defer srv2.Close() 580 | 581 | // Define the raw URLs to enqueue 582 | cases := []string{srv1.URL + "/a", srv2.URL + "/a"} 583 | 584 | // Start the Fetcher 585 | sh := &spyHandler{} 586 | f := New(sh) 587 | f.CrawlDelay = 0 588 | f.WorkerIdleTTL = 100 * time.Millisecond 589 | q := f.Start() 590 | for _, c := range cases { 591 | _, err := q.SendStringGet(c) 592 | if err != nil { 593 | t.Fatal(err) 594 | } 595 | time.Sleep(101 * time.Millisecond) 596 | } 597 | q.Close() 598 | // Assert that the handler got called with the right values 599 | if ok := sh.CalledWithExactly(cases...); !ok { 600 | t.Error("expected handler to be called with all cases") 601 | } 602 | // Assert that there was no error 603 | if cnt := sh.Errors(); cnt > 0 { 604 | t.Errorf("expected no errors, got %d", cnt) 605 | } 606 | // Assert that hosts are all removed 607 | if l := len(f.hosts); l > 0 { 608 | t.Errorf("expected hosts to be empty, got %d", l) 609 | } 610 | } 611 | 612 | func TestRestart(t *testing.T) { 613 | f := New(nil) 614 | f.CrawlDelay = 0 615 | for i := 0; i < 2; i++ { 616 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 617 | w.Write([]byte("ok")) 618 | })) 619 | cases := []string{srv.URL + "/a", srv.URL + "/b"} 620 | sh := &spyHandler{} 621 | f.Handler = sh 622 | q := f.Start() 623 | // Assert that the lists and maps are empty 624 | if len(f.hosts) != 0 { 625 | t.Errorf("run %d: expected clean slate after call to Start, found hosts=%d", i, len(f.hosts)) 626 | } 627 | _, err := q.SendStringGet(cases...) 
628 | if err != nil { 629 | t.Fatal(err) 630 | } 631 | q.Close() 632 | // Assert that the handler got called with the right values 633 | if ok := sh.CalledWithExactly(cases...); !ok { 634 | t.Error("expected handler to be called with all cases") 635 | } 636 | // Assert that there was no error 637 | if cnt := sh.Errors(); cnt > 0 { 638 | t.Errorf("expected no errors, got %d", cnt) 639 | } 640 | srv.Close() 641 | } 642 | } 643 | 644 | func TestOverflowBuffer(t *testing.T) { 645 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 646 | w.Write([]byte("ok")) 647 | })) 648 | defer srv.Close() 649 | cases := []string{srv.URL + "/a", srv.URL + "/b", srv.URL + "/c", srv.URL + "/d", srv.URL + "/e", srv.URL + "/f"} 650 | signal := make(chan struct{}) 651 | sh := &spyHandler{fn: HandlerFunc(func(ctx *Context, res *http.Response, err error) { 652 | if ctx.Cmd.URL().Path == "/a" { 653 | // Enqueue a bunch, while this host's goroutine is busy waiting for this call 654 | _, err := ctx.Q.SendStringGet(cases[1:]...) 
655 | if err != nil { 656 | t.Fatal(err) 657 | } 658 | close(signal) 659 | } 660 | })} 661 | f := New(sh) 662 | f.CrawlDelay = 0 663 | q := f.Start() 664 | _, err := q.SendStringGet(cases[0]) 665 | if err != nil { 666 | t.Fatal(err) 667 | } 668 | <-signal 669 | q.Close() 670 | // Assert that the handler got called with the right values 671 | if ok := sh.CalledWithExactly(cases...); !ok { 672 | t.Error("expected handler to be called with all cases") 673 | } 674 | // Assert that there was no error 675 | if cnt := sh.Errors(); cnt > 0 { 676 | t.Errorf("expected no errors, got %d", cnt) 677 | } 678 | } 679 | 680 | func TestCancel(t *testing.T) { 681 | srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 682 | w.Write([]byte("ok")) 683 | })) 684 | defer srv.Close() 685 | allowHandler := make(chan struct{}) 686 | allowCancel := make(chan struct{}) 687 | sh := &spyHandler{fn: HandlerFunc(func(ctx *Context, res *http.Response, err error) { 688 | // allow cancel as soon as /0 is received 689 | <-allowHandler 690 | if res.Request.URL.Path == "/0" { 691 | close(allowCancel) 692 | } 693 | })} 694 | 695 | f := New(sh) 696 | f.CrawlDelay = time.Second 697 | f.DisablePoliteness = true 698 | q := f.Start() 699 | // enqueue a bunch of URLs 700 | for i := 0; i < 1000; i++ { 701 | _, err := q.SendStringGet(srv.URL + "/" + strconv.Itoa(i)) 702 | if err != nil { 703 | t.Fatal(err) 704 | } 705 | } 706 | // allow one to proceed 707 | close(allowHandler) 708 | // wait for cancel signal 709 | <-allowCancel 710 | q.Cancel() 711 | 712 | // Assert that the handler got called with the right values 713 | if ok := sh.CalledWithExactly(srv.URL + "/0"); !ok { 714 | t.Error("expected handler to be called only with /0") 715 | } 716 | // Assert that there was no error 717 | if cnt := sh.Errors(); cnt > 0 { 718 | t.Errorf("expected no errors, got %d", cnt) 719 | } 720 | } 721 | 722 | type doerFunc func(*http.Request) (*http.Response, error) 723 | 724 | func (f 
doerFunc) Do(req *http.Request) (*http.Response, error) { 725 | return f(req) 726 | } 727 | 728 | func TestGoroLeak(t *testing.T) { 729 | callCount := 0 730 | f := New(HandlerFunc(func(c *Context, res *http.Response, err error) { 731 | // sleep a bit so that it produces faster than it consumes 732 | callCount++ 733 | time.Sleep(time.Millisecond) 734 | })) 735 | 736 | f.HttpClient = doerFunc(func(req *http.Request) (*http.Response, error) { 737 | return &http.Response{Request: req, StatusCode: 200}, nil 738 | }) 739 | 740 | f.DisablePoliteness = true 741 | f.CrawlDelay = 0 742 | 743 | startGoros := runtime.NumGoroutine() 744 | q := f.Start() 745 | 746 | // start a goro that enqueues a new URL (always on the same domain) 747 | // until Send fails. 748 | wg := sync.WaitGroup{} 749 | wg.Add(1) 750 | counter := 0 751 | go func() { 752 | defer wg.Done() 753 | for { 754 | counter++ 755 | _, err := q.SendStringGet(fmt.Sprintf("http://example.com/%d", counter)) 756 | if err != nil { 757 | return 758 | } 759 | } 760 | }() 761 | 762 | <-time.After(100 * time.Millisecond) 763 | q.Cancel() 764 | wg.Wait() 765 | 766 | // if the race detector is set, may fail if num goroutine checked 767 | // immediately. But under normal circumstances, the goroutines 768 | // are released when Cancel/Close returns. 
769 | time.Sleep(10 * time.Millisecond) 770 | 771 | cancelGoros := runtime.NumGoroutine() 772 | 773 | // should have sent a lot of URLs 774 | if counter < 10*callCount { 775 | t.Errorf("want many more Send than Calls, got %d and %d", counter, callCount) 776 | } 777 | // should have received between 10-100 calls 778 | if callCount < 10 || callCount > 100 { 779 | t.Errorf("want at least 10 and no more than 100 handler calls, got %d", callCount) 780 | } 781 | // should have the same number of goroutines as there was at the start 782 | if startGoros < cancelGoros { 783 | t.Errorf("want %d goros like there was at the start, got %d (leak)", startGoros, cancelGoros) 784 | } 785 | 786 | t.Logf("start: %d, cancel: %d, counter: %d, calls: %d", startGoros, cancelGoros, counter, callCount) 787 | } 788 | --------------------------------------------------------------------------------