├── .gitignore ├── crawler-arc.png ├── .env ├── storage.go ├── proxy_test.go ├── proxy.go ├── useragent.go ├── visited.go ├── queue.go ├── Dockerfile ├── go.mod ├── LICENSE ├── helper.go ├── Makefile ├── helper_test.go ├── parser.go ├── inmemory.go ├── README.md ├── inmemory_test.go ├── redis.go ├── main.go ├── go.sum ├── crawler.go └── dashboard.json /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | *.pprof 3 | .vscode/ -------------------------------------------------------------------------------- /crawler-arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bp72/crwl/HEAD/crawler-arc.png -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | export REDIS_HOST=192.168.1.140 2 | export REDIS_PORT=6379 3 | export REDIS_BASE=4 4 | export REDIS_PASS=ddlmaster 5 | -------------------------------------------------------------------------------- /storage.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // "context" 4 | 5 | type Storage interface { 6 | IsVisited() bool 7 | IsSeen() bool 8 | Add(Host int, Uri string) 9 | } 10 | -------------------------------------------------------------------------------- /proxy_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestProxy(t *testing.T) { 8 | p := &Proxy{User: "user", Password: "pass", Host: "127.0.0.1", Port: 8080} 9 | expected := "user:pass@127.0.0.1:8080" 10 | if p.String() != expected { 11 | t.Errorf("Test 'New Proxy. String method' failed. Wrong output. 
Got %s, expected %s", p.String(), expected) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /proxy.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "fmt" 4 | 5 | var proxies = []string{ 6 | ":@:", 7 | } 8 | 9 | type Proxy struct { 10 | User string 11 | Password string 12 | Host string 13 | Port int 14 | } 15 | 16 | func (p *Proxy) String() string { 17 | return fmt.Sprintf("%s:%s@%s:%d", p.User, p.Password, p.Host, p.Port) 18 | } 19 | 20 | type Proxies struct { 21 | Items []Proxy 22 | } 23 | -------------------------------------------------------------------------------- /useragent.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | func GetUserAgent(useGooglebot bool) string { 4 | if useGooglebot { 5 | return "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.179 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 6 | } 7 | return "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" 8 | } 9 | -------------------------------------------------------------------------------- /visited.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "time" 6 | ) 7 | 8 | type Visited interface { 9 | Add(ctx context.Context, uri string) 10 | Exists(ctx context.Context, uri string) bool 11 | } 12 | 13 | func NewVisited(useInternalCache bool) Visited { 14 | if useInternalCache { 15 | return NewInmemVisited() 16 | } 17 | 18 | return NewRedisCache( 19 | RedisConnectionParams{ 20 | Addr: *RedisAddr, 21 | Base: *RedisBase, 22 | Password: *RedisPass, 23 | Domain: *Domain, 24 | Timeout: 2000 * time.Millisecond, 25 | }, 26 | ) 27 | } 28 | -------------------------------------------------------------------------------- /queue.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "time" 6 | ) 7 | 8 | type Queue interface { 9 | Size(ctx context.Context) int64 10 | Put(ctx context.Context, t *Task) 11 | Take(ctx context.Context) (*Task, error) 12 | TaskDone(ctx context.Context) 13 | } 14 | 15 | func NewQueue(useInternalQueue bool) Queue { 16 | if useInternalQueue { 17 | return NewInmemQueue() 18 | } 19 | 20 | return NewRedisQueue( 21 | RedisConnectionParams{ 22 | Addr: *RedisAddr, 23 | Base: *RedisBase, 24 | Password: *RedisPass, 25 | Domain: *Domain, 26 | Timeout: 2000 * time.Millisecond, 27 | }, 28 | ) 29 | } 30 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.21-alpine as builder 2 | 3 | ARG VERSION=1.0.0 4 | ARG GITHASH="unknown" 5 | ARG BUILDAT="unknown" 6 | 7 | 8 | RUN apk update && apk add --no-cache git make gcc g++ 9 | WORKDIR $GOPATH/src/crwl/ 10 | COPY . . 
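# Dependency and build notes: the RUN go get step below pre-fetches module dependencies
# (with Go modules, 'go mod download' is a common alternative), and the go build step
# injects VERSION, GITHASH and BUILDAT into the binary via -ldflags "-X main.version=..."
# so that main.go can log the build metadata at startup.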
11 | RUN go get -d -v 12 | 13 | RUN go build -o /go/bin/crwl \ 14 | -ldflags "-X main.version=${VERSION} -X 'main.buildat=${BUILDAT}' -X 'main.githash=${GITHASH}'" \ 15 | inmemory.go redis.go storage.go helper.go proxy.go queue.go visited.go crawler.go main.go 16 | 17 | FROM alpine 18 | 19 | RUN apk update 20 | 21 | COPY --from=builder /go/bin/crwl /bin/crwl 22 | 23 | ENTRYPOINT ["/bin/crwl"] 24 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/bp72/crwl 2 | 3 | go 1.21.0 4 | 5 | require ( 6 | github.com/cactus/go-statsd-client/v5 v5.1.0 7 | github.com/redis/go-redis/v9 v9.1.0 8 | github.com/valyala/fasthttp v1.48.0 9 | golang.org/x/net v0.14.0 10 | ) 11 | 12 | require ( 13 | github.com/PuerkitoBio/goquery v1.8.1 // indirect 14 | github.com/andybalholm/brotli v1.0.5 // indirect 15 | github.com/andybalholm/cascadia v1.3.1 // indirect 16 | github.com/cespare/xxhash/v2 v2.2.0 // indirect 17 | github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect 18 | github.com/klauspost/compress v1.16.3 // indirect 19 | github.com/valyala/bytebufferpool v1.0.0 // indirect 20 | golang.org/x/text v0.12.0 // indirect 21 | ) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Pavel Bitiukov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /helper.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "path" 6 | "strings" 7 | ) 8 | 9 | type Site struct { 10 | BaseUrl string 11 | MaxDepth int 12 | KeywordPrefix string 13 | } 14 | 15 | func (s *Site) NewTask(Uri string, Depth int) (*Task, error) { 16 | if s.MaxDepth <= Depth { 17 | return nil, fmt.Errorf("MaxDepth for %s reached: %d", s.BaseUrl, s.MaxDepth) 18 | } 19 | 20 | if strings.HasPrefix(Uri, s.BaseUrl) { 21 | Uri = strings.Replace(Uri, s.BaseUrl, "", -1) 22 | } 23 | 24 | return &Task{Site: s, Uri: Uri, Depth: Depth}, nil 25 | } 26 | 27 | type Task struct { 28 | Uri string 29 | Site *Site 30 | Depth int 31 | } 32 | 33 | func (t *Task) GetUrl() string { 34 | if t.Uri[0] == '/' { 35 | return fmt.Sprintf("%s%s", t.Site.BaseUrl, t.Uri) 36 | } 37 | return fmt.Sprintf("%s/%s", t.Site.BaseUrl, t.Uri) 38 | } 39 | 40 | func (t *Task) GetSubTree() string { 41 | filename := path.Base(t.Uri) 42 | 43 | if len(filename) == 0 { 44 | return "i/n/d" 45 | } 46 | 47 | if len(filename) == 1 { 48 | return filename + "/0/0" 49 | } 50 | 51 | if len(filename) == 2 { 52 | return filename[:1] + "/" + filename[1:2] + "/0" 53 | } 54 | 55 | return filename[:1] + "/" + filename[1:2] + "/" + filename[2:3] 56 | } 57 | 58 | func (t *Task) GetFilename() string { 59 | if t.Uri == "/" || t.Uri == "" { 60 | return "index" 61 | } 62 | 63 | filename := strings.Replace(t.Uri, "/", "__", -1) 64 | if len(filename) > 255 { 65 | return filename[:255] 66 | } 67 | return filename 68 | } 69 | 70 | type Link struct { 71 | Href string 72 | Anchor []string 73 | } 74 | 75 | func NewLink() *Link { 76 | l := Link{ 77 | Anchor: make([]string, 0), 78 | } 79 | return &l 80 | } 81 | 82 | func (l *Link) GetAnchor() string { 83 | return strings.TrimSpace(strings.Join(l.Anchor, " ")) 84 | } 85 | 86 | func (l *Link) IsKeyword(Prefix string) bool { 87 | return strings.HasPrefix(l.Href, Prefix) 88 | } 89 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | DATE = $(shell date --iso=seconds) 2 | GITHASH = $(shell git rev-parse --short HEAD) 3 | VERSION = "1.0.0" 4 | SOURCES = inmemory.go redis.go storage.go helper.go proxy.go queue.go visited.go crawler.go parser.go main.go 5 | 6 | test: 7 | go test 8 | 9 | test-with-race: 10 | go test -race 11 | 12 | 13 | image: $(SOURCES) 14 | docker build . --file Dockerfile --tag crwl:$(GITHASH) \ 15 | --build-arg GITHASH=$(GITHASH) \ 16 | --build-arg VERSION=$(VERSION) \ 17 | --build-arg BUILDAT=$(DATE) 18 | 19 | docker build . 
--file Dockerfile --tag crwl:latest \ 20 | --build-arg GITHASH=$(GITHASH) \ 21 | --build-arg VERSION=$(VERSION) \ 22 | --build-arg BUILDAT=$(DATE) 23 | 24 | 25 | build: test 26 | go build -o bin/crwl \ 27 | -ldflags "-X main.version=$(VERSION) -X 'main.buildat=$(DATE)' -X 'main.githash=$(GITHASH)'" 28 | 29 | build-with-race: test-with-race 30 | go build -race -o bin/crwl \ 31 | -ldflags "-X main.version=$(VERSION) -X 'main.buildat=$(DATE)' -X 'main.githash=$(GITHASH)'" \ 32 | ${SOURCES} 33 | 34 | run: build 35 | bin/crwl -domain habr.com \ 36 | -use-redis \ 37 | -statsd-addr 192.168.1.140:8125 \ 38 | -store-path /tmp/crwl \ 39 | -redis-addr 192.168.1.140:6379 \ 40 | -redis-base 0 \ 41 | -redis-pass ddlmaster 42 | 43 | run-with-race: build-with-race 44 | bin/crwl -domain habr.com \ 45 | -use-redis \ 46 | -statsd-addr 192.168.1.140:8125 \ 47 | -store-path /tmp/crwl \ 48 | -redis-addr 192.168.1.140:6379 \ 49 | -redis-base 0 \ 50 | -redis-pass ddlmaster 51 | 52 | image-nas-repo: image 53 | docker tag crwl:$(GITHASH) 192.168.1.140:6088/crwl:$(GITHASH) 54 | docker push 192.168.1.140:6088/crwl:$(GITHASH) 55 | 56 | run-local: build 57 | bin/crwl -domain habr.com \ 58 | -use-internal-cache \ 59 | -use-internal-queue \ 60 | -statsd-addr 192.168.1.140:8125 \ 61 | -store-path /tmp/crwl \ 62 | -redis-addr 192.168.1.140:6379 \ 63 | -redis-base 0 \ 64 | -redis-pass ddlmaster \ 65 | -max-workers 2 -------------------------------------------------------------------------------- /helper_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestTask(t *testing.T) { 8 | 9 | task := &Task{ 10 | Uri: "/subdir/name-of-uri", 11 | Site: &Site{ 12 | BaseUrl: "http://domain.com", 13 | MaxDepth: 3, 14 | KeywordPrefix: "/db", 15 | }, 16 | Depth: 0, 17 | } 18 | 19 | expectUrl := "http://domain.com/subdir/name-of-uri" 20 | gotUrl := task.GetUrl() 21 | 22 | if gotUrl != expectUrl { 23 | t.Errorf("Test Task.GetUrl failed. Got %q, expected %q", gotUrl, expectUrl) 24 | } 25 | 26 | expectedSubdirTree := "n/a/m" 27 | gotSubdirTree := task.GetSubTree() 28 | 29 | if gotSubdirTree != expectedSubdirTree { 30 | t.Errorf("Test Task.GetSubdirTree() failed. Got %q, expected %q", gotSubdirTree, expectedSubdirTree) 31 | } 32 | 33 | expectedFilename := "__subdir__name-of-uri" 34 | gotFilename := task.GetFilename() 35 | 36 | if gotFilename != expectedFilename { 37 | t.Errorf("Test Task.GetFilename() failed. Got %q, expected %q", gotFilename, expectedFilename) 38 | } 39 | } 40 | 41 | func TestTaskSubdirs(t *testing.T) { 42 | task := &Task{ 43 | Uri: "/subdir/name-of-uri", 44 | Site: &Site{ 45 | BaseUrl: "http://domain.com", 46 | MaxDepth: 3, 47 | KeywordPrefix: "/db", 48 | }, 49 | Depth: 0, 50 | } 51 | 52 | task.Uri = "/a" 53 | expectedSubdirTree := "a/0/0" 54 | gotSubdirTree := task.GetSubTree() 55 | 56 | if gotSubdirTree != expectedSubdirTree { 57 | t.Errorf("Test Task.GetSubdirTree() failed. Got %q, expected %q", gotSubdirTree, expectedSubdirTree) 58 | } 59 | 60 | task.Uri = "/ab" 61 | expectedSubdirTree = "a/b/0" 62 | gotSubdirTree = task.GetSubTree() 63 | 64 | if gotSubdirTree != expectedSubdirTree { 65 | t.Errorf("Test Task.GetSubdirTree() failed. Got %q, expected %q", gotSubdirTree, expectedSubdirTree) 66 | } 67 | 68 | task.Uri = "/abc" 69 | expectedSubdirTree = "a/b/c" 70 | gotSubdirTree = task.GetSubTree() 71 | 72 | if gotSubdirTree != expectedSubdirTree { 73 | t.Errorf("Test Task.GetSubdirTree() failed. 
Got %q, expected %q", gotSubdirTree, expectedSubdirTree) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /parser.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "time" 7 | 8 | "github.com/PuerkitoBio/goquery" 9 | "github.com/cactus/go-statsd-client/v5/statsd" 10 | "golang.org/x/net/html" 11 | ) 12 | 13 | type Article struct { 14 | Title string 15 | URL string 16 | Category string 17 | } 18 | 19 | type LinkParser interface { 20 | ParseLinks(reader *bytes.Reader, t *Task) 21 | } 22 | 23 | type HtmlLinkParser struct { 24 | q Queue 25 | stats statsd.Statter 26 | } 27 | 28 | func (p *HtmlLinkParser) ParseLinks(reader *bytes.Reader, t *Task) { 29 | start := time.Now() 30 | Log.Info("start parsing", "owner", "HtmlLinkParser") 31 | defer p.stats.TimingDuration("crawl.parser", time.Since(start), 1.0, statsd.Tag{"domain", *Domain}) 32 | doc, err := goquery.NewDocumentFromReader(reader) 33 | 34 | if err != nil { 35 | Log.Error("create document from reader failed", "err", err) 36 | } 37 | 38 | Log.Info("parse document", "doc", doc) 39 | Unique := make(map[string]bool) 40 | 41 | doc.Find("a").Each(func(i int, sel *goquery.Selection) { 42 | Href, _ := sel.Attr("href") 43 | if _, exists := Unique[Href]; !exists { 44 | Unique[Href] = true 45 | nt, err := t.Site.NewTask(Href, t.Depth+1) 46 | if err == nil { 47 | go func() { 48 | Log.Info("new", "task", nt) 49 | localStart := time.Now() 50 | p.q.Put(context.Background(), nt) 51 | defer p.stats.TimingDuration("queue.put", time.Since(localStart), 1.0, statsd.Tag{"domain", *Domain}) 52 | }() 53 | } 54 | } 55 | }) 56 | } 57 | 58 | type HtmlLinkParser2 struct { 59 | q Queue 60 | stats statsd.Statter 61 | } 62 | 63 | func (p *HtmlLinkParser2) ParseLinks(reader *bytes.Reader, t *Task) { 64 | doc, err := html.Parse(reader) 65 | 66 | if err != nil { 67 | Log.Error("create document from reader failed", "parser", "HtmlLinkParser2", "err", err) 68 | } 69 | 70 | // Visit all nodes and extract links 71 | var links []string 72 | var f func(*html.Node) 73 | f = func(n *html.Node) { 74 | if n.Type == html.ElementNode && n.Data == "a" { 75 | for _, a := range n.Attr { 76 | if a.Key == "href" { 77 | links = append(links, a.Val) 78 | newtask, err := t.Site.NewTask(a.Val, t.Depth+1) 79 | if err == nil { 80 | p.q.Put(context.Background(), newtask) 81 | } 82 | break 83 | } 84 | } 85 | } 86 | for c := n.FirstChild; c != nil; c = c.NextSibling { 87 | f(c) 88 | } 89 | } 90 | f(doc) 91 | } 92 | -------------------------------------------------------------------------------- /inmemory.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "errors" 7 | "os" 8 | "strings" 9 | "sync" 10 | ) 11 | 12 | type Node struct { 13 | Value *Task 14 | Next *Node 15 | } 16 | 17 | type InmemQueue struct { 18 | head *Node 19 | tail *Node 20 | size int 21 | in_progress int 22 | lock sync.Mutex 23 | } 24 | 25 | func (q *InmemQueue) Add(ctx context.Context, t *Task) { 26 | // TODO: Deprecate Add method 27 | q.Put(ctx, t) 28 | } 29 | 30 | func (q *InmemQueue) Put(ctx context.Context, t *Task) { 31 | q.lock.Lock() 32 | defer q.lock.Unlock() 33 | 34 | node := &Node{Value: t} 35 | q.tail.Next = node 36 | q.tail = node 37 | q.size++ 38 | } 39 | 40 | func (q *InmemQueue) Take(ctx context.Context) (*Task, error) { 41 | q.lock.Lock() 42 | defer q.lock.Unlock() 43 | 44 | 
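// Note: the queue is a singly linked list with a sentinel head node (see NewInmemQueue):
// head never stores a task, so an empty queue is detected by head.Next == nil and the
// oldest task always sits at head.Next. Put appends at tail and Take pops from the front
// here, giving FIFO order while the mutex above is held.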
if q.head.Next == nil { 45 | return nil, errors.New("empty queue") 46 | } 47 | 48 | node := q.head.Next 49 | q.head.Next = node.Next 50 | node.Next = nil 51 | q.size-- 52 | q.in_progress++ 53 | 54 | return node.Value, nil 55 | } 56 | 57 | func (q *InmemQueue) Get(ctx context.Context) (*Task, error) { 58 | // TODO: Deprecate Get method 59 | return q.Take(ctx) 60 | } 61 | 62 | func (q *InmemQueue) Size(ctx context.Context) int64 { 63 | return int64(q.size + q.in_progress) 64 | } 65 | 66 | func (q *InmemQueue) TaskDone(ctx context.Context) { 67 | q.lock.Lock() 68 | defer q.lock.Unlock() 69 | 70 | q.in_progress-- 71 | } 72 | 73 | func (q *InmemQueue) LoadFromFile(site *Site, Filepath string) error { 74 | f, err := os.Open(Filepath) 75 | 76 | if err != nil { 77 | return err 78 | } 79 | defer f.Close() 80 | 81 | scanner := bufio.NewScanner(f) 82 | ctx := context.Background() 83 | for scanner.Scan() { 84 | items := strings.Split(scanner.Text(), "|||") 85 | task, err := site.NewTask(items[0], site.MaxDepth-1) 86 | if err != nil { 87 | continue 88 | } 89 | q.Put(ctx, task) 90 | } 91 | 92 | if err := scanner.Err(); err != nil { 93 | return err 94 | } 95 | 96 | return nil 97 | } 98 | 99 | func NewInmemQueue() *InmemQueue { 100 | q := &InmemQueue{} 101 | 102 | q.head = &Node{} 103 | q.tail = q.head 104 | 105 | return q 106 | } 107 | 108 | type InmemVisited struct { 109 | items map[string]bool 110 | lock sync.RWMutex 111 | } 112 | 113 | func (v *InmemVisited) Add(ctx context.Context, uri string) { 114 | v.lock.Lock() 115 | defer v.lock.Unlock() 116 | 117 | v.items[uri] = true 118 | } 119 | 120 | func (v *InmemVisited) Exists(ctx context.Context, uri string) bool { 121 | v.lock.RLock() 122 | defer v.lock.RUnlock() 123 | 124 | if _, exists := v.items[uri]; exists { 125 | return true 126 | } 127 | 128 | return false 129 | } 130 | 131 | func NewInmemVisited() *InmemVisited { 132 | v := &InmemVisited{ 133 | items: make(map[string]bool), 134 | } 135 | 136 | return v 137 | } 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Crawler

2 | 3 | **crwl** is an open source web crawler written in Go that lets you traverse an entire site. You can use it to scan, benchmark and validate your site, for example to evaluate its [connected components](https://en.wikipedia.org/wiki/Component_(graph_theory)) or [internal PageRank](https://en.wikipedia.org/wiki/PageRank) 4 | 5 | ### Motivation 6 | I needed to crawl sites as-is for several reasons: to capture the site structure as a graph, to validate it, and to benchmark it. 7 | 8 | # Get Started 9 | #### Clone repo 10 | ``` 11 | git clone git@github.com:bp72/crwl.git 12 | ``` 13 | 14 | #### Build 15 | ``` 16 | make build 17 | ``` 18 | 19 | #### Run 20 | ``` 21 | bin/crwl -domain example.com -use-internal-cache -max-depth 3 -max-workers 5 22 | ``` 23 | 24 | 25 | # Crawler arch 26 | ![crawler architecture](https://github.com/bp72/crwl/blob/feature/update-readme-to-provide-more-context/crawler-arc.png?raw=true) 27 | 28 | 29 | # Web Crawler Features 30 | - Start from the root domain and crawl pages down to a specified depth. 31 | - Save the crawled pages 32 | - Support logging and statsd metrics 33 | 34 | # TODO Features 35 | - Add WebUI to control and manage the crawler 36 | - Add crawl delay support per domain 37 | - Add a data storage interface to support FS, ClickHouse, RDB 38 | - Add logic to respect robots.txt 39 | - Add Grafana dashboard to repo 40 | - Add docker-compose to set up and run the crawler with its external service dependencies 41 | - Add a condition (e.g. keyword or URL pattern) for saving page content to storage 42 | 43 | 44 | # Options 45 | 46 | #### Benchmark/Test mode 47 | Sometimes you just need to traverse your site without storing the content, to check that everything works or how far the crawler can go. In this case use the **-do-not-store** option, which disables content storing: 48 | ``` 49 | bin/crwl -do-not-store 50 | ``` 51 | 52 | #### Setting up limits 53 | 54 | Maximum crawls 55 | Limits the number of pages to crawl to an exact number; the default is 100k pages 56 | ``` 57 | bin/crwl -max-crawl 1234 58 | ``` 59 | 60 | Maximum depth sets how deep the crawler can go; the default is 7 61 | ``` 62 | bin/crwl -max-depth 1 63 | ``` 64 | 65 | Maximum number of workers sets the limit of concurrent crawlers; the default is 20 66 | ``` 67 | bin/crwl -max-workers 2 68 | ``` 69 | 70 | #### Run without any external service dependency 71 | The crawler can run standalone (without other services), but this configuration is memory-bound, since it maintains the URL queue and the set of visited URLs in memory. 
72 | ``` 73 | bin/crwl -use-internal-cache 74 | ``` 75 | 76 | # Metrics and logging 77 | Crawler support statd metric publishing technique, to enable it: 78 | ``` 79 | bin/crwl -statsd-addr hostname:port 80 | ``` 81 | 82 | ### Roadmap 83 | - [x] Define crawler arch 84 | - [x] Implement initial crawler version 85 | - [ ] Add WebUI to control and manage crawler 86 | - [ ] Add Crawl delay support per domain 87 | - [ ] Add Data storage interface to support FS, ClickHouse, RDB 88 | - [ ] Respect robots.txt 89 | - [ ] Add Grafana dashboard to repo 90 | - [ ] Add docker-compose to setup and run crawler with external service dependencies 91 | -------------------------------------------------------------------------------- /inmemory_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | ) 7 | 8 | func createTestSite(maxDepth int) *Site { 9 | return &Site{ 10 | BaseUrl: "http://example.com", 11 | MaxDepth: maxDepth, 12 | KeywordPrefix: "/pref", 13 | } 14 | } 15 | 16 | func createTestTask(uri string, depth int, site *Site) *Task { 17 | return &Task{ 18 | Uri: uri, 19 | Site: site, 20 | Depth: depth, 21 | } 22 | } 23 | 24 | func TestQueuePut(t *testing.T) { 25 | site := createTestSite(3) 26 | tasks := []*Task{ 27 | createTestTask("uri1", 0, site), 28 | createTestTask("uri2", 0, site), 29 | createTestTask("uri3", 0, site), 30 | } 31 | 32 | q := NewInmemQueue() 33 | task1 := tasks[0] 34 | ctx := context.Background() 35 | for pos, task := range tasks { 36 | q.Put(ctx, task) 37 | if q.Size(ctx) != int64(pos+1) { 38 | t.Errorf("Test 'New Queue. Put task' failed. Invalid size. Got %d, expected %d", q.Size(ctx), pos+1) 39 | } 40 | if q.head.Next.Value != task1 { 41 | t.Errorf("Test 'New Queue. Put task' failed. Invalid head item. Got %v, expected %v", q.head.Next.Value, task1) 42 | } 43 | if q.tail.Value != task { 44 | t.Errorf("Test 'New Queue. Put task' failed. Invalid tail item. Got %v, expected %v", q.head.Next.Value, task) 45 | } 46 | } 47 | } 48 | 49 | func TestQueueTake(t *testing.T) { 50 | site := createTestSite(3) 51 | tasks := []*Task{ 52 | createTestTask("uri1", 0, site), 53 | createTestTask("uri2", 0, site), 54 | createTestTask("uri3", 0, site), 55 | } 56 | 57 | q := NewInmemQueue() 58 | ctx := context.Background() 59 | 60 | for _, task := range tasks { 61 | q.Put(ctx, task) 62 | } 63 | 64 | for pos, expectedTask := range tasks { 65 | task, err := q.Take(ctx) 66 | if err != nil { 67 | t.Errorf("Test 'New Queue. Take task' failed. Unexpected err %v", err) 68 | } 69 | if task != expectedTask { 70 | t.Errorf("Test 'New Queue. Take task' failed. Invalid task. Got %v, expected %v", task, expectedTask) 71 | } 72 | if q.in_progress != pos+1 { 73 | t.Errorf("Test 'New Queue. Take task' failed. Invalid in progress task number. Got %d, expected %d", q.in_progress, pos+1) 74 | } 75 | } 76 | 77 | for pos, _ := range tasks { 78 | q.TaskDone(ctx) 79 | if q.in_progress != len(tasks)-pos-1 { 80 | t.Errorf("Test 'New Queue. Take task' failed. Invalid in progress task number. Got %d, expected %d", q.in_progress, len(tasks)-pos) 81 | } 82 | } 83 | } 84 | 85 | func TestInmemVisited(t *testing.T) { 86 | v := NewInmemVisited() 87 | ctx := context.Background() 88 | 89 | v.Add(ctx, "1") 90 | 91 | if len(v.items) != 1 { 92 | t.Errorf("Test 'InmemVisited. Add' failed. Invalid size. Got %d, expected %d", len(v.items), 1) 93 | } 94 | 95 | v.Add(ctx, "1") 96 | if len(v.items) != 1 { 97 | t.Errorf("Test 'InmemVisited. 
Add' failed. Invalid size. Got %d, expected %d", len(v.items), 1) 98 | } 99 | 100 | v.Add(ctx, "2") 101 | if len(v.items) != 2 { 102 | t.Errorf("Test 'InmemVisited. Add' failed. Invalid size. Got %d, expected %d", len(v.items), 2) 103 | } 104 | 105 | exists := v.Exists(ctx, "1") 106 | if exists != true { 107 | t.Errorf("Test 'InmemVisited.Exists' failed. Invalid result. Got %v, expected %v", exists, true) 108 | } 109 | 110 | exists = v.Exists(ctx, "2") 111 | if exists != true { 112 | t.Errorf("Test 'InmemVisited.Exists' failed. Invalid result. Got %v, expected %v", exists, true) 113 | } 114 | 115 | exists = v.Exists(ctx, "3") 116 | if exists != false { 117 | t.Errorf("Test 'InmemVisited.Exists' failed. Invalid result. Got %v, expected %v", exists, false) 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /redis.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "sync" 8 | "time" 9 | 10 | "github.com/redis/go-redis/v9" 11 | ) 12 | 13 | type RedisConnectionParams struct { 14 | Addr string 15 | Password string 16 | Base int 17 | Domain string 18 | Timeout time.Duration 19 | } 20 | 21 | type RedisQueue struct { 22 | Domain string 23 | Key string 24 | Timeout time.Duration 25 | Client *redis.Client 26 | lock sync.Mutex 27 | InProgress int64 28 | } 29 | 30 | func (q *RedisQueue) Size(ctx context.Context) int64 { 31 | ctx, cancel := context.WithTimeout(ctx, q.Timeout) 32 | defer cancel() 33 | 34 | if val, err := q.Client.LLen(ctx, q.Key).Result(); err == nil { 35 | return val + q.InProgress 36 | } 37 | 38 | return q.InProgress 39 | } 40 | 41 | func (q *RedisQueue) Put(ctx context.Context, t *Task) { 42 | ctx, cancel := context.WithTimeout(ctx, q.Timeout) 43 | defer cancel() 44 | 45 | b, err := json.Marshal(t) 46 | if err != nil { 47 | fmt.Println(err) 48 | return 49 | } 50 | 51 | if err := q.Client.RPush(ctx, q.Key, b).Err(); err != nil { 52 | panic(err) 53 | } 54 | } 55 | 56 | func (q *RedisQueue) Take(ctx context.Context) (*Task, error) { 57 | ctx, cancel := context.WithTimeout(ctx, q.Timeout) 58 | defer cancel() 59 | 60 | q.lock.Lock() 61 | defer q.lock.Unlock() 62 | 63 | if val, err := q.Client.LPop(ctx, q.Key).Result(); err != nil { 64 | return nil, err 65 | } else { 66 | q.InProgress++ 67 | var task *Task 68 | _ = json.Unmarshal([]byte(val), &task) 69 | return task, nil 70 | } 71 | } 72 | 73 | func (q *RedisQueue) TaskDone(ctx context.Context) { 74 | q.lock.Lock() 75 | q.InProgress-- 76 | q.lock.Unlock() 77 | } 78 | 79 | func NewRedisQueue(p RedisConnectionParams) *RedisQueue { 80 | r := &RedisQueue{ 81 | Domain: p.Domain, 82 | Key: fmt.Sprintf("queue:%s", p.Domain), 83 | Timeout: p.Timeout, 84 | Client: redis.NewClient(&redis.Options{ 85 | Addr: p.Addr, 86 | Password: p.Password, 87 | DB: p.Base, 88 | WriteTimeout: time.Second * 5, 89 | ReadTimeout: time.Second * 5, 90 | DialTimeout: time.Second * 5, 91 | PoolTimeout: time.Second * 1, 92 | PoolSize: 1000, 93 | }), 94 | } 95 | 96 | return r 97 | } 98 | 99 | type RedisCache struct { 100 | Domain string 101 | Key string 102 | Timeout time.Duration 103 | TTL time.Duration 104 | Client *redis.Client 105 | } 106 | 107 | func (c *RedisCache) Add(ctx context.Context, uri string) { 108 | ctx, cancel := context.WithTimeout(ctx, c.Timeout) 109 | defer cancel() 110 | 111 | key := fmt.Sprintf("%s:%s", c.Key, uri) 112 | Log.Info("set", "key", key) 113 | if _, err := c.Client.Set(ctx, key, 
uri, c.TTL).Result(); err != nil { 114 | panic(err) 115 | } 116 | } 117 | 118 | func (c *RedisCache) Exists(ctx context.Context, uri string) bool { 119 | ctx, cancel := context.WithTimeout(ctx, c.Timeout) 120 | defer cancel() 121 | 122 | key := fmt.Sprintf("%s:%s", c.Key, uri) 123 | 124 | if _, err := c.Client.Get(ctx, key).Result(); err != nil { 125 | return false 126 | } 127 | 128 | return true 129 | } 130 | 131 | func NewRedisCache(p RedisConnectionParams) *RedisCache { 132 | r := &RedisCache{ 133 | Domain: p.Domain, 134 | Key: fmt.Sprintf("cache:%s", p.Domain), 135 | Timeout: p.Timeout, 136 | Client: redis.NewClient(&redis.Options{ 137 | Addr: p.Addr, 138 | Password: p.Password, 139 | DB: p.Base, 140 | WriteTimeout: time.Second * 5, 141 | ReadTimeout: time.Second * 5, 142 | DialTimeout: time.Second * 5, 143 | PoolTimeout: time.Second * 1, 144 | PoolSize: 1000, 145 | }), 146 | TTL: 24 * 7 * time.Hour, 147 | } 148 | 149 | return r 150 | } 151 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | /* 2 | TODO: 3 | [ ] Add Redis as External "Visited" 4 | [ ] Set "Visited" TTL to 86400*7 5 | [ ] Load proxy from external file 6 | */ 7 | package main 8 | 9 | import ( 10 | "context" 11 | "flag" 12 | "fmt" 13 | "log/slog" 14 | "path/filepath" 15 | 16 | "github.com/cactus/go-statsd-client/v5/statsd" 17 | 18 | // "math/rand" 19 | 20 | "os" 21 | ) 22 | 23 | var ( 24 | version = "alpha" 25 | buildat = "unknown" 26 | githash = "unknown" 27 | Log *slog.Logger 28 | Domain = flag.String("domain", "", "domain to scan") 29 | MaxDepth = flag.Int("max-depth", 7, "set max depth for crawling") 30 | MaxWorkers = flag.Int("max-workers", 20, "set max concurrent workers") 31 | UseGooglebot = flag.Bool("use-google-bot", false, "Run as Googlebot mode") 32 | DoNotUseProxy = flag.Bool("do-not-use-proxy", false, "Do not use proxy") 33 | DoNotStore = flag.Bool("do-not-store", false, "Do not store content") 34 | UseHttp = flag.Bool("use-http", false, "use http proto") 35 | Limit = flag.Int("max-crawl", 100000, "set max amount of page to crawl") 36 | RedisAddr = flag.String("redis-addr", "127.0.0.1:6379", "redis addr") 37 | RedisBase = flag.Int("redis-base", 0, "redis base") 38 | RedisPass = flag.String("redis-pass", "", "redis pass") 39 | UseInternalCache = flag.Bool("use-internal-cache", false, "Use internal cache instead of Redis") 40 | UseInternalQueue = flag.Bool("use-internal-queue", false, "Use internal queue instead of Redis") 41 | TaskUri = flag.String("task-uri", "", "tasks with uri") 42 | UseRedis = flag.Bool("use-redis", true, "use redis as queue backend") 43 | StatsdAddr = flag.String("statsd-addr", "127.0.0.1:8125", "statsd collector addr") 44 | StorePath = flag.String("store-path", "/tmp", "store pages to this dir") 45 | LastProxy = 0 46 | ) 47 | 48 | func init() { 49 | 50 | replace := func(groups []string, a slog.Attr) slog.Attr { 51 | // Remove time. 52 | if a.Key == slog.TimeKey && len(groups) == 0 { 53 | return slog.Attr{} 54 | } 55 | // Remove the directory from the source's filename. 
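// (Keeping only the base filename keeps log lines compact, since AddSource is enabled in LogOpts below.)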
56 | if a.Key == slog.SourceKey { 57 | source := a.Value.Any().(*slog.Source) 58 | source.File = filepath.Base(source.File) 59 | } 60 | return a 61 | } 62 | LogOpts := &slog.HandlerOptions{Level: slog.LevelInfo, AddSource: true, ReplaceAttr: replace} 63 | Log = slog.New(slog.NewTextHandler(os.Stdout, LogOpts)) 64 | } 65 | 66 | func GetMetricsPrefix() string { 67 | return "crawler" 68 | } 69 | 70 | func GetPrefix(useHttp bool) string { 71 | if useHttp { 72 | return "http://" 73 | } 74 | return "https://" 75 | } 76 | 77 | func GetBaseUrl(domain string, useHttp bool) string { 78 | return GetPrefix(useHttp) + *Domain 79 | } 80 | 81 | func main() { 82 | flag.Parse() 83 | Log.Info("start crawler", "version", version, "git", githash, "build-at", buildat) 84 | Log.Info("start-option", "use-internal-queue", *UseInternalQueue) 85 | Log.Info("start-option", "use-internal-cache", *UseInternalCache) 86 | 87 | statsdCfg := &statsd.ClientConfig{ 88 | Address: *StatsdAddr, 89 | Prefix: GetMetricsPrefix(), 90 | } 91 | 92 | statsdClient, err := statsd.NewClientWithConfig(statsdCfg) 93 | if err != nil { 94 | Log.Error("create statsd client error", "msg", err) 95 | } 96 | defer statsdClient.Close() 97 | 98 | Log.Info("use", "redis-addr", *RedisAddr, "base", *RedisBase, "password", *RedisPass) 99 | 100 | q := NewQueue(*UseInternalQueue) 101 | v := NewVisited(*UseInternalCache) 102 | 103 | Log.Info("use", "queue", "redis", "size", q.Size(context.Background())) 104 | 105 | StorePathForDomain := fmt.Sprintf("%s/%s", *StorePath, *Domain) 106 | 107 | baseUrl := GetBaseUrl(*Domain, *UseHttp) 108 | site := &Site{BaseUrl: baseUrl, MaxDepth: *MaxDepth, KeywordPrefix: "/db/"} 109 | task, err := site.NewTask("/", 0) 110 | if err != nil { 111 | Log.Error("error", "err", err) 112 | } 113 | q.Put(context.Background(), task) 114 | 115 | // if *TaskUri == "" { 116 | // task, err := site.NewTask("/", 0) 117 | // if err != nil { 118 | // Log.Error("error", "err", err) 119 | // } 120 | // q.Put(context.Background(), task) 121 | // } else { 122 | // // q.LoadFromFile(site, *TaskUri) 123 | // } 124 | 125 | if q.Size(context.Background()) == 0 { 126 | if task, err := site.NewTask("/", 0); err == nil { 127 | q.Put(context.Background(), task) 128 | // time.Sleep(1 * time.Second) 129 | } 130 | } 131 | 132 | crawler := NewCrawler(q, v, StorePathForDomain, statsdClient) 133 | crawler.statsd = statsdClient 134 | crawler.UserAgent = GetUserAgent(*UseGooglebot) 135 | 136 | Log.Info("crawler", "crwl", crawler, "base-url", baseUrl) 137 | 138 | crawler.Run(*MaxWorkers) 139 | } 140 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= 2 | github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= 3 | github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= 4 | github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= 5 | github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= 6 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= 7 | github.com/bsm/ginkgo/v2 v2.9.5 h1:rtVBYPs3+TC5iLUVOis1B9tjLTup7Cj5IfzosKtvTJ0= 8 | github.com/bsm/ginkgo/v2 v2.9.5/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= 9 | github.com/bsm/gomega v1.26.0 
h1:LhQm+AFcgV2M0WyKroMASzAzCAJVpAxQXv4SaI9a69Y= 10 | github.com/bsm/gomega v1.26.0/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= 11 | github.com/cactus/go-statsd-client/v5 v5.1.0 h1:sbbdfIl9PgisjEoXzvXI1lwUKWElngsjJKaZeC021P4= 12 | github.com/cactus/go-statsd-client/v5 v5.1.0/go.mod h1:COEvJ1E+/E2L4q6QE5CkjWPi4eeDw9maJBMIuMPBZbY= 13 | github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= 14 | github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 15 | github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= 16 | github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= 17 | github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= 18 | github.com/klauspost/compress v1.16.3 h1:XuJt9zzcnaz6a16/OU53ZjWp/v7/42WcR5t2a0PcNQY= 19 | github.com/klauspost/compress v1.16.3/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= 20 | github.com/redis/go-redis/v9 v9.1.0 h1:137FnGdk+EQdCbye1FW+qOEcY5S+SpY9T0NiuqvtfMY= 21 | github.com/redis/go-redis/v9 v9.1.0/go.mod h1:urWj3He21Dj5k4TK1y59xH8Uj6ATueP8AH1cY3lZl4c= 22 | github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= 23 | github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= 24 | github.com/valyala/fasthttp v1.48.0 h1:oJWvHb9BIZToTQS3MuQ2R3bJZiNSa2KiNdeI8A+79Tc= 25 | github.com/valyala/fasthttp v1.48.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA= 26 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 27 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 28 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 29 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 30 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 31 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 32 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 33 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 34 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 35 | golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14= 36 | golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= 37 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 38 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 39 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 40 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 41 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 42 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 43 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 44 | 
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 45 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 46 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 47 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 48 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 49 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 50 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 51 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 52 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 53 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 54 | golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc= 55 | golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 56 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 57 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 58 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 59 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 60 | -------------------------------------------------------------------------------- /crawler.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "net/http" 8 | "os" 9 | "strconv" 10 | "sync" 11 | "time" 12 | 13 | "github.com/cactus/go-statsd-client/v5/statsd" 14 | "github.com/valyala/fasthttp" 15 | "golang.org/x/net/html" 16 | ) 17 | 18 | type Crawler struct { 19 | TTL time.Duration 20 | UserAgent string 21 | Stack []*Task 22 | TaskChan chan *Task 23 | Wg sync.WaitGroup 24 | Q Queue 25 | V Visited 26 | P LinkParser 27 | DoSaveContent bool 28 | StorePath string 29 | statsd statsd.Statter 30 | Pool sync.Pool 31 | } 32 | 33 | func (c *Crawler) EnqeueMany(t *Task, ch <-chan *Link, wg *sync.WaitGroup) (int, int) { 34 | start := time.Now() 35 | defer c.statsd.TimingDuration("queue.enqueuemany", time.Since(start), 1.0, statsd.Tag{"domain", *Domain}) 36 | 37 | added := 0 38 | total := 0 39 | 40 | for link := range ch { 41 | total++ 42 | _, err := t.Site.NewTask(link.Href, t.Depth+1) 43 | 44 | if err != nil { 45 | wg.Done() 46 | continue 47 | } 48 | 49 | go func() { 50 | defer wg.Done() 51 | // c.Enqueue(nt) 52 | }() 53 | 54 | added++ 55 | } 56 | 57 | return added, total 58 | } 59 | 60 | func (c *Crawler) Crawl(WorkerNo int) { 61 | c.statsd.Inc("crawl.req.total", 1, 1.0, statsd.Tag{"domain", *Domain}) 62 | Log.Info("crawl.req.total", "qsize", c.Q.Size(context.Background())) 63 | start := time.Now() 64 | task, err := c.Q.Take(context.Background()) 65 | c.statsd.TimingDuration("queue.take", time.Since(start), 1.0, statsd.Tag{"domain", *Domain}) 66 | 67 | if err != nil { 68 | time.Sleep(1 * time.Second) 69 | Log.Error("error", "w", WorkerNo, "err", err, "qsize", c.Q.Size(context.Background())) 70 | c.statsd.Inc("err", 1, 1.0, statsd.Tag{"domain", *Domain}, statsd.Tag{"type", "queue-take"}) 71 | return 72 | } 73 | 74 | startCheck := time.Now() 75 | if c.V.Exists(context.Background(), task.Uri); err != 
nil { 76 | Log.Info("already visited", "uri", task.Uri) 77 | c.statsd.Inc("crawl.req.alreadyvisited", 1, 1.0, statsd.Tag{"domain", *Domain}) 78 | return 79 | } 80 | c.statsd.TimingDuration("cache.check", time.Since(startCheck), 1.0, statsd.Tag{"domain", *Domain}) 81 | 82 | Log.Info("crawl task", "w", WorkerNo, "task.uri", task.Uri, "task.depth", task.Depth, "qsize", c.Q.Size(context.Background())) 83 | 84 | reader, err := c.Get(task.GetUrl()) 85 | if err != nil { 86 | c.Q.TaskDone(context.Background()) 87 | Log.Error("error", "w", WorkerNo, "err", err) 88 | c.statsd.Inc("err", 1, 1.0, statsd.Tag{"domain", *Domain}, statsd.Tag{"type", "http-request"}) 89 | return 90 | } 91 | 92 | go func() { 93 | startPut := time.Now() 94 | c.V.Add(context.Background(), task.Uri) 95 | c.statsd.TimingDuration("cache.put", time.Since(startPut), 1.0, statsd.Tag{"domain", *Domain}) 96 | }() 97 | 98 | c.P.ParseLinks(reader, task) 99 | 100 | saveStart := time.Now() 101 | if c.DoSaveContent { 102 | go func() { 103 | dirPath := fmt.Sprintf("%s/%s", c.StorePath, task.GetSubTree()) 104 | err := os.MkdirAll(dirPath, os.ModePerm) 105 | if err != nil { 106 | Log.Error("error", "w", WorkerNo, "err", err) 107 | } 108 | 109 | f, err := os.Create(fmt.Sprintf("%s/%s", dirPath, task.GetFilename())) 110 | 111 | if err != nil { 112 | panic(err) 113 | } 114 | defer f.Close() 115 | 116 | reader.Seek(0, 0) 117 | bytesWrote, err := reader.WriteTo(f) 118 | if err != nil { 119 | panic(err) 120 | } 121 | Log.Info("save content", "w", WorkerNo, "path", c.StorePath, "bytes", bytesWrote) 122 | c.statsd.Inc("crawl.req.saved", 1, 1.0, statsd.Tag{"domain", *Domain}) 123 | }() 124 | } 125 | c.statsd.TimingDuration("save", time.Since(saveStart), 1.0, statsd.Tag{"domain", *Domain}) 126 | 127 | c.Q.TaskDone(context.Background()) 128 | c.statsd.TimingDuration("crawl", time.Since(start), 1.0, statsd.Tag{"domain", *Domain}) 129 | c.statsd.Inc("crawl.req.ok", 1, 1.0, statsd.Tag{"domain", *Domain}) 130 | Log.Info("task done", "w", WorkerNo, "exec-time", time.Since(start)) 131 | } 132 | 133 | func (c *Crawler) Get(Url string) (*bytes.Reader, error) { 134 | c.statsd.Inc("crawl.client.total", 1, 1.0, statsd.Tag{"domain", *Domain}) 135 | start := time.Now() 136 | req := fasthttp.AcquireRequest() 137 | 138 | req.SetRequestURI(Url) 139 | req.Header.SetMethod(fasthttp.MethodGet) 140 | req.Header.SetUserAgent(c.UserAgent) 141 | 142 | resp := fasthttp.AcquireResponse() 143 | 144 | client := c.Pool.Get().(*fasthttp.Client) 145 | err := client.DoRedirects(req, resp, 5) 146 | 147 | defer fasthttp.ReleaseRequest(req) 148 | defer fasthttp.ReleaseResponse(resp) 149 | 150 | if err != nil { 151 | c.statsd.Inc("crawl.client.err", 1, 1.0, statsd.Tag{"domain", *Domain}) 152 | return nil, err 153 | } 154 | 155 | c.statsd.Inc("crawl.client.code", 1, 1.0, statsd.Tag{"domain", *Domain}, statsd.Tag{"http_code", strconv.Itoa(resp.StatusCode())}) 156 | 157 | if resp.StatusCode() != http.StatusOK { 158 | return nil, fmt.Errorf("url=%s code=%d", Url, resp.StatusCode()) 159 | } 160 | 161 | c.statsd.TimingDuration("crawl.client", time.Since(start), 1.0, statsd.Tag{"domain", *Domain}) 162 | c.statsd.Inc("crawl.client.ok", 1, 1.0, statsd.Tag{"domain", *Domain}) 163 | return bytes.NewReader(resp.Body()), nil 164 | } 165 | 166 | func getHref(t html.Token) (ok bool, href string) { 167 | for _, a := range t.Attr { 168 | if a.Key == "href" { 169 | href = a.Val 170 | ok = true 171 | } 172 | } 173 | 174 | return 175 | } 176 | 177 | func (c *Crawler) Parse(reader *bytes.Reader, ch chan<- 
*Link, wg *sync.WaitGroup) []*Link { 178 | start := time.Now() 179 | defer c.statsd.Timing("crawl.parser", int64(time.Since(start)), 1.0, statsd.Tag{"domain", *Domain}) 180 | tokenizer := html.NewTokenizer(reader) 181 | links := make([]*Link, 0) 182 | var l *Link 183 | 184 | for { 185 | tt := tokenizer.Next() 186 | 187 | switch { 188 | case tt == html.ErrorToken: 189 | return links 190 | case tt == html.StartTagToken: 191 | t := tokenizer.Token() 192 | 193 | // Check if the token is an tag 194 | isAnchor := t.Data == "a" 195 | if !isAnchor { 196 | continue 197 | } 198 | 199 | l = NewLink() 200 | 201 | // Extract the href value, if there is one 202 | ok, url := getHref(t) 203 | if !ok { 204 | continue 205 | } 206 | 207 | l.Href = url 208 | case tt == html.TextToken: 209 | t := tokenizer.Token() 210 | if l != nil && t.Data != "" { 211 | l.Anchor = append(l.Anchor, t.Data) 212 | } 213 | case tt == html.EndTagToken: 214 | t := tokenizer.Token() 215 | isAnchor := t.Data == "a" 216 | if !isAnchor { 217 | continue 218 | } 219 | if l != nil { 220 | // ch <- l 221 | // wg.Add(1) 222 | links = append(links, l) 223 | l = nil 224 | } 225 | } 226 | } 227 | } 228 | 229 | func (c *Crawler) Run(MaxGoroutine int) { 230 | Log.Info("start crawler.run", "max", MaxGoroutine) 231 | 232 | guard := make(chan struct{}, MaxGoroutine) 233 | 234 | i := 0 235 | for c.Q.Size(context.Background()) > 0 { 236 | guard <- struct{}{} 237 | go func(no int) { 238 | c.Crawl(no) 239 | <-guard 240 | }(i) 241 | i++ 242 | if i >= *Limit { 243 | break 244 | } 245 | } 246 | 247 | Log.Info("stop crawler.run", "max", MaxGoroutine) 248 | } 249 | 250 | func (c *Crawler) Enqueue(t *Task) error { 251 | start := time.Now() 252 | defer c.statsd.TimingDuration("queue.enqueue", time.Since(start), 1.0, statsd.Tag{"domain", *Domain}) 253 | 254 | startCheck := time.Now() 255 | if c.V.Exists(context.Background(), t.Uri) { 256 | return fmt.Errorf("uri=%s already visited or enqueued", t.Uri) 257 | } 258 | c.statsd.TimingDuration("cache.check", time.Since(startCheck), 1.0, statsd.Tag{"domain", *Domain}) 259 | 260 | startPut := time.Now() 261 | c.V.Add(context.Background(), t.Uri) 262 | c.statsd.TimingDuration("cache.put", time.Since(startPut), 1.0, statsd.Tag{"domain", *Domain}) 263 | 264 | go func() { 265 | startPut = time.Now() 266 | c.Q.Put(context.Background(), t) 267 | c.statsd.TimingDuration("queue.put", time.Since(startPut), 1.0, statsd.Tag{"domain", *Domain}) 268 | c.statsd.Gauge("crawl.queue.size", int64(c.Q.Size(context.Background())), 1.0, statsd.Tag{"domain", *Domain}) 269 | }() 270 | 271 | return nil 272 | } 273 | 274 | func NewCrawler(Q Queue, V Visited, StorePath string, statsd statsd.Statter) *Crawler { 275 | ttl, _ := time.ParseDuration("30m") 276 | 277 | readTimeout, _ := time.ParseDuration("6000ms") 278 | writeTimeout, _ := time.ParseDuration("6000ms") 279 | maxIdleConnDuration, _ := time.ParseDuration("1h") 280 | maxConnWaitTimeout, _ := time.ParseDuration("6000ms") 281 | 282 | cwlr := &Crawler{ 283 | TTL: ttl, 284 | UserAgent: GetUserAgent(*UseGooglebot), 285 | // Clients: make([]*fasthttp.Client, len(proxies)), 286 | Stack: make([]*Task, 0), 287 | TaskChan: make(chan *Task), 288 | V: V, 289 | Q: Q, 290 | P: &HtmlLinkParser{q: Q, stats: statsd}, 291 | DoSaveContent: !*DoNotStore, 292 | StorePath: StorePath, 293 | Pool: sync.Pool{ 294 | New: func() interface{} { 295 | // proxy := proxies[LastProxy%len(proxies)] 296 | LastProxy++ 297 | // dial := fasthttpproxy.FasthttpHTTPDialerTimeout(proxy, writeTimeout) 298 | 299 | // if 
*DoNotUseProxy { 300 | dial := (&fasthttp.TCPDialer{ 301 | Concurrency: 4096, 302 | DNSCacheDuration: time.Hour, 303 | }).Dial 304 | // } 305 | return &fasthttp.Client{ 306 | ReadTimeout: readTimeout, 307 | WriteTimeout: writeTimeout, 308 | MaxIdleConnDuration: maxIdleConnDuration, 309 | MaxConnWaitTimeout: maxConnWaitTimeout, 310 | NoDefaultUserAgentHeader: true, 311 | DisableHeaderNamesNormalizing: true, 312 | DisablePathNormalizing: true, 313 | Dial: dial, 314 | } 315 | }, 316 | }, 317 | } 318 | 319 | // for pos, proxyStr := range proxies { 320 | // dial := fasthttpproxy.FasthttpHTTPDialerTimeout(proxyStr, writeTimeout) 321 | // if *DoNotUseProxy { 322 | // dial = (&fasthttp.TCPDialer{ 323 | // Concurrency: 4096, 324 | // DNSCacheDuration: time.Hour, 325 | // }).Dial 326 | // } 327 | // cwlr.Clients[pos] = &fasthttp.Client{ 328 | // ReadTimeout: readTimeout, 329 | // WriteTimeout: writeTimeout, 330 | // MaxIdleConnDuration: maxIdleConnDuration, 331 | // MaxConnWaitTimeout: maxConnWaitTimeout, 332 | // NoDefaultUserAgentHeader: true, 333 | // DisableHeaderNamesNormalizing: true, 334 | // DisablePathNormalizing: true, 335 | // Dial: dial, 336 | // } 337 | // } 338 | 339 | return cwlr 340 | } 341 | -------------------------------------------------------------------------------- /dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_GRAPHITE", 5 | "label": "Graphite", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "graphite", 9 | "pluginName": "Graphite" 10 | } 11 | ], 12 | "__elements": {}, 13 | "__requires": [ 14 | { 15 | "type": "grafana", 16 | "id": "grafana", 17 | "name": "Grafana", 18 | "version": "9.3.2" 19 | }, 20 | { 21 | "type": "datasource", 22 | "id": "graphite", 23 | "name": "Graphite", 24 | "version": "1.0.0" 25 | }, 26 | { 27 | "type": "panel", 28 | "id": "timeseries", 29 | "name": "Time series", 30 | "version": "" 31 | } 32 | ], 33 | "annotations": { 34 | "list": [ 35 | { 36 | "builtIn": 1, 37 | "datasource": { 38 | "type": "grafana", 39 | "uid": "-- Grafana --" 40 | }, 41 | "enable": true, 42 | "hide": true, 43 | "iconColor": "rgba(0, 211, 255, 1)", 44 | "name": "Annotations & Alerts", 45 | "target": { 46 | "limit": 100, 47 | "matchAny": false, 48 | "tags": [], 49 | "type": "dashboard" 50 | }, 51 | "type": "dashboard" 52 | } 53 | ] 54 | }, 55 | "editable": true, 56 | "fiscalYearStartMonth": 0, 57 | "graphTooltip": 0, 58 | "id": null, 59 | "links": [], 60 | "liveNow": false, 61 | "panels": [ 62 | { 63 | "datasource": { 64 | "type": "graphite", 65 | "uid": "${DS_GRAPHITE}" 66 | }, 67 | "fieldConfig": { 68 | "defaults": { 69 | "color": { 70 | "mode": "palette-classic" 71 | }, 72 | "custom": { 73 | "axisCenteredZero": false, 74 | "axisColorMode": "text", 75 | "axisLabel": "", 76 | "axisPlacement": "auto", 77 | "barAlignment": 0, 78 | "drawStyle": "line", 79 | "fillOpacity": 0, 80 | "gradientMode": "none", 81 | "hideFrom": { 82 | "legend": false, 83 | "tooltip": false, 84 | "viz": false 85 | }, 86 | "lineInterpolation": "linear", 87 | "lineWidth": 1, 88 | "pointSize": 5, 89 | "scaleDistribution": { 90 | "type": "linear" 91 | }, 92 | "showPoints": "auto", 93 | "spanNulls": false, 94 | "stacking": { 95 | "group": "A", 96 | "mode": "none" 97 | }, 98 | "thresholdsStyle": { 99 | "mode": "off" 100 | } 101 | }, 102 | "mappings": [], 103 | "thresholds": { 104 | "mode": "absolute", 105 | "steps": [ 106 | { 107 | "color": "green", 108 | "value": null 109 | }, 110 | { 111 | "color": 
"red", 112 | "value": 80 113 | } 114 | ] 115 | } 116 | }, 117 | "overrides": [] 118 | }, 119 | "gridPos": { 120 | "h": 9, 121 | "w": 8, 122 | "x": 0, 123 | "y": 0 124 | }, 125 | "id": 4, 126 | "options": { 127 | "legend": { 128 | "calcs": [], 129 | "displayMode": "list", 130 | "placement": "bottom", 131 | "showLegend": true 132 | }, 133 | "tooltip": { 134 | "mode": "single", 135 | "sort": "none" 136 | } 137 | }, 138 | "targets": [ 139 | { 140 | "datasource": { 141 | "type": "graphite", 142 | "uid": "${DS_GRAPHITE}" 143 | }, 144 | "refCount": 0, 145 | "refId": "A", 146 | "target": "seriesByTag('name=stats.timers.crawler.crawl.std')" 147 | } 148 | ], 149 | "title": "Crawl timer", 150 | "type": "timeseries" 151 | }, 152 | { 153 | "datasource": { 154 | "type": "graphite", 155 | "uid": "${DS_GRAPHITE}" 156 | }, 157 | "fieldConfig": { 158 | "defaults": { 159 | "color": { 160 | "mode": "palette-classic" 161 | }, 162 | "custom": { 163 | "axisCenteredZero": false, 164 | "axisColorMode": "text", 165 | "axisLabel": "", 166 | "axisPlacement": "auto", 167 | "barAlignment": 0, 168 | "drawStyle": "line", 169 | "fillOpacity": 0, 170 | "gradientMode": "none", 171 | "hideFrom": { 172 | "legend": false, 173 | "tooltip": false, 174 | "viz": false 175 | }, 176 | "lineInterpolation": "linear", 177 | "lineWidth": 1, 178 | "pointSize": 5, 179 | "scaleDistribution": { 180 | "type": "linear" 181 | }, 182 | "showPoints": "auto", 183 | "spanNulls": false, 184 | "stacking": { 185 | "group": "A", 186 | "mode": "none" 187 | }, 188 | "thresholdsStyle": { 189 | "mode": "off" 190 | } 191 | }, 192 | "mappings": [], 193 | "thresholds": { 194 | "mode": "absolute", 195 | "steps": [ 196 | { 197 | "color": "green", 198 | "value": null 199 | }, 200 | { 201 | "color": "red", 202 | "value": 80 203 | } 204 | ] 205 | } 206 | }, 207 | "overrides": [] 208 | }, 209 | "gridPos": { 210 | "h": 9, 211 | "w": 7, 212 | "x": 8, 213 | "y": 0 214 | }, 215 | "id": 12, 216 | "options": { 217 | "legend": { 218 | "calcs": [], 219 | "displayMode": "list", 220 | "placement": "bottom", 221 | "showLegend": true 222 | }, 223 | "tooltip": { 224 | "mode": "single", 225 | "sort": "none" 226 | } 227 | }, 228 | "targets": [ 229 | { 230 | "datasource": { 231 | "type": "graphite", 232 | "uid": "${DS_GRAPHITE}" 233 | }, 234 | "refCount": 0, 235 | "refId": "A", 236 | "target": "seriesByTag('name=stats.timers.crawler.crawl.client.std')" 237 | } 238 | ], 239 | "title": "Http Client Timers", 240 | "type": "timeseries" 241 | }, 242 | { 243 | "datasource": { 244 | "type": "graphite", 245 | "uid": "${DS_GRAPHITE}" 246 | }, 247 | "fieldConfig": { 248 | "defaults": { 249 | "color": { 250 | "mode": "palette-classic" 251 | }, 252 | "custom": { 253 | "axisCenteredZero": false, 254 | "axisColorMode": "text", 255 | "axisLabel": "", 256 | "axisPlacement": "auto", 257 | "barAlignment": 0, 258 | "drawStyle": "line", 259 | "fillOpacity": 0, 260 | "gradientMode": "none", 261 | "hideFrom": { 262 | "legend": false, 263 | "tooltip": false, 264 | "viz": false 265 | }, 266 | "lineInterpolation": "linear", 267 | "lineWidth": 1, 268 | "pointSize": 5, 269 | "scaleDistribution": { 270 | "type": "linear" 271 | }, 272 | "showPoints": "auto", 273 | "spanNulls": false, 274 | "stacking": { 275 | "group": "A", 276 | "mode": "none" 277 | }, 278 | "thresholdsStyle": { 279 | "mode": "off" 280 | } 281 | }, 282 | "mappings": [], 283 | "thresholds": { 284 | "mode": "absolute", 285 | "steps": [ 286 | { 287 | "color": "green", 288 | "value": null 289 | }, 290 | { 291 | "color": "red", 292 | "value": 80 
293 | } 294 | ] 295 | } 296 | }, 297 | "overrides": [] 298 | }, 299 | "gridPos": { 300 | "h": 9, 301 | "w": 9, 302 | "x": 15, 303 | "y": 0 304 | }, 305 | "id": 11, 306 | "options": { 307 | "legend": { 308 | "calcs": [], 309 | "displayMode": "list", 310 | "placement": "bottom", 311 | "showLegend": true 312 | }, 313 | "tooltip": { 314 | "mode": "single", 315 | "sort": "none" 316 | } 317 | }, 318 | "targets": [ 319 | { 320 | "datasource": { 321 | "type": "graphite", 322 | "uid": "${DS_GRAPHITE}" 323 | }, 324 | "refCount": 0, 325 | "refId": "A", 326 | "target": "seriesByTag('name=stats.timers.crawler.crawl.parser.std')" 327 | }, 328 | { 329 | "datasource": { 330 | "type": "graphite", 331 | "uid": "${DS_GRAPHITE}" 332 | }, 333 | "hide": false, 334 | "refCount": 0, 335 | "refId": "C", 336 | "target": "seriesByTag('name=stats.timers.crawler.crawl.parser.upper_90')", 337 | "textEditor": false 338 | } 339 | ], 340 | "title": "Html Parser Timer", 341 | "type": "timeseries" 342 | }, 343 | { 344 | "datasource": { 345 | "type": "graphite", 346 | "uid": "${DS_GRAPHITE}" 347 | }, 348 | "fieldConfig": { 349 | "defaults": { 350 | "color": { 351 | "mode": "palette-classic" 352 | }, 353 | "custom": { 354 | "axisCenteredZero": false, 355 | "axisColorMode": "text", 356 | "axisLabel": "", 357 | "axisPlacement": "auto", 358 | "barAlignment": 0, 359 | "drawStyle": "line", 360 | "fillOpacity": 0, 361 | "gradientMode": "none", 362 | "hideFrom": { 363 | "legend": false, 364 | "tooltip": false, 365 | "viz": false 366 | }, 367 | "lineInterpolation": "linear", 368 | "lineWidth": 1, 369 | "pointSize": 5, 370 | "scaleDistribution": { 371 | "type": "linear" 372 | }, 373 | "showPoints": "auto", 374 | "spanNulls": false, 375 | "stacking": { 376 | "group": "A", 377 | "mode": "none" 378 | }, 379 | "thresholdsStyle": { 380 | "mode": "off" 381 | } 382 | }, 383 | "mappings": [], 384 | "thresholds": { 385 | "mode": "absolute", 386 | "steps": [ 387 | { 388 | "color": "green", 389 | "value": null 390 | }, 391 | { 392 | "color": "red", 393 | "value": 80 394 | } 395 | ] 396 | } 397 | }, 398 | "overrides": [] 399 | }, 400 | "gridPos": { 401 | "h": 9, 402 | "w": 7, 403 | "x": 0, 404 | "y": 9 405 | }, 406 | "id": 3, 407 | "options": { 408 | "legend": { 409 | "calcs": [], 410 | "displayMode": "list", 411 | "placement": "bottom", 412 | "showLegend": true 413 | }, 414 | "tooltip": { 415 | "mode": "single", 416 | "sort": "none" 417 | } 418 | }, 419 | "targets": [ 420 | { 421 | "datasource": { 422 | "type": "graphite", 423 | "uid": "${DS_GRAPHITE}" 424 | }, 425 | "refCount": 0, 426 | "refId": "A", 427 | "target": "seriesByTag('name=stats.crawler.crawl.client.code', 'domain=habr.com')" 428 | } 429 | ], 430 | "title": "Requests http codes", 431 | "type": "timeseries" 432 | }, 433 | { 434 | "datasource": { 435 | "type": "graphite", 436 | "uid": "${DS_GRAPHITE}" 437 | }, 438 | "fieldConfig": { 439 | "defaults": { 440 | "color": { 441 | "mode": "palette-classic" 442 | }, 443 | "custom": { 444 | "axisCenteredZero": false, 445 | "axisColorMode": "text", 446 | "axisLabel": "", 447 | "axisPlacement": "auto", 448 | "barAlignment": 0, 449 | "drawStyle": "line", 450 | "fillOpacity": 0, 451 | "gradientMode": "none", 452 | "hideFrom": { 453 | "legend": false, 454 | "tooltip": false, 455 | "viz": false 456 | }, 457 | "lineInterpolation": "linear", 458 | "lineWidth": 1, 459 | "pointSize": 5, 460 | "scaleDistribution": { 461 | "type": "linear" 462 | }, 463 | "showPoints": "auto", 464 | "spanNulls": false, 465 | "stacking": { 466 | "group": "A", 467 | "mode": 
"none" 468 | }, 469 | "thresholdsStyle": { 470 | "mode": "off" 471 | } 472 | }, 473 | "mappings": [], 474 | "thresholds": { 475 | "mode": "absolute", 476 | "steps": [ 477 | { 478 | "color": "green", 479 | "value": null 480 | }, 481 | { 482 | "color": "red", 483 | "value": 80 484 | } 485 | ] 486 | } 487 | }, 488 | "overrides": [] 489 | }, 490 | "gridPos": { 491 | "h": 9, 492 | "w": 8, 493 | "x": 7, 494 | "y": 9 495 | }, 496 | "id": 6, 497 | "options": { 498 | "legend": { 499 | "calcs": [], 500 | "displayMode": "list", 501 | "placement": "bottom", 502 | "showLegend": true 503 | }, 504 | "tooltip": { 505 | "mode": "single", 506 | "sort": "none" 507 | } 508 | }, 509 | "targets": [ 510 | { 511 | "datasource": { 512 | "type": "graphite", 513 | "uid": "${DS_GRAPHITE}" 514 | }, 515 | "refCount": 0, 516 | "refId": "A", 517 | "target": "seriesByTag('name=stats.timers.crawler.queue.take.std')" 518 | }, 519 | { 520 | "datasource": { 521 | "type": "graphite", 522 | "uid": "${DS_GRAPHITE}" 523 | }, 524 | "hide": false, 525 | "refCount": 0, 526 | "refId": "C", 527 | "target": "seriesByTag('name=stats.timers.crawler.queue.take.upper_90')", 528 | "textEditor": true 529 | } 530 | ], 531 | "title": "Queue Take timers", 532 | "type": "timeseries" 533 | }, 534 | { 535 | "datasource": { 536 | "type": "graphite", 537 | "uid": "${DS_GRAPHITE}" 538 | }, 539 | "fieldConfig": { 540 | "defaults": { 541 | "color": { 542 | "mode": "palette-classic" 543 | }, 544 | "custom": { 545 | "axisCenteredZero": false, 546 | "axisColorMode": "text", 547 | "axisLabel": "", 548 | "axisPlacement": "auto", 549 | "barAlignment": 0, 550 | "drawStyle": "line", 551 | "fillOpacity": 0, 552 | "gradientMode": "none", 553 | "hideFrom": { 554 | "legend": false, 555 | "tooltip": false, 556 | "viz": false 557 | }, 558 | "lineInterpolation": "linear", 559 | "lineWidth": 1, 560 | "pointSize": 5, 561 | "scaleDistribution": { 562 | "type": "linear" 563 | }, 564 | "showPoints": "auto", 565 | "spanNulls": false, 566 | "stacking": { 567 | "group": "A", 568 | "mode": "none" 569 | }, 570 | "thresholdsStyle": { 571 | "mode": "off" 572 | } 573 | }, 574 | "mappings": [], 575 | "thresholds": { 576 | "mode": "absolute", 577 | "steps": [ 578 | { 579 | "color": "green", 580 | "value": null 581 | }, 582 | { 583 | "color": "red", 584 | "value": 80 585 | } 586 | ] 587 | } 588 | }, 589 | "overrides": [] 590 | }, 591 | "gridPos": { 592 | "h": 9, 593 | "w": 9, 594 | "x": 15, 595 | "y": 9 596 | }, 597 | "id": 5, 598 | "options": { 599 | "legend": { 600 | "calcs": [], 601 | "displayMode": "list", 602 | "placement": "bottom", 603 | "showLegend": true 604 | }, 605 | "tooltip": { 606 | "mode": "single", 607 | "sort": "none" 608 | } 609 | }, 610 | "targets": [ 611 | { 612 | "datasource": { 613 | "type": "graphite", 614 | "uid": "${DS_GRAPHITE}" 615 | }, 616 | "refCount": 0, 617 | "refId": "A", 618 | "target": "seriesByTag('name=stats.timers.crawler.queue.put.std')" 619 | }, 620 | { 621 | "datasource": { 622 | "type": "graphite", 623 | "uid": "${DS_GRAPHITE}" 624 | }, 625 | "hide": false, 626 | "refCount": 0, 627 | "refId": "C", 628 | "target": "seriesByTag('name=stats.timers.crawler.queue.put.upper_90')" 629 | } 630 | ], 631 | "title": "Queue put timer", 632 | "type": "timeseries" 633 | }, 634 | { 635 | "datasource": { 636 | "type": "graphite", 637 | "uid": "${DS_GRAPHITE}" 638 | }, 639 | "fieldConfig": { 640 | "defaults": { 641 | "color": { 642 | "mode": "palette-classic" 643 | }, 644 | "custom": { 645 | "axisCenteredZero": false, 646 | "axisColorMode": "text", 647 | 
"axisLabel": "", 648 | "axisPlacement": "auto", 649 | "barAlignment": 0, 650 | "drawStyle": "line", 651 | "fillOpacity": 0, 652 | "gradientMode": "none", 653 | "hideFrom": { 654 | "legend": false, 655 | "tooltip": false, 656 | "viz": false 657 | }, 658 | "lineInterpolation": "linear", 659 | "lineWidth": 1, 660 | "pointSize": 5, 661 | "scaleDistribution": { 662 | "type": "linear" 663 | }, 664 | "showPoints": "auto", 665 | "spanNulls": false, 666 | "stacking": { 667 | "group": "A", 668 | "mode": "none" 669 | }, 670 | "thresholdsStyle": { 671 | "mode": "off" 672 | } 673 | }, 674 | "mappings": [], 675 | "thresholds": { 676 | "mode": "absolute", 677 | "steps": [ 678 | { 679 | "color": "green", 680 | "value": null 681 | }, 682 | { 683 | "color": "red", 684 | "value": 80 685 | } 686 | ] 687 | } 688 | }, 689 | "overrides": [] 690 | }, 691 | "gridPos": { 692 | "h": 9, 693 | "w": 12, 694 | "x": 0, 695 | "y": 18 696 | }, 697 | "id": 7, 698 | "options": { 699 | "legend": { 700 | "calcs": [], 701 | "displayMode": "list", 702 | "placement": "bottom", 703 | "showLegend": true 704 | }, 705 | "tooltip": { 706 | "mode": "single", 707 | "sort": "none" 708 | } 709 | }, 710 | "targets": [ 711 | { 712 | "datasource": { 713 | "type": "graphite", 714 | "uid": "${DS_GRAPHITE}" 715 | }, 716 | "refCount": 0, 717 | "refId": "A", 718 | "target": "seriesByTag('name=stats.timers.crawler.queue.enqueue.std')" 719 | }, 720 | { 721 | "datasource": { 722 | "type": "graphite", 723 | "uid": "${DS_GRAPHITE}" 724 | }, 725 | "hide": false, 726 | "refCount": 0, 727 | "refId": "C", 728 | "target": "seriesByTag('name=stats.timers.crawler.queue.enqueue.upper_90')", 729 | "textEditor": true 730 | } 731 | ], 732 | "title": "Queue Enqueue Timer", 733 | "type": "timeseries" 734 | }, 735 | { 736 | "datasource": { 737 | "type": "graphite", 738 | "uid": "${DS_GRAPHITE}" 739 | }, 740 | "fieldConfig": { 741 | "defaults": { 742 | "color": { 743 | "mode": "palette-classic" 744 | }, 745 | "custom": { 746 | "axisCenteredZero": false, 747 | "axisColorMode": "text", 748 | "axisLabel": "", 749 | "axisPlacement": "auto", 750 | "barAlignment": 0, 751 | "drawStyle": "line", 752 | "fillOpacity": 0, 753 | "gradientMode": "none", 754 | "hideFrom": { 755 | "legend": false, 756 | "tooltip": false, 757 | "viz": false 758 | }, 759 | "lineInterpolation": "linear", 760 | "lineWidth": 1, 761 | "pointSize": 5, 762 | "scaleDistribution": { 763 | "type": "linear" 764 | }, 765 | "showPoints": "auto", 766 | "spanNulls": false, 767 | "stacking": { 768 | "group": "A", 769 | "mode": "none" 770 | }, 771 | "thresholdsStyle": { 772 | "mode": "off" 773 | } 774 | }, 775 | "mappings": [], 776 | "thresholds": { 777 | "mode": "absolute", 778 | "steps": [ 779 | { 780 | "color": "green", 781 | "value": null 782 | }, 783 | { 784 | "color": "red", 785 | "value": 80 786 | } 787 | ] 788 | } 789 | }, 790 | "overrides": [] 791 | }, 792 | "gridPos": { 793 | "h": 9, 794 | "w": 12, 795 | "x": 12, 796 | "y": 18 797 | }, 798 | "id": 8, 799 | "options": { 800 | "legend": { 801 | "calcs": [], 802 | "displayMode": "list", 803 | "placement": "bottom", 804 | "showLegend": true 805 | }, 806 | "tooltip": { 807 | "mode": "single", 808 | "sort": "none" 809 | } 810 | }, 811 | "targets": [ 812 | { 813 | "datasource": { 814 | "type": "graphite", 815 | "uid": "${DS_GRAPHITE}" 816 | }, 817 | "refCount": 0, 818 | "refId": "A", 819 | "target": "seriesByTag('name=stats.timers.crawler.queue.enqueuemany.std')" 820 | }, 821 | { 822 | "datasource": { 823 | "type": "graphite", 824 | "uid": "${DS_GRAPHITE}" 825 | 
}, 826 | "hide": false, 827 | "refCount": 0, 828 | "refId": "C", 829 | "target": "seriesByTag('name=stats.timers.crawler.queue.enqueuemany.upper_90')", 830 | "textEditor": true 831 | } 832 | ], 833 | "title": "Queue EnqueueMany Timer", 834 | "type": "timeseries" 835 | }, 836 | { 837 | "datasource": { 838 | "type": "graphite", 839 | "uid": "${DS_GRAPHITE}" 840 | }, 841 | "fieldConfig": { 842 | "defaults": { 843 | "color": { 844 | "mode": "palette-classic" 845 | }, 846 | "custom": { 847 | "axisCenteredZero": false, 848 | "axisColorMode": "text", 849 | "axisLabel": "", 850 | "axisPlacement": "auto", 851 | "barAlignment": 0, 852 | "drawStyle": "line", 853 | "fillOpacity": 0, 854 | "gradientMode": "none", 855 | "hideFrom": { 856 | "legend": false, 857 | "tooltip": false, 858 | "viz": false 859 | }, 860 | "lineInterpolation": "linear", 861 | "lineWidth": 1, 862 | "pointSize": 5, 863 | "scaleDistribution": { 864 | "type": "linear" 865 | }, 866 | "showPoints": "auto", 867 | "spanNulls": false, 868 | "stacking": { 869 | "group": "A", 870 | "mode": "none" 871 | }, 872 | "thresholdsStyle": { 873 | "mode": "off" 874 | } 875 | }, 876 | "mappings": [], 877 | "thresholds": { 878 | "mode": "absolute", 879 | "steps": [ 880 | { 881 | "color": "green", 882 | "value": null 883 | }, 884 | { 885 | "color": "red", 886 | "value": 80 887 | } 888 | ] 889 | } 890 | }, 891 | "overrides": [] 892 | }, 893 | "gridPos": { 894 | "h": 9, 895 | "w": 12, 896 | "x": 0, 897 | "y": 27 898 | }, 899 | "id": 9, 900 | "options": { 901 | "legend": { 902 | "calcs": [], 903 | "displayMode": "list", 904 | "placement": "bottom", 905 | "showLegend": true 906 | }, 907 | "tooltip": { 908 | "mode": "single", 909 | "sort": "none" 910 | } 911 | }, 912 | "targets": [ 913 | { 914 | "datasource": { 915 | "type": "graphite", 916 | "uid": "${DS_GRAPHITE}" 917 | }, 918 | "refCount": 0, 919 | "refId": "A", 920 | "target": "seriesByTag('name=stats.timers.crawler.cache.check.std')" 921 | }, 922 | { 923 | "datasource": { 924 | "type": "graphite", 925 | "uid": "${DS_GRAPHITE}" 926 | }, 927 | "hide": false, 928 | "refCount": 0, 929 | "refId": "C", 930 | "target": "seriesByTag('name=stats.timers.crawler.cache.check.upper_90')", 931 | "textEditor": false 932 | } 933 | ], 934 | "title": "Cache check Timer", 935 | "type": "timeseries" 936 | }, 937 | { 938 | "datasource": { 939 | "type": "graphite", 940 | "uid": "${DS_GRAPHITE}" 941 | }, 942 | "fieldConfig": { 943 | "defaults": { 944 | "color": { 945 | "mode": "palette-classic" 946 | }, 947 | "custom": { 948 | "axisCenteredZero": false, 949 | "axisColorMode": "text", 950 | "axisLabel": "", 951 | "axisPlacement": "auto", 952 | "barAlignment": 0, 953 | "drawStyle": "line", 954 | "fillOpacity": 0, 955 | "gradientMode": "none", 956 | "hideFrom": { 957 | "legend": false, 958 | "tooltip": false, 959 | "viz": false 960 | }, 961 | "lineInterpolation": "linear", 962 | "lineWidth": 1, 963 | "pointSize": 5, 964 | "scaleDistribution": { 965 | "type": "linear" 966 | }, 967 | "showPoints": "auto", 968 | "spanNulls": false, 969 | "stacking": { 970 | "group": "A", 971 | "mode": "none" 972 | }, 973 | "thresholdsStyle": { 974 | "mode": "off" 975 | } 976 | }, 977 | "mappings": [], 978 | "thresholds": { 979 | "mode": "absolute", 980 | "steps": [ 981 | { 982 | "color": "green", 983 | "value": null 984 | }, 985 | { 986 | "color": "red", 987 | "value": 80 988 | } 989 | ] 990 | } 991 | }, 992 | "overrides": [] 993 | }, 994 | "gridPos": { 995 | "h": 9, 996 | "w": 12, 997 | "x": 12, 998 | "y": 27 999 | }, 1000 | "id": 10, 1001 | 
"options": { 1002 | "legend": { 1003 | "calcs": [], 1004 | "displayMode": "list", 1005 | "placement": "bottom", 1006 | "showLegend": true 1007 | }, 1008 | "tooltip": { 1009 | "mode": "single", 1010 | "sort": "none" 1011 | } 1012 | }, 1013 | "targets": [ 1014 | { 1015 | "datasource": { 1016 | "type": "graphite", 1017 | "uid": "${DS_GRAPHITE}" 1018 | }, 1019 | "refCount": 0, 1020 | "refId": "A", 1021 | "target": "seriesByTag('name=stats.timers.crawler.cache.put.std')" 1022 | }, 1023 | { 1024 | "datasource": { 1025 | "type": "graphite", 1026 | "uid": "${DS_GRAPHITE}" 1027 | }, 1028 | "hide": false, 1029 | "refCount": 0, 1030 | "refId": "C", 1031 | "target": "seriesByTag('name=stats.timers.crawler.cache.put.upper_90')", 1032 | "textEditor": false 1033 | } 1034 | ], 1035 | "title": "Cache put Timer", 1036 | "type": "timeseries" 1037 | } 1038 | ], 1039 | "refresh": "5s", 1040 | "schemaVersion": 37, 1041 | "style": "dark", 1042 | "tags": [], 1043 | "templating": { 1044 | "list": [ 1045 | { 1046 | "current": {}, 1047 | "datasource": { 1048 | "type": "graphite", 1049 | "uid": "${DS_GRAPHITE}" 1050 | }, 1051 | "definition": "", 1052 | "error": {}, 1053 | "hide": 0, 1054 | "includeAll": false, 1055 | "multi": false, 1056 | "name": "query0", 1057 | "options": [], 1058 | "query": { 1059 | "queryType": "Value", 1060 | "refId": "A", 1061 | "target": "" 1062 | }, 1063 | "refresh": 1, 1064 | "regex": "", 1065 | "skipUrlSync": false, 1066 | "sort": 0, 1067 | "type": "query" 1068 | } 1069 | ] 1070 | }, 1071 | "time": { 1072 | "from": "now-30m", 1073 | "to": "now" 1074 | }, 1075 | "timepicker": {}, 1076 | "timezone": "", 1077 | "title": "Crwl Vs Eng", 1078 | "uid": "hFBRYAkSk", 1079 | "version": 17, 1080 | "weekStart": "" 1081 | } --------------------------------------------------------------------------------