├── .gitignore ├── README.md ├── builder └── default │ └── builder.go ├── cookie └── cookie.go ├── counter └── counter.go ├── fetcher ├── fetcher.go └── options.go ├── go.mod ├── go.sum ├── health └── health.go ├── parser └── parser.go ├── proxy └── proxy.go ├── request ├── request.go └── request_test.go ├── scheduler ├── nsq │ ├── nsq.go │ └── option.go └── scheduler.go ├── store ├── mongo │ ├── buffered.go │ └── unbuffered.go └── store.go ├── ua ├── ua.go └── ua_list.go ├── visit ├── redis │ └── redis.go └── visit.go └── worker ├── option.go ├── signal_test.go ├── signals.go └── worker.go /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | 23 | .vscode 24 | 25 | cmd 26 | 27 | main.go 28 | 29 | DESIGN 30 | 31 | 32 | examples
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------

# gocrawler

gocrawler is a lightweight distributed crawler framework. It lets you build high-performance crawlers quickly (producer–consumer pattern), and because it strictly follows an interface-oriented design, every component can be extended or replaced with little effort.

- [gocrawler](#gocrawler)
  - [Quick Start](#quick-start)
    - [Basic Implementation](#basic-implementation)
    - [Parsing and Submitting More Requests](#parsing-and-submitting-more-requests)
  - [Custom Components](#custom-components)
    - [Replacing the Fetcher](#replacing-the-fetcher)
      - [Custom Proxy](#custom-proxy)
      - [Custom Request Headers](#custom-request-headers)
      - [Custom User-Agent](#custom-user-agent)
    - [Replacing the Storage Component](#replacing-the-storage-component)
    - [Other Components](#other-components)
      - [Request Deduplication](#request-deduplication)
      - [Task Counting](#task-counting)
      - [Rate Limiting](#rate-limiting)
  - [Error Handling and Lifecycle Hooks](#error-handling-and-lifecycle-hooks)
    - [When to Communicate](#when-to-communicate)
    - [How to Communicate](#how-to-communicate)
  - [Advanced](#advanced)
    - [Sending Requests to Another Worker](#sending-requests-to-another-worker)
      - [Step 1: Replace the Default Scheduler](#step-1-replace-the-default-scheduler)
      - [Step 2: Send Requests to the secondScheduler](#step-2-send-requests-to-the-secondscheduler)
    - [Task Priority](#task-priority)
    - [Dev Mode](#dev-mode)
  - [References](#references)


## Quick Start
### Basic Implementation
gocrawler's builder makes it easy to put together a distributed crawler. As an example, we will use it to scrape blog metadata from [zyte](https://www.zyte.com/blog/).
Before running the example, make sure the following dependencies are installed and reachable:
- [nsq](https://nsq.io/)
- [mongodb](https://www.mongodb.com/)

> gocrawler itself is not tied to nsq as its message queue, nor to mongodb as its storage backend; how to replace both is covered later.

Our goal is to collect the basic information of every blog post on the [zyte site](https://www.zyte.com/blog), including:
- title
- author
- read time
- publish time

> We only scrape the list items here; fetching the detail page of each list item while crawling the list is covered later.

First create a project:
```
mkdir zyte
```
Then initialize the module:
```
go mod init zyte
```
Next, create a `parser` directory under `zyte` and write the parse function:
```go
//parser/parser.go
package parser

import (
	"context"
	"net/http"

	"github.com/PuerkitoBio/goquery"
	"github.com/superjcd/gocrawler/parser"
)

type zyteParser struct{}

func NewZyteParser() *zyteParser {
	return &zyteParser{}
}

func (p *zyteParser) Parse(ctx context.Context, r *http.Response) (*parser.ParseResult, error) {
	doc, err := goquery.NewDocumentFromReader(r.Body)
	if err != nil {
		return nil, err
	}
	result := &parser.ParseResult{}
	resultItems := make([]parser.ParseItem, 0)

	doc.Find("div.CardResource_card__BhCok").Each(
		func(i int, s *goquery.Selection) {
			item := parser.ParseItem{}
			item["title"] = s.Find("div.free-text").Text()
			item["author"] = s.Find("div:nth-child(3) > div:nth-child(1) > span:nth-child(2)").Text()
			item["read_time"] = s.Find("div:nth-child(3) > div:nth-child(2) > span:nth-child(2)").Text()
			item["post_time"] = s.Find("div:nth-child(4) > div:nth-child(1) > span:nth-child(2)").Text()
			resultItems = append(resultItems, item)
		},
	)
	result.Items = resultItems

	return result, nil
}

```
> [goquery](https://github.com/PuerkitoBio/goquery) is recommended for building page-parsing components.

Next, build the first crawler in `main.go`:
```go
// main.go
package main

import (
	default_builder "github.com/superjcd/gocrawler/builder/default"
	"github.com/superjcd/gocrawler_examples/zyte/parser"
)

func main() {
	config := default_builder.DefaultWorkerBuilderConfig{}
	worker := config.Name("zyte").MaxRunTime(300).Workers(10).LimitRate(10).Build(parser.NewZyteParser())
	worker.Run()
}
```
> `MaxRunTime` sets how long the worker runs (in seconds); `Workers` sets the concurrency; `LimitRate` caps the number of requests per second.

Run `go run .` in the directory containing `main.go` and the crawler starts. Of course, for the worker to actually do anything we have to feed it some tasks.
Write the task-submission logic (the producer) in `pub/main.go`:
```go
package main

import (
	"fmt"
	"log"

	"github.com/gofrs/uuid"
	"github.com/superjcd/gocrawler/request"
	"github.com/superjcd/gocrawler/scheduler"
	"github.com/superjcd/gocrawler/scheduler/nsq"
)

func main() {
	s := nsq.NewNsqScheduler("zyte", "default", "127.0.0.1:4150", "127.0.0.1:4161")
	pages := []int{}
	for i := 1; i < 10; i++ {
		pages = append(pages, i)
	}
	uid, err := uuid.NewV4()
	if err != nil {
		panic(err)
	}
	log.Printf("taskId: %s", uid.String())

	for _, pg := range pages {
		data := make(map[string]string, 0)
		data["taskId"] = uid.String()
		url := fmt.Sprintf("https://www.zyte.com/blog/page/%d", pg)
		fmt.Println(url)
		req := request.Request{
			URL:    url,
			Method: "GET",
			Data:   data,
		}
		s.Push(scheduler.TYP_PUSH_SCHEDULER, &req)

	}
}
```
Open a new terminal and run `go run ./pub`. In the terminal running the worker you will see log lines showing the target pages being parsed and written to mongodb.
Check the `default` collection of the `zyte` database in your local mongodb and you will find the list data we scraped.


### Parsing and Submitting More Requests
The example above has one big flaw: the producer explicitly submits the pages to crawl one by one — nine requests in our case. In real scenarios the number of requests is often not known up front; ideally the crawler should discover it on its own, for example by parsing the maximum page number from the first page and automatically submitting the remaining requests.
This is easy to do in gocrawler, because the `*parser.ParseResult` produced by a Parser's `Parse` function can also carry Request objects, and gocrawler submits those parsed-out Requests for you.
> Two follow-up questions arise here: how to filter duplicate requests, and how to discover target urls automatically (something like a URL matcher). For the former, gocrawler can add a Visit component to filter urls already crawled within a given time window; the latter is not built in, but you can implement it inside your own Parser component.

Let's get to it. First, modify the Parser:
```go
package parser

import (
	"context"
	"net/http"

	"github.com/PuerkitoBio/goquery"
	"github.com/superjcd/gocrawler/parser"
	"github.com/superjcd/gocrawler/request"
)

type zyteParser struct{}

func NewZyteParser() *zyteParser {
	return &zyteParser{}
}

func (p *zyteParser) Parse(ctx context.Context, r *http.Response) (*parser.ParseResult, error) {
	...
	resultItems := make([]parser.ParseItem, 0)
	requests := []*request.Request{}
	// gocrawler passes the Request's Data map into the context by default;
	// it can be read back with ctx.Value(request.RequestDataCtxKey{})
	ctxValue := ctx.Value(request.RequestDataCtxKey{})
	requestData := ctxValue.(map[string]string)
	page := requestData["page"]

	if page == "1" {
		uid, _ := uuid.NewV4()
		for pg := 2; pg <= 5; pg++ {
			data := make(map[string]string, 0)
			data["taskId"] = uid.String()
			data["page"] = strconv.Itoa(pg)
			url := fmt.Sprintf("https://www.zyte.com/blog/page/%d", pg)
			// note: this is where the new requests are built
			req := request.Request{
				URL:    url,
				Method: "GET",
				Data:   data,
			}
			requests = append(requests, &req)
		}
	}

	...
	result.Items = resultItems
	result.Requests = requests

	return result, nil
}
```
Note that for `requestData["page"]` to be present, the producer must also put the page number into `Request.Data` (e.g. `data["page"] = strconv.Itoa(pg)` in `pub/main.go`).
This way, when the first page is requested, the requests for the remaining pages are pushed onto the task queue along with it (normally, of course, you would parse the real maximum page number instead of hard-coding it).
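If you would rather not hard-code the upper bound, the page count can be read from the pagination links of the first page. A minimal sketch, reusing the `doc` already created inside `Parse` — the `a.page-number` selector and the `maxPage` helper are invented placeholders, not part of gocrawler; inspect the real markup and adjust:

```go
// parser/pagination.go (illustrative only)
package parser

import (
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// maxPage scans the pagination links and returns the largest page number it
// can parse; it falls back to 1 when nothing parses.
func maxPage(doc *goquery.Document) int {
	max := 1
	doc.Find("a.page-number").Each(func(i int, s *goquery.Selection) {
		if n, err := strconv.Atoi(strings.TrimSpace(s.Text())); err == nil && n > max {
			max = n
		}
	})
	return max
}
```
With something like this in place, the loop above can become `for pg := 2; pg <= maxPage(doc); pg++`.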
## Custom Components
### Replacing the Fetcher
The default Fetcher is a very simple HTTP client. On its own it is nowhere near enough against common anti-scraping measures, so sooner or later you will want a Fetcher that can, for example:
- pull proxies from a proxy pool
- pull cookies from a cookie pool
- change request headers

In gocrawler the Fetcher is just an interface, defined as:

```go
// the Fetcher interface
type Fetcher interface {
	Fetch(ctx context.Context, req *request.Request) (*http.Response, error)
}
```
To replace the default Fetcher, pass `worker.WithFetcher(your_fetcher)` to the Build function:
```go
config := default_builder.DefaultWorkerBuilderConfig{}
worker := config.Name("zyte").Build(your_parser, worker.WithFetcher(your_fetcher))
```
Your custom Fetcher (`your_fetcher`) only has to satisfy the interface above. Alternatively, you can create a Fetcher with gocrawler's `NewFectcher` constructor (note the current spelling of the exported name) and adjust the default Fetcher's behaviour — proxies, headers and so on — through its options.
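To make this concrete, here is a minimal sketch of a custom Fetcher that satisfies the interface above. The package, type and constructor names are invented for the example; only the `Fetch` signature comes from gocrawler:

```go
package myfetcher // any package name of your own works

import (
	"context"
	"net/http"
	"time"

	"github.com/superjcd/gocrawler/request"
)

// simpleFetcher builds an *http.Request from the gocrawler request, adds a
// fixed set of headers and sends it with a plain http.Client.
type simpleFetcher struct {
	cli     *http.Client
	headers map[string]string
}

func NewSimpleFetcher(timeout time.Duration, headers map[string]string) *simpleFetcher {
	return &simpleFetcher{cli: &http.Client{Timeout: timeout}, headers: headers}
}

func (f *simpleFetcher) Fetch(ctx context.Context, req *request.Request) (*http.Response, error) {
	httpReq, err := http.NewRequestWithContext(ctx, req.Method, req.URL, nil)
	if err != nil {
		return nil, err
	}
	for k, v := range f.headers {
		httpReq.Header.Set(k, v)
	}
	return f.cli.Do(httpReq)
}
```
An instance of it can then be passed to `worker.WithFetcher(...)` exactly as shown above.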
#### Custom Proxy
Create a Fetcher with gocrawler's `NewFectcher` and hand it a proxy getter:

```go
import (
	"time"

	"github.com/superjcd/gocrawler/fetcher"
)

fetcher := fetcher.NewFectcher(10*time.Second, fetcher.WithProxyGetter(your_proxy_getter))
```
`your_proxy_getter` is the proxy-supplying component you implement; `ProxyGetter` is defined as:
```go
type ProxyGetter interface {
	Get(*http.Request) (*url.URL, error)
}
```
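As an illustration, a round-robin getter over a fixed proxy list could look like the sketch below; everything except the `Get` signature is invented for the example:

```go
package myproxy // illustrative package name

import (
	"net/http"
	"net/url"
	"sync/atomic"
)

// roundRobinProxyGetter hands out proxies from a fixed list in rotation.
type roundRobinProxyGetter struct {
	proxies []*url.URL
	next    uint64
}

func NewRoundRobinProxyGetter(rawURLs ...string) (*roundRobinProxyGetter, error) {
	parsed := make([]*url.URL, 0, len(rawURLs))
	for _, raw := range rawURLs {
		u, err := url.Parse(raw)
		if err != nil {
			return nil, err
		}
		parsed = append(parsed, u)
	}
	return &roundRobinProxyGetter{proxies: parsed}, nil
}

func (p *roundRobinProxyGetter) Get(*http.Request) (*url.URL, error) {
	if len(p.proxies) == 0 {
		// a nil URL is what the stub in proxy/proxy.go returns; it means "no proxy"
		return nil, nil
	}
	i := atomic.AddUint64(&p.next, 1)
	return p.proxies[i%uint64(len(p.proxies))], nil
}
```
It would then be handed to the fetcher via `fetcher.WithProxyGetter(...)` as in the snippet above.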
#### Custom Request Headers
Request headers are part of the default Fetcher. To add your own headers:
```go
import (
	"time"

	"github.com/superjcd/gocrawler/fetcher"
)

headers := map[string]string{
	"accept": "application/json",
}

fetcher := fetcher.NewFectcher(10*time.Second, fetcher.WithHeaders(headers))
```

#### Custom User-Agent
`User-Agent` is also just a header, so it can be set the same way as above, or you can use a `UaGetter` to pick it dynamically, for example:

```go
import (
	"time"

	"github.com/superjcd/gocrawler/fetcher"
	"github.com/superjcd/gocrawler/ua"
)

uaGetter := ua.NewRoundRobinUAGetter()
fetcher := fetcher.NewFectcher(10*time.Second, fetcher.WithUaGetter(uaGetter))
```
> The uaGetter picks a user-agent from a pool of random UAs on every request the Fetcher makes; in the default Build configuration the default fetcher already uses this feature.


### Replacing the Storage Component
`DefaultWorkerBuilderConfig` currently only supports mongodb as the crawler's default storage backend. To use something else, implement your own Storage and, just like with the custom Fetcher, pass `worker.WithStorage(your_storage)` to the Build function to replace the default:
```go
type Storage interface {
	Save(ctx context.Context, datas ...parser.ParseItem) error
}
```
When writing a custom storage component it is usually worth adding some buffering — collect a batch of items and flush them to the backend in one go instead of writing row by row. For relational databases such as mysql in particular, per-row writes under high concurrency are very expensive.
The default mongo storage is buffered: `BufferSize` and `AutoFlushInterval` on `DefaultWorkerBuilderConfig` set the buffer size and the flush interval (in seconds), for example:
```go
config := default_builder.DefaultWorkerBuilderConfig{}
worker := config.Name("zyte").Workers(10).LimitRate(10).BufferSize(100).AutoFlushInterval(10).Build(your_parser)
```
In this example the crawler keeps a buffer of 100 items; the buffer is written to mongo when it fills up, and is flushed to mongo after 10 seconds even if it does not fill up.
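For reference, the sketch below implements `Save` by appending each item as a line of JSON to a local file. The package, type and constructor names are invented for the example, and it deliberately skips the buffering discussed above, so treat it as a starting point rather than a production sink:

```go
package mystore // illustrative package name

import (
	"context"
	"encoding/json"
	"os"
	"sync"

	"github.com/superjcd/gocrawler/parser"
)

// jsonlStorage writes each ParseItem as one JSON line; a mutex keeps
// concurrent Save calls from interleaving their writes.
type jsonlStorage struct {
	mu sync.Mutex
	f  *os.File
}

func NewJSONLStorage(path string) (*jsonlStorage, error) {
	f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644)
	if err != nil {
		return nil, err
	}
	return &jsonlStorage{f: f}, nil
}

func (s *jsonlStorage) Save(ctx context.Context, items ...parser.ParseItem) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	enc := json.NewEncoder(s.f)
	for _, item := range items {
		if err := enc.Encode(item); err != nil {
			return err
		}
	}
	return nil
}
```
It can then be swapped in with `worker.WithStorage(...)` as described above.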
### Other Components
- [Visit](https://github.com/superjcd/gocrawler/blob/main/visit/visit.go) — request deduplication
- [Counter](https://github.com/superjcd/gocrawler/blob/main/counter/counter.go) — task counting
- Limit — rate limiting

All of them can be plugged into gocrawler (or used to replace the defaults) via `Build(parser, With<Component>(implementation))`.

#### Request Deduplication
If requests should not be repeated within a given run of the crawler, use the Visit component for deduplication.
Its interface is defined as:
```go
type Visit interface {
	SetVisitted(key string, ttl time.Duration) error
	UnsetVisitted(key string) error
	IsVisited(key string) bool
}
```
`SetVisitted` marks a request as visited for a given lifetime (ttl); a marked request (i.e. a Request object) will not be visited again within that window.
gocrawler ships a redis-backed implementation, wired in like this:
```go
package main

import (
	"github.com/superjcd/gocrawler/worker"
	"github.com/superjcd/gocrawler/visit/redis"
)

config := default_builder.DefaultWorkerBuilderConfig{}
worker := config.Name("zyte").Build(your_parser, worker.WithVisiter(redis.NewRedisVisit(redis.Options, prefixKey)))
```
> By default gocrawler deduplicates on the Request's Url and Method. To include values from `Request.Data` in the hash, pass `worker.WithAddtionalHashKeys(your_keys)` to the Build function; note that if a key you specify does not exist in `Request.Data`, the worker panics.
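The redis implementation is the right choice for a fleet of workers; for a single process, an in-memory implementation of the same interface makes a useful reference. The sketch below is ours (nothing in it is part of gocrawler) and is only suitable for one worker, since its state is not shared:

```go
package myvisit // illustrative package name

import (
	"sync"
	"time"
)

// memoryVisit tracks visited request hashes in a map with per-key expiry.
type memoryVisit struct {
	mu   sync.Mutex
	seen map[string]time.Time // key -> expiry time
}

func NewMemoryVisit() *memoryVisit {
	return &memoryVisit{seen: make(map[string]time.Time)}
}

func (v *memoryVisit) SetVisitted(key string, ttl time.Duration) error {
	v.mu.Lock()
	defer v.mu.Unlock()
	v.seen[key] = time.Now().Add(ttl)
	return nil
}

func (v *memoryVisit) UnsetVisitted(key string) error {
	v.mu.Lock()
	defer v.mu.Unlock()
	delete(v.seen, key)
	return nil
}

func (v *memoryVisit) IsVisited(key string) bool {
	v.mu.Lock()
	defer v.mu.Unlock()
	expiry, ok := v.seen[key]
	if !ok {
		return false
	}
	if time.Now().After(expiry) {
		delete(v.seen, key) // expired entries are cleaned up lazily
		return false
	}
	return true
}
```
Swap it in with `worker.WithVisiter(...)` just like the redis version above.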
#### Task Counting
Counting tasks for a distributed crawler is a little awkward. The `redisTaskCounters` implementation that gocrawler ships uses redis's optimistic locking to provide a workable distributed counter; it is wired in the same way as the other components above, so the details are not repeated here.


#### Rate Limiting
```golang
package main

import (
	"github.com/superjcd/gocrawler/worker"
)

config := default_builder.DefaultWorkerBuilderConfig{}
worker := config.Name("zyte").Build(your_parser, worker.WithLimiter(your_limit))
```
> In most cases `config.LimitRate(number)` is enough (see the quick start above); `number` is the allowed number of requests per second.

## Error Handling and Lifecycle Hooks
A crawler has to deal with the network and with ever-evolving anti-scraping techniques, so one statement about crawl jobs is always true:
> our crawler can fail at any moment

How to handle failed requests properly is therefore a recurring theme. Simply dropping failed requests is not acceptable, and retrying forever is not either; retrying a bounded number of times is a reasonable compromise, and that is what gocrawler does — the retry count is set with the `Retries` method of `DefaultWorkerBuilderConfig` (default: 5). The more interesting question is how you tell gocrawler to retry a particular failed request instead of dropping it, because sometimes dropping really is what you want (for example, a request that returned a 404). Some mechanism for talking back to the gocrawler engine is therefore necessary.
Two things make this mechanism work:
- when to communicate
- how to communicate

### When to Communicate
A gocrawler Worker has the following lifecycle functions:
- BeforeRequest — runs before the request is sent
- AfterRequest — runs after the request returns
- BeforeSave — runs before items are stored
- AfterSave — runs after items are stored

Take `AfterRequest` as an example:
```go
func (w *worker) AfterRequest(ctx context.Context, resp *http.Response) (Signal, error) {
	var sig Signal
	if w.AfterRequestHook != nil {
		return w.AfterRequestHook(ctx, resp)
	}
	sig |= DummySignal
	return sig, nil
}
```
`AfterRequest` runs after the request has been made (after the Fetcher's fetch) and before the page is parsed. If you provide an `AfterRequestHook`, it is executed at this point (each lifecycle function has a matching hook), so this is exactly where you can inspect the response status code and decide whether to retry.

### How to Communicate
That covers the timing; now for the mechanism. gocrawler decides what to do next based on the Signal returned by the lifecycle function. Let's define an `AfterRequestHook` (which returns a Signal):
```golang
import "github.com/superjcd/gocrawler/worker"

func CheckResponseStatus(ctx context.Context, resp *http.Response) (worker.Signal, error) {
	var sig worker.Signal
	switch resp.StatusCode {
	case http.StatusOK:
		sig |= worker.DummySignal
	case http.StatusNotFound:
		sig |= worker.ContinueWithoutRetrySignal
	default:
		sig |= worker.ContinueWithRetrySignal
	}

	return sig, nil
}
```
> Register the hook by adding `worker.WithAfterRequestHook(CheckResponseStatus)` to the Build call; hooks for the other lifecycle stages are registered the same way.

`CheckResponseStatus` inspects the status code of the http.Response: 200 yields `DummySignal`, 404 yields `ContinueWithoutRetrySignal`, and anything else yields `ContinueWithRetrySignal`.
On `DummySignal` gocrawler simply carries on; on `ContinueWithoutRetrySignal` it skips the remaining steps and moves straight on to the next request; on `ContinueWithRetrySignal` it retries the request. The full list of signals:
```golang
type Signal int8

const (
	DummySignal = 1 << iota    // default initial signal
	ContinueWithRetrySignal    // retry the request
	ContinueWithoutRetrySignal // don't retry, move on to the next task
	BreakWithPanicSignal       // stop the crawler and panic
	BreakWithoutPanicSignal    // stop the crawler without panicking
)
```
> A Signal is just an 8-bit signed integer.

(One last note: a "retry" is not immediate — the request is pushed back onto the request queue and waits to be picked up again.)


## Advanced
### Sending Requests to Another Worker
What if we want to hand requests over to a different worker? Suppose we run two crawler workers:
- a list worker that collects the list items
- a detail worker that fetches the detail page of each item

Scenarios needing several workers like this are very common. Take property listings: the brief information lives on list pages — one list page might link to 20 properties — and clicking each link opens that property's detail page.
Because list pages and detail pages usually differ in both url and layout, the sensible setup is to run two workers (they can share components such as the fetcher). The question then becomes: while the **list crawler** is scraping the list pages, how do we submit the detail-page requests to the **detail crawler**?
It takes just two steps in gocrawler:
#### Step 1: Replace the Default Scheduler
gocrawler's Scheduler component has an option called secondScheduler (itself a Scheduler interface). When it is non-nil, requests can be forwarded to this secondScheduler (how to forward them is covered in step 2), and as long as another worker subscribes to the secondScheduler's messages, that second worker runs alongside the first.

```go
package main

import (
	"github.com/superjcd/gocrawler/worker"
	"github.com/superjcd/gocrawler/scheduler/nsq"
)


func main() {
	// prepare a second scheduler
	secondScheduler := nsq.NewNsqScheduler(your_second_topic, your_second_channel, "localhost:4150", "localhost:4161")
	scheduler := nsq.NewNsqScheduler(your_topic, your_channel, "localhost:4150", "localhost:4161", nsq.WithSecondScheduler(secondScheduler))

	config := default_builder.DefaultWorkerBuilderConfig{}
	worker := config.Name("zyte").MaxRunTime(300).Workers(10).LimitRate(10).Build(
		parser.NewZyteParser(), worker.WithScheduler(scheduler)) // replace the default scheduler
	worker.Run()
}

```
> The `Scheduler` is one of the core building blocks of the gocrawler engine, and like every other gocrawler component it is an interface, so it is easy to swap out; see [Custom Components](#custom-components) above for replacing the other components.

#### Step 2: Send Requests to the secondScheduler
Sending a Request to the secondScheduler is simple: set the Request's IsSecondary field to true. For example, suppose the list page yields a number of detail-page urls; as in the earlier example, we build new Request objects inside the Parse function:
```go
...
for _, url := range urls { // urls holds the detail-page addresses
	reqData := make(map[string]string, 0)
	reqData["taskId"] = uid.String()
	newRequest := request.Request{
		URL:         url,
		Method:      "GET",
		Data:        reqData,
		IsSecondary: true, // this is the key part
	}
	requests = append(requests, &newRequest)
}
...
result.Items = resultItems
result.Requests = requests

```

> In principle you could also keep all the logic in a single Worker, for instance by branching on the Response inside the Parser (the same goes for the storage component). That saves you from dispatching Requests to a secondScheduler, but you lose per-crawler control over things like rate limits and worker counts.

### Task Priority
First, be precise about what "priority" means.
If Worker2 must only start after Worker1 has finished, combine this with [Task Counting](#task-counting): confirm through the counter that Worker1 is done, then submit the tasks for Worker2.
If Worker1 and Worker2 should run at the same time but Worker1 should go faster (say 2:1), the simple approach is to adjust the two workers' concurrency and rate limits; for finer control you can also use the task counters to adjust the producer's submission rate in real time (with gocrawler's default counter, that just means reading the redis key values of the two workers).
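If both workers use the default redis counter, the producer can read their counts directly and pace itself. A rough sketch with the go-redis v6 client that gocrawler already depends on — the `count:` prefix and the task ids are placeholders for whatever you actually configured when setting up the counter and submitting the tasks:

```go
package main

import (
	"fmt"
	"log"

	"github.com/go-redis/redis"
)

func main() {
	rdb := redis.NewClient(&redis.Options{Addr: "127.0.0.1:6379"})

	// "count:" and the two task ids are assumptions for this example; use the
	// counter prefix and task ids you actually passed to the counter/producer.
	for _, taskID := range []string{"task-worker1", "task-worker2"} {
		n, err := rdb.Get("count:" + taskID).Int64()
		if err != nil && err != redis.Nil {
			log.Fatal(err)
		}
		fmt.Printf("%s: %d items stored\n", taskID, n)
	}
}
```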

### Dev Mode
TODO


## References
- [My blog post on the design](https://superjcd.github.io/p/golang%E5%88%86%E5%B8%83%E5%BC%8F%E7%88%AC%E8%99%AB%E8%AE%BE%E8%AE%A1/)
- [The zyte example used in this document](https://github.com/superjcd/gocrawler_examples)
-------------------------------------------------------------------------------- /builder/default/builder.go: -------------------------------------------------------------------------------- 1 | package default_builder 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/superjcd/gocrawler/fetcher" 7 | "github.com/superjcd/gocrawler/parser" 8 | "github.com/superjcd/gocrawler/scheduler/nsq" 9 | "github.com/superjcd/gocrawler/store/mongo" 10 | "github.com/superjcd/gocrawler/ua" 11 | "github.com/superjcd/gocrawler/worker" 12 | "golang.org/x/time/rate" 13 | ) 14 | 15 | type DefaultWorkerBuilderConfig struct { 16 | name string 17 | workers int 18 | retries int 19 | fetch_timeout int 20 | save_request_data bool 21 | max_run_time_seconds int 22 | nsqd_addr string 23 | nsqlookup_addr string 24 | nsq_topic_name string 25 | nsq_channel_name string 26 | mongo_uri string 27 | mongo_database string 28 | mongo_collection string 29 | limit_rate int 30 | buffer_szie int 31 | auto_flush_interval int 32 | } 33 | 34 | // defaults 35 | const ( 36 | WORKERS = 50 37 | RETRIES = 5 38 | FETCH_TIMEOUT = 10 39 | SAVE_REQUEST_DATA = true 40 | MAX_RUN_TIME_SECONDS = 360 41 | NSQD_ADDR = "localhost:4150" 42 | NSQLOOKUP_ADDR = "localhost:4161" 43 | NSQ_TOPIC_NAME = "gocrawler" 44 | NSQ_CHANNEL_NAME = "default" 45 | MONGO_URI = "mongodb://localhost:27017" 46 | MONGO_DATABASE = "gocrawler" 47 | MONGO_COLLECTION = "default" 48 | LIMIT_RATE = 50 49 | BUFFER_SIZE = 100 50 | AUTO_FLUSH_INTERVAL = 10 51 | ) 52 | 53 | func (bc *DefaultWorkerBuilderConfig) Name(name string) *DefaultWorkerBuilderConfig { 54 | bc.name = name 55 | return bc 56 | } 57 | 58 | func (bc *DefaultWorkerBuilderConfig) Workers(n int) *DefaultWorkerBuilderConfig { 59 | bc.workers = n 60 | return bc 61 | } 62 | 63 | func (bc *DefaultWorkerBuilderConfig) Retries(n int) *DefaultWorkerBuilderConfig { 64 | bc.retries = n 65 | return bc 66 | } 67 | 68 | func (bc *DefaultWorkerBuilderConfig) TimeOut(seconds int) *DefaultWorkerBuilderConfig { 69 | bc.fetch_timeout = seconds 70 | return bc 71 | } 72 | 73 | func (bc *DefaultWorkerBuilderConfig) SaveRequestData(save bool) *DefaultWorkerBuilderConfig { 74 | bc.save_request_data = save 75 | return bc 76 | } 77 | 78 | func (bc *DefaultWorkerBuilderConfig) MaxRunTime(seconds int) *DefaultWorkerBuilderConfig { 79 | bc.max_run_time_seconds = seconds 80 | return bc 81 | } 82 | 83 | func (bc *DefaultWorkerBuilderConfig) NsqScheduler(nsqd_addr, lookup_addr, topic, channel string) *DefaultWorkerBuilderConfig { 84 | if nsqd_addr != "" { 85 | bc.nsqd_addr = nsqd_addr 86 | } 87 | if lookup_addr != "" { 88 | bc.nsqlookup_addr = lookup_addr 89 | } 90 | if topic != "" { 91 | bc.nsq_topic_name = topic 92 | } 93 | if channel != "" { 94 | bc.nsq_channel_name = channel 95 | } 96 | 97 | return bc 98 | } 99 | 100 | func (bc *DefaultWorkerBuilderConfig) MongoDb(uri, database, collection string) *DefaultWorkerBuilderConfig { 101 | if uri != "" { 102 | bc.mongo_uri = uri 103 | } 104 | if database != "" { 105 | bc.mongo_database
= database 106 | } 107 | if collection != "" { 108 | bc.mongo_collection = collection 109 | } 110 | return bc 111 | } 112 | 113 | func (bc *DefaultWorkerBuilderConfig) LimitRate(rate int) *DefaultWorkerBuilderConfig { 114 | bc.limit_rate = rate 115 | return bc 116 | } 117 | 118 | func (bc *DefaultWorkerBuilderConfig) BufferSize(size int) *DefaultWorkerBuilderConfig { 119 | bc.buffer_szie = size 120 | return bc 121 | } 122 | 123 | func (bc *DefaultWorkerBuilderConfig) AutoFlushInterval(interval int) *DefaultWorkerBuilderConfig { 124 | bc.auto_flush_interval = interval 125 | return bc 126 | } 127 | 128 | func (bc *DefaultWorkerBuilderConfig) Build(parser parser.Parser, opts ...worker.Option) worker.Worker { 129 | if bc.workers == 0 { 130 | bc.workers = WORKERS 131 | } 132 | if bc.retries == 0 { 133 | bc.retries = RETRIES 134 | } 135 | if bc.fetch_timeout == 0 { 136 | bc.fetch_timeout = FETCH_TIMEOUT 137 | } 138 | 139 | if bc.max_run_time_seconds == 0 { 140 | bc.max_run_time_seconds = MAX_RUN_TIME_SECONDS 141 | } 142 | if bc.nsqd_addr == "" { 143 | bc.nsqd_addr = NSQD_ADDR 144 | } 145 | if bc.nsqlookup_addr == "" { 146 | bc.nsqlookup_addr = NSQLOOKUP_ADDR 147 | } 148 | if bc.nsq_channel_name == "" { 149 | bc.nsq_channel_name = NSQ_CHANNEL_NAME 150 | } 151 | if bc.nsq_topic_name == "" { 152 | if bc.name != "" { 153 | bc.nsq_topic_name = bc.name 154 | } else { 155 | bc.nsq_topic_name = NSQ_TOPIC_NAME 156 | } 157 | } 158 | if bc.mongo_uri == "" { 159 | bc.mongo_uri = MONGO_URI 160 | } 161 | if bc.mongo_database == "" { 162 | if bc.name != "" { 163 | bc.mongo_database = bc.name 164 | } else { 165 | bc.mongo_database = MONGO_DATABASE 166 | } 167 | } 168 | if bc.mongo_collection == "" { 169 | bc.mongo_collection = MONGO_COLLECTION 170 | } 171 | if bc.limit_rate == 0 { 172 | bc.limit_rate = LIMIT_RATE 173 | } 174 | if bc.buffer_szie == 0 { 175 | bc.buffer_szie = BUFFER_SIZE 176 | } 177 | if bc.auto_flush_interval == 0 { 178 | bc.auto_flush_interval = AUTO_FLUSH_INTERVAL 179 | } 180 | fetcher := fetcher.NewFectcher(time.Duration(bc.fetch_timeout)*time.Second, fetcher.WithUaGetter(ua.NewRoundRobinUAGetter())) 181 | 182 | scheduler := nsq.NewNsqScheduler(bc.nsq_topic_name, bc.nsq_channel_name, bc.nsqd_addr, bc.nsqlookup_addr) 183 | 184 | limiter := rate.NewLimiter(rate.Limit(bc.limit_rate), 1) 185 | storage := mongo.NewBufferedMongoStorage(bc.mongo_uri, 186 | bc.mongo_database, 187 | bc.mongo_collection, 188 | bc.buffer_szie, 189 | time.Duration(bc.auto_flush_interval)*time.Second) 190 | 191 | worker := worker.NewWorker(bc.name, 192 | bc.workers, 193 | bc.retries, 194 | bc.save_request_data, 195 | time.Second*time.Duration(bc.max_run_time_seconds), 196 | worker.WithScheduler(scheduler), 197 | worker.WithFetcher(fetcher), 198 | worker.WithLimiter(limiter), 199 | worker.WithParser(parser), 200 | worker.WithStore(storage), 201 | ) 202 | 203 | for _, opt := range opts { 204 | opt(&worker.Options) 205 | } 206 | return worker 207 | } 208 | -------------------------------------------------------------------------------- /cookie/cookie.go: -------------------------------------------------------------------------------- 1 | package cookie 2 | 3 | import ( 4 | "context" 5 | "net/http/cookiejar" 6 | ) 7 | 8 | type CookieGetter interface { 9 | Get(context.Context) (*cookiejar.Jar, error) 10 | } 11 | -------------------------------------------------------------------------------- /counter/counter.go: -------------------------------------------------------------------------------- 1 | package counter 2 | 3 | import 
( 4 | "strings" 5 | "sync/atomic" 6 | "time" 7 | 8 | "github.com/go-redis/redis" 9 | ) 10 | 11 | // 任务技计数器 12 | type Counter interface { 13 | Incr(key string, num int64) 14 | GetCounterPrefix() string 15 | GetTaskIdField() string 16 | } 17 | 18 | // Redis transactions use optimistic locking. 19 | const ( 20 | maxRetries = 1000 21 | ) 22 | 23 | type redisTaskCounters struct { 24 | prefix string 25 | taskKeyField string 26 | RCli *redis.Client 27 | TTL time.Duration 28 | } 29 | 30 | func NewRedisTaskCounters(r_config redis.Options, ttl time.Duration, counterPrefix, taskField string) *redisTaskCounters { 31 | if !strings.HasSuffix(counterPrefix, ":") { 32 | counterPrefix = counterPrefix + ":" 33 | } 34 | rc := &redisTaskCounters{TTL: ttl, prefix: counterPrefix, taskKeyField: taskField} 35 | rc.RCli = redis.NewClient(&r_config) 36 | return rc 37 | } 38 | 39 | func (c *redisTaskCounters) GetCounterPrefix() string { 40 | return c.prefix 41 | } 42 | 43 | func (c *redisTaskCounters) GetTaskIdField() string { 44 | return c.taskKeyField 45 | } 46 | 47 | func (c *redisTaskCounters) Incr(key string, increment int64) { 48 | // transaction 49 | key = c.prefix + key 50 | txf := func(tx *redis.Tx) error { 51 | // Get the current value or zero. 52 | n, err := tx.Get(key).Int64() 53 | if err != nil && err != redis.Nil { 54 | return err 55 | } 56 | 57 | // Actual operation (local in optimistic lock). 58 | atomic.AddInt64(&n, increment) 59 | 60 | // Operation is commited only if the watched keys remain unchanged. 61 | _, err = tx.Pipelined(func(pipe redis.Pipeliner) error { 62 | pipe.Set(key, n, c.TTL) // time 63 | return nil 64 | }) 65 | return err 66 | } 67 | 68 | // Retry if the key has been changed. 69 | for i := 0; i < maxRetries; i++ { 70 | err := c.RCli.Watch(txf, key) 71 | if err == nil { 72 | // Success. 73 | return 74 | } 75 | if err == redis.TxFailedErr { 76 | // Optimistic lock lost. Retry. 77 | continue 78 | } 79 | // TODO: igonore any other error. 
80 | return 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /fetcher/fetcher.go: -------------------------------------------------------------------------------- 1 | package fetcher 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net/http" 7 | "time" 8 | 9 | "github.com/superjcd/gocrawler/request" 10 | ) 11 | 12 | type Fetcher interface { 13 | Fetch(ctx context.Context, req *request.Request) (*http.Response, error) 14 | } 15 | 16 | type fectcher struct { 17 | Cli *http.Client 18 | options 19 | } 20 | 21 | var _ Fetcher = (*fectcher)(nil) 22 | 23 | func NewFectcher(timeOut time.Duration, opts ...Option) *fectcher { 24 | options := options{} 25 | for _, opt := range opts { 26 | opt(&options) 27 | } 28 | 29 | var transport *http.Transport 30 | if options.transport != nil { 31 | transport = options.transport 32 | } else { 33 | transport = http.DefaultTransport.(*http.Transport) 34 | transport.DisableKeepAlives = true 35 | } 36 | 37 | client := &http.Client{Transport: transport, Timeout: timeOut} 38 | f := &fectcher{Cli: client} 39 | 40 | return f 41 | } 42 | 43 | func (f *fectcher) Fetch(ctx context.Context, r *request.Request) (resp *http.Response, err error) { 44 | if f.cookieGetter != nil { 45 | jar, err := f.cookieGetter.Get(ctx) 46 | if err != nil { 47 | return nil, err 48 | } 49 | f.Cli.Jar = jar 50 | } 51 | 52 | req, err := http.NewRequest(r.Method, r.URL, nil) 53 | if err != nil { 54 | return nil, fmt.Errorf("get url failed: %w", err) 55 | } 56 | 57 | if f.uaGetter != nil { 58 | ua, err := f.uaGetter.Get(ctx) 59 | 60 | if err != nil { 61 | return nil, fmt.Errorf("get ua failed: %w", err) 62 | } 63 | req.Header.Set("User-Agent", ua) 64 | } 65 | 66 | if f.headers != nil { 67 | for key, value := range f.headers { 68 | req.Header.Set(key, value) 69 | } 70 | } 71 | 72 | resp, err = f.Cli.Do(req) 73 | 74 | if err != nil { 75 | return nil, err 76 | } 77 | 78 | return 79 | } 80 | 81 | func (f *fectcher) Health() (bool, map[string]any) { 82 | return true, nil 83 | } 84 | -------------------------------------------------------------------------------- /fetcher/options.go: -------------------------------------------------------------------------------- 1 | package fetcher 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/superjcd/gocrawler/cookie" 7 | "github.com/superjcd/gocrawler/proxy" 8 | "github.com/superjcd/gocrawler/ua" 9 | ) 10 | 11 | // proxyGetter proxy.ProxyGetter, cookieGetter cookie.CoookieGetter, uaGetter ua.UaGetter 12 | type options struct { 13 | transport *http.Transport 14 | proxyGetter proxy.ProxyGetter 15 | cookieGetter cookie.CookieGetter 16 | uaGetter ua.UaGetter 17 | headers map[string]string 18 | } 19 | 20 | type Option func(opts *options) 21 | 22 | func WithTransport(transport *http.Transport) Option { 23 | return func(opts *options) { 24 | opts.transport = transport 25 | } 26 | } 27 | 28 | func WithProxyGetter(proxyGetter proxy.ProxyGetter) Option { 29 | return func(opts *options) { 30 | opts.proxyGetter = proxyGetter 31 | } 32 | } 33 | 34 | func WithCookieGetter(cookieGetter cookie.CookieGetter) Option { 35 | return func(opts *options) { 36 | opts.cookieGetter = cookieGetter 37 | } 38 | } 39 | 40 | func WithHeaders(headers map[string]string) Option { 41 | return func(opts *options) { 42 | opts.headers = headers 43 | } 44 | } 45 | 46 | func WithUaGetter(uaGetter ua.UaGetter) Option { 47 | return func(opts *options) { 48 | opts.uaGetter = uaGetter 49 | } 50 | } 51 | 
-------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/superjcd/gocrawler 2 | 3 | go 1.21.6 4 | 5 | require ( 6 | github.com/go-redis/redis v6.15.9+incompatible 7 | github.com/nsqio/go-nsq v1.1.0 8 | github.com/qiniu/qmgo v1.1.8 9 | golang.org/x/time v0.5.0 10 | ) 11 | 12 | require ( 13 | github.com/fsnotify/fsnotify v1.6.0 // indirect 14 | github.com/go-playground/locales v0.13.0 // indirect 15 | github.com/go-playground/universal-translator v0.17.0 // indirect 16 | github.com/go-playground/validator/v10 v10.4.1 // indirect 17 | github.com/golang/snappy v0.0.4 // indirect 18 | github.com/google/go-cmp v0.6.0 // indirect 19 | github.com/klauspost/compress v1.13.6 // indirect 20 | github.com/leodido/go-urn v1.2.0 // indirect 21 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect 22 | github.com/onsi/ginkgo v1.16.5 // indirect 23 | github.com/onsi/gomega v1.30.0 // indirect 24 | github.com/stretchr/testify v1.8.4 // indirect 25 | github.com/xdg-go/pbkdf2 v1.0.0 // indirect 26 | github.com/xdg-go/scram v1.1.2 // indirect 27 | github.com/xdg-go/stringprep v1.0.4 // indirect 28 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect 29 | go.mongodb.org/mongo-driver v1.13.1 // indirect 30 | golang.org/x/crypto v0.19.0 // indirect 31 | golang.org/x/net v0.21.0 // indirect 32 | golang.org/x/sync v0.6.0 // indirect 33 | golang.org/x/sys v0.17.0 // indirect 34 | golang.org/x/text v0.14.0 // indirect 35 | ) 36 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= 5 | github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= 6 | github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= 7 | github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= 8 | github.com/go-playground/assert/v2 v2.0.1 h1:MsBgLAaY856+nPRTKrp3/OZK38U/wa0CcBYNjji3q3A= 9 | github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= 10 | github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q= 11 | github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= 12 | github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= 13 | github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= 14 | github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7aM3F26W0hOn+GE= 15 | github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= 16 | github.com/go-redis/redis v6.15.9+incompatible h1:K0pv1D7EQUjfyoMql+r/jZqCLizCGKFlFgcHWWmHQjg= 17 | github.com/go-redis/redis v6.15.9+incompatible/go.mod h1:NAIEuMOZ/fxfXJIrKDQDz8wamY7mA7PouImQ2Jvg6kA= 18 | github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod 
h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= 19 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 20 | github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= 21 | github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= 22 | github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= 23 | github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= 24 | github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= 25 | github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= 26 | github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 27 | github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= 28 | github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 29 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 30 | github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 31 | github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 32 | github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 33 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 34 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 35 | github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= 36 | github.com/klauspost/compress v1.13.6 h1:P76CopJELS0TiO2mebmnzgWaajssP/EszplttgQxcgc= 37 | github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= 38 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= 39 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 40 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 41 | github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= 42 | github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= 43 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe h1:iruDEfMl2E6fbMZ9s0scYfZQ84/6SPL6zC8ACM2oIL0= 44 | github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= 45 | github.com/nsqio/go-nsq v1.1.0 h1:PQg+xxiUjA7V+TLdXw7nVrJ5Jbl3sN86EhGCQj4+FYE= 46 | github.com/nsqio/go-nsq v1.1.0/go.mod h1:vKq36oyeVXgsS5Q8YEO7WghqidAVXQlcFxzQbQTuDEY= 47 | github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= 48 | github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= 49 | github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= 50 | github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= 51 | github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= 52 | github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= 53 | github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= 54 | github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= 55 | github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= 56 | 
github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= 57 | github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= 58 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 59 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 60 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 61 | github.com/qiniu/qmgo v1.1.8 h1:E64M+P59aqQpXKI24ClVtluYkLaJLkkeD2hTVhrdMks= 62 | github.com/qiniu/qmgo v1.1.8/go.mod h1:QvZkzWNEv0buWPx0kdZsSs6URhESVubacxFPlITmvB8= 63 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 64 | github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= 65 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 66 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 67 | github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= 68 | github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 69 | github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= 70 | github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= 71 | github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= 72 | github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= 73 | github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= 74 | github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g= 75 | github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= 76 | github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= 77 | github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8= 78 | github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= 79 | github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= 80 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d h1:splanxYIlg+5LfHAM6xpdFEAYOk8iySO56hMFq6uLyA= 81 | github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA= 82 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 83 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 84 | go.mongodb.org/mongo-driver v1.11.6/go.mod h1:G9TgswdsWjX4tmDA5zfs2+6AEPpYJwqblyjsfuh8oXY= 85 | go.mongodb.org/mongo-driver v1.13.1 h1:YIc7HTYsKndGK4RFzJ3covLz1byri52x0IoMB0Pt/vk= 86 | go.mongodb.org/mongo-driver v1.13.1/go.mod h1:wcDf1JBCXy2mOW0bWHwO/IOYqdca1MPCwDtFu/Z9+eo= 87 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 88 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 89 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 90 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 91 | golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= 92 | golang.org/x/crypto v0.19.0 h1:ENy+Az/9Y1vSrlrvBSyna3PITt4tiZLf7sgCjZBX7Wo= 93 | golang.org/x/crypto 
v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= 94 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 95 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 96 | golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 97 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 98 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 99 | golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= 100 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 101 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 102 | golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 103 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 104 | golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= 105 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= 106 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 107 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 108 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 109 | golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 110 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 111 | golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= 112 | golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 113 | golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 114 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 115 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 116 | golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 117 | golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 118 | golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 119 | golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 120 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 121 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 122 | golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 123 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 124 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 125 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 126 | 
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 127 | golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 128 | golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= 129 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 130 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 131 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 132 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 133 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 134 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 135 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 136 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 137 | golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= 138 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 139 | golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= 140 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 141 | golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= 142 | golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 143 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 144 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 145 | golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= 146 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 147 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 148 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 149 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 150 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 151 | google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= 152 | google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= 153 | google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= 154 | google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= 155 | google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= 156 | google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 157 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 158 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 159 | gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= 160 | gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 
h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= 161 | gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= 162 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 163 | gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 164 | gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 165 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 166 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 167 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 168 | -------------------------------------------------------------------------------- /health/health.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | type HealthChecker interface { 4 | Health() (bool, map[string]any) 5 | } 6 | 7 | 8 | -------------------------------------------------------------------------------- /parser/parser.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | 7 | "github.com/superjcd/gocrawler/request" 8 | ) 9 | 10 | type ParseItem map[string]interface{} 11 | 12 | type ParseResult struct { 13 | Items []ParseItem 14 | Requests []*request.Request 15 | } 16 | 17 | type Parser interface { 18 | Parse(ctx context.Context, resp *http.Response) (*ParseResult, error) 19 | } 20 | -------------------------------------------------------------------------------- /proxy/proxy.go: -------------------------------------------------------------------------------- 1 | package proxy 2 | 3 | import ( 4 | "net/http" 5 | "net/url" 6 | ) 7 | 8 | type ProxyGetter interface { 9 | Get(*http.Request) (*url.URL, error) 10 | } 11 | 12 | var _ ProxyGetter = (*randomFixedProxyGetter)(nil) 13 | 14 | type randomFixedProxyGetter struct { 15 | Urls []string 16 | } 17 | 18 | func NewRandomFixedProxyGetter(urls ...string) *randomFixedProxyGetter { 19 | return &randomFixedProxyGetter{Urls: urls} 20 | } 21 | 22 | func (p *randomFixedProxyGetter) Get(*http.Request) (*url.URL, error) { 23 | return nil, nil 24 | } 25 | -------------------------------------------------------------------------------- /request/request.go: -------------------------------------------------------------------------------- 1 | package request 2 | 3 | import ( 4 | "bytes" 5 | "crypto/md5" 6 | "fmt" 7 | ) 8 | 9 | type Request struct { 10 | URL string 11 | Method string 12 | Retry int 13 | Data map[string]string // optional, will be passed to context if exists 14 | IsSecondary bool 15 | } 16 | 17 | type RequestDataCtxKey struct{} 18 | 19 | func (r *Request) Hash(hashFields ...string) string { 20 | components := make([][]byte, 2+len(hashFields)) 21 | components[0] = []byte(r.URL) 22 | components[1] = []byte(r.Method) 23 | 24 | for i, field := range hashFields { 25 | if fieldValue, ok := r.Data[field]; ok { 26 | components[i+2] = []byte(fieldValue) 27 | } else { 28 | panic(fmt.Errorf("field not found in `request.Data`")) 29 | } 30 | 31 | } 32 | 33 | hash := md5.Sum(bytes.Join(components, []byte(":"))) 34 | return string(hash[:]) 35 | } 36 | -------------------------------------------------------------------------------- /request/request_test.go: -------------------------------------------------------------------------------- 1 | package request 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func 
TestRequestHashWithSamePayload(t *testing.T) { 8 | var moreInfos = map[string]string{ 9 | "taskid": "123", 10 | } 11 | req := Request{ 12 | URL: "http://example.com", 13 | Method: "GET", 14 | Data: moreInfos, 15 | } 16 | 17 | hashed1 := req.Hash("taskid") 18 | 19 | req2 := Request{ 20 | URL: "http://example.com", 21 | Method: "GET", 22 | Data: moreInfos, 23 | } 24 | hashed2 := req2.Hash("taskid") 25 | 26 | if hashed1 != hashed2 { 27 | t.Errorf("shoud be equal") 28 | } 29 | } 30 | 31 | func TestRequestHashWithDifferentPayload(t *testing.T) { 32 | var moreInfos = map[string]string{ 33 | "taskid": "123", 34 | } 35 | // different taskid 36 | var moreInfos2 = map[string]string{ 37 | "taskid": "456", 38 | } 39 | req := Request{ 40 | URL: "http://example.com", 41 | Method: "GET", 42 | Data: moreInfos, 43 | } 44 | 45 | hashed1 := req.Hash("taskid") 46 | 47 | req2 := Request{ 48 | URL: "http://example.com", 49 | Method: "GET", 50 | Data: moreInfos2, 51 | } 52 | hashed2 := req2.Hash("taskid") 53 | 54 | if hashed1 == hashed2 { 55 | t.Errorf("should not be equal") 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /scheduler/nsq/nsq.go: -------------------------------------------------------------------------------- 1 | package nsq 2 | 3 | import ( 4 | "encoding/json" 5 | "log" 6 | 7 | "github.com/nsqio/go-nsq" 8 | "github.com/superjcd/gocrawler/request" 9 | "github.com/superjcd/gocrawler/scheduler" 10 | ) 11 | 12 | type nsqScheduler struct { 13 | workerCh chan *request.Request 14 | nsqLookupdAddr string 15 | topicName string 16 | channelName string 17 | nsqConsumer *nsq.Consumer 18 | nsqProducer *nsq.Producer 19 | options 20 | } 21 | 22 | type nsqMessageHandler struct { 23 | s *nsqScheduler 24 | } 25 | 26 | func (h *nsqMessageHandler) HandleMessage(m *nsq.Message) error { 27 | var err error 28 | if len(m.Body) == 0 { 29 | return nil 30 | } 31 | 32 | processMessage := func(mb []byte) error { 33 | var req request.Request 34 | if err = json.Unmarshal(mb, &req); err != nil { 35 | return err 36 | 37 | } 38 | h.s.Push(scheduler.TYP_PUSH_CHANNEL, &req) 39 | return nil 40 | } 41 | 42 | err = processMessage(m.Body) 43 | 44 | return err 45 | 46 | } 47 | 48 | var _ scheduler.Scheduler = (*nsqScheduler)(nil) 49 | 50 | func NewNsqScheduler(topicName, channelName, nsqAddr, nsqLookupdAddr string, opts ...Option) *nsqScheduler { 51 | options := options{} 52 | 53 | for _, opt := range opts { 54 | opt(&options) 55 | } 56 | 57 | nsqConfig := nsq.NewConfig() 58 | 59 | nsqConsumer, err := nsq.NewConsumer(topicName, channelName, nsqConfig) 60 | 61 | if err != nil { 62 | log.Fatal(err) 63 | } 64 | 65 | nsqProducer, err := nsq.NewProducer(nsqAddr, nsqConfig) 66 | 67 | if err != nil { 68 | log.Fatal(err) 69 | } 70 | 71 | workerCh := make(chan *request.Request) 72 | 73 | return &nsqScheduler{workerCh: workerCh, 74 | topicName: topicName, 75 | channelName: channelName, 76 | nsqLookupdAddr: nsqLookupdAddr, 77 | nsqConsumer: nsqConsumer, 78 | nsqProducer: nsqProducer, 79 | options: options, 80 | } 81 | } 82 | 83 | func (s *nsqScheduler) Pull() *request.Request { 84 | req := <-s.workerCh 85 | return req 86 | } 87 | 88 | func (s *nsqScheduler) Push(typ int, reqs ...*request.Request) { 89 | switch typ { 90 | case scheduler.TYP_PUSH_CHANNEL: 91 | for _, req := range reqs { 92 | s.workerCh <- req 93 | } 94 | case scheduler.TYP_PUSH_SCHEDULER: 95 | for _, req := range reqs { 96 | msg, err := json.Marshal(req) 97 | if err != nil { 98 | log.Printf("push msg to nsq failed") 99 | } 100 | 
s.nsqProducer.Publish(s.topicName, msg) 101 | 102 | } 103 | default: 104 | log.Fatal("wrong push type") 105 | } 106 | 107 | } 108 | 109 | func (s *nsqScheduler) Schedule() { 110 | s.nsqConsumer.AddHandler(&nsqMessageHandler{s: s}) 111 | if err := s.nsqConsumer.ConnectToNSQLookupd(s.nsqLookupdAddr); err != nil { 112 | log.Fatal(err) 113 | } 114 | 115 | } 116 | 117 | func (s *nsqScheduler) SecondScheduler() scheduler.Scheduler { 118 | return s.secondScheduler 119 | } 120 | 121 | func (s *nsqScheduler) Stop() { 122 | s.nsqConsumer.Stop() 123 | } 124 | -------------------------------------------------------------------------------- /scheduler/nsq/option.go: -------------------------------------------------------------------------------- 1 | package nsq 2 | 3 | import ( 4 | "github.com/superjcd/gocrawler/scheduler" 5 | ) 6 | 7 | type options struct { 8 | secondScheduler scheduler.Scheduler 9 | } 10 | 11 | type Option func(opts *options) 12 | 13 | func WithSecondScheduler(scheduler *nsqScheduler) Option { 14 | return func(opts *options) { 15 | opts.secondScheduler = scheduler 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /scheduler/scheduler.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import "github.com/superjcd/gocrawler/request" 4 | 5 | const ( 6 | TYP_PUSH_CHANNEL = iota 7 | TYP_PUSH_SCHEDULER 8 | ) 9 | 10 | type Scheduler interface { 11 | Pull() *request.Request 12 | Push(typ int, reqs ...*request.Request) 13 | Schedule() 14 | SecondScheduler() Scheduler 15 | } 16 | -------------------------------------------------------------------------------- /store/mongo/buffered.go: -------------------------------------------------------------------------------- 1 | package mongo 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "sync" 8 | "time" 9 | 10 | "github.com/go-redis/redis" 11 | "github.com/qiniu/qmgo" 12 | "github.com/superjcd/gocrawler/counter" 13 | "github.com/superjcd/gocrawler/parser" 14 | "github.com/superjcd/gocrawler/store" 15 | ) 16 | 17 | type bufferedMongoStorage struct { 18 | L *sync.Mutex 19 | Cli *qmgo.QmgoClient 20 | buf []parser.ParseItem 21 | bufSize int 22 | counter counter.Counter 23 | taskKeyField string 24 | } 25 | 26 | var _ store.Storage = (*bufferedMongoStorage)(nil) 27 | 28 | type BufferedMongoStorageOption func(s *bufferedMongoStorage) 29 | 30 | func WithRedisCounter(r_config redis.Options, ttl time.Duration, counterPrefix, keyField string) BufferedMongoStorageOption { 31 | return func(s *bufferedMongoStorage) { 32 | redisCounter := counter.NewRedisTaskCounters(r_config, ttl, counterPrefix, keyField) 33 | s.counter = redisCounter 34 | s.taskKeyField = keyField 35 | } 36 | } 37 | 38 | const DEFAULT_BUFFER_SIZE = 100 39 | 40 | func NewBufferedMongoStorage(uri, database, collection string, bufferSize int, autoFlushInterval time.Duration, opts ...BufferedMongoStorageOption) *bufferedMongoStorage { 41 | ctx := context.Background() 42 | cli, err := qmgo.Open(ctx, &qmgo.Config{Uri: uri, 43 | Database: database, 44 | Coll: collection}) 45 | if err != nil { 46 | panic(err) 47 | } 48 | 49 | if bufferSize == 0 { 50 | bufferSize = DEFAULT_BUFFER_SIZE 51 | } 52 | 53 | buf := make([]parser.ParseItem, 0, bufferSize) 54 | 55 | var l sync.Mutex 56 | 57 | store := &bufferedMongoStorage{Cli: cli, L: &l, bufSize: bufferSize, buf: buf} 58 | for _, option := range opts { 59 | option(store) 60 | } 61 | 62 | ticker := time.NewTicker(autoFlushInterval) 63 | 64 | go 
func() { 65 | for t := range ticker.C { 66 | log.Printf("auto flush triggered at %v", t) 67 | store.flush() 68 | } 69 | 70 | }() 71 | 72 | return store 73 | 74 | } 75 | 76 | func (s *bufferedMongoStorage) Save(ctx context.Context, items ...parser.ParseItem) error { 77 | s.L.Lock() 78 | defer s.L.Unlock() 79 | for { 80 | if len(items) > s.bufSize { 81 | return fmt.Errorf("number of items exceeds the buffer size; either increase the storage bufSize or save fewer items per call") 82 | } 83 | 84 | if len(items) > (s.bufSize - len(s.buf)) { 85 | if err := s.flush(); err != nil { 86 | return err 87 | } 88 | 89 | } else { 90 | s.buf = append(s.buf, items...) 91 | break 92 | } 93 | } 94 | 95 | return nil 96 | } 97 | 98 | func (s *bufferedMongoStorage) flush() error { 99 | if len(s.buf) == 0 { 100 | return nil 101 | } 102 | err := s.insertManyToMongo(s.buf...) 103 | if err != nil { 104 | log.Printf("flush failed: %v", err) 105 | return err 106 | } 107 | 108 | if s.counter != nil { 109 | tc := s.collectTaskCounts(s.buf) 110 | s.count(tc) 111 | } 112 | // reset the buffer 113 | s.buf = make([]parser.ParseItem, 0, s.bufSize) 114 | log.Println("buffer flushed") 115 | return nil 116 | } 117 | 118 | func (s *bufferedMongoStorage) collectTaskCounts(buf []parser.ParseItem) (tc map[string]int64) { 119 | tc = make(map[string]int64, 128) 120 | for _, item := range buf { 121 | if taskId, ok := item[s.taskKeyField]; !ok { 122 | panic(fmt.Errorf("`%s` not found in ParseItem: to use the task counter, every ParseItem must carry the `%s` field", s.taskKeyField, s.taskKeyField)) 123 | } else { 124 | switch v := taskId.(type) { 125 | case string: 126 | tc[v] += 1 127 | default: 128 | panic(fmt.Sprintf("`%s` must be a string", s.taskKeyField)) 129 | } 130 | } 131 | 132 | } 133 | return tc 134 | } 135 | 136 | func (s *bufferedMongoStorage) count(tc map[string]int64) { 137 | for k, v := range tc { 138 | s.counter.Incr(k, v) 139 | } 140 | } 141 | 142 | func (s *bufferedMongoStorage) insertManyToMongo(items ...parser.ParseItem) error { 143 | if result, err := s.Cli.Collection.InsertMany(context.Background(), items); err != nil { 144 | return err 145 | } else { 146 | log.Printf("%d objects saved", len(result.InsertedIDs)) 147 | return nil 148 | } 149 | } 150 | 151 | func (s *bufferedMongoStorage) Close() error { 152 | if err := s.flush(); err != nil { 153 | return err 154 | } 155 | return s.Cli.Close(context.Background()) 156 | } 157 | -------------------------------------------------------------------------------- /store/mongo/unbuffered.go: -------------------------------------------------------------------------------- 1 | package mongo 2 | 3 | import ( 4 | "context" 5 | "log" 6 | 7 | "github.com/qiniu/qmgo" 8 | "github.com/superjcd/gocrawler/parser" 9 | "github.com/superjcd/gocrawler/store" 10 | ) 11 | 12 | type mongoStorage struct { 13 | Cli *qmgo.QmgoClient 14 | } 15 | 16 | var _ store.Storage = (*mongoStorage)(nil) 17 | 18 | func NewMongoStorage(uri, database, collection string) *mongoStorage { 19 | ctx := context.Background() 20 | cli, err := qmgo.Open(ctx, &qmgo.Config{Uri: uri, 21 | Database: database, 22 | Coll: collection}) 23 | if err != nil { 24 | panic(err) 25 | } 26 | 27 | return &mongoStorage{Cli: cli} 28 | } 29 | 30 | func (s *mongoStorage) Save(ctx context.Context, items ...parser.ParseItem) error { 31 | var result *qmgo.InsertOneResult 32 | var err error 33 | for _, item := range items { 34 | 35 | result, err = s.Cli.Collection.InsertOne(context.Background(), item) 36 | if err == nil { 37 | log.Println("[store]insert one ok") 38 
| } 40 | if err != nil { 41 | return err 42 | } 43 | _ = result 44 | return nil 45 | } 46 | -------------------------------------------------------------------------------- /store/store.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/superjcd/gocrawler/parser" 7 | ) 8 | 9 | type Storage interface { 10 | Save(ctx context.Context, datas ...parser.ParseItem) error 11 | } 12 | -------------------------------------------------------------------------------- /ua/ua.go: -------------------------------------------------------------------------------- 1 | package ua 2 | 3 | import ( 4 | "context" 5 | "math/rand" 6 | ) 7 | 8 | type UaGetter interface { 9 | Get(context.Context) (string, error) 10 | } 11 | 12 | type roundRobinUA struct{} 13 | 14 | func (u *roundRobinUA) Get(ctx context.Context) (string, error) { 15 | return DEFAULT_UAS[rand.Intn(len(DEFAULT_UAS)-1)], nil 16 | } 17 | 18 | func NewRoundRobinUAGetter() *roundRobinUA { 19 | return &roundRobinUA{} 20 | } 21 | -------------------------------------------------------------------------------- /ua/ua_list.go: -------------------------------------------------------------------------------- 1 | package ua 2 | 3 | var DEFAULT_UAS []string = []string{ 4 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", 5 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 6 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", 7 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 8 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0", 9 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.4.16 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0", 11 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0", 12 | "Mozilla/5.0 (compatible; AirtableScripting; +undefined/developers/scripting;)", 13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 14 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/13.6.3 Chrome/114.0.5735.289 Electron/25.8.4 Safari/537.36", 15 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0", 16 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36", 17 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.4.14 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36", 18 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 11.6; rv:92.0) Gecko/20100101 Firefox/92.0", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", 20 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15", 21 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 22 | "Mozilla/5.0 
(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 PageSpeedPlus/1.0.0", 23 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.4.13 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36", 24 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 OPR/106.0.0.0", 25 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15", 26 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", 27 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", 28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15", 29 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36", 30 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0", 31 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", 32 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.4.16 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36", 33 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15", 34 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0", 35 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15", 36 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36", 37 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0", 38 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 39 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", 40 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15", 41 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15", 42 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Safari/605.1.15", 43 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67", 44 | "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0", 45 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0", 46 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36", 47 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0", 48 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", 49 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36", 50 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/122.0.0.0 Safari/537.36", 51 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 52 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36", 53 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", 54 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0", 55 | "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", 56 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15", 57 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15", 58 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", 59 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.4.13 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36", 60 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36", 61 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36", 62 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", 63 | "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0", 64 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/13.6.2 Chrome/114.0.5735.289 Electron/25.8.4 Safari/537.36", 65 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 OPR/105.0.0.0", 66 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", 67 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0", 68 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", 69 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0", 70 | "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0", 71 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0", 72 | } 73 | -------------------------------------------------------------------------------- /visit/redis/redis.go: -------------------------------------------------------------------------------- 1 | package redis 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/go-redis/redis" 7 | "github.com/superjcd/gocrawler/visit" 8 | ) 9 | 10 | type RedisVisit struct { 11 | VisitedKeyPrefix string 12 | RCli *redis.Client 13 | } 14 | 15 | var _ visit.Visit = (*RedisVisit)(nil) 16 | 17 | func NewRedisVisit(r_config redis.Options, prefixKey string) *RedisVisit { 18 | rc := &RedisVisit{VisitedKeyPrefix: prefixKey} 19 | rc.RCli = redis.NewClient(&r_config) 20 | return rc 21 | } 22 | 23 | // visitKey applies the configured prefix (default "gocrawler:") so that Set, Del and Get all address the same Redis key 24 | func (rc *RedisVisit) visitKey(key string) string { 25 | if rc.VisitedKeyPrefix == "" { 26 | return "gocrawler:" + key 27 | } 28 | return rc.VisitedKeyPrefix + key 29 | } 30 | 31 | func (rc *RedisVisit) SetVisitted(key string, ttl time.Duration) error { 32 | _, err := rc.RCli.Set(rc.visitKey(key), "", ttl).Result() 33 | return err 34 | } 35 | 36 | func (rc *RedisVisit) UnsetVisitted(key string) error { 37 | _, err := rc.RCli.Del(rc.visitKey(key)).Result() 38 | return err 39 | } 40 | 41 | func (rc *RedisVisit) IsVisited(key string) bool { 42 | _, err := rc.RCli.Get(rc.visitKey(key)).Result() 43 | return err 
== nil 44 | } 45 | -------------------------------------------------------------------------------- /visit/visit.go: -------------------------------------------------------------------------------- 1 | package visit 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type Visit interface { 8 | SetVisitted(key string, ttl time.Duration) error 9 | UnsetVisitted(key string) error 10 | IsVisited(key string) bool 11 | } 12 | -------------------------------------------------------------------------------- /worker/option.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "time" 7 | 8 | "github.com/superjcd/gocrawler/fetcher" 9 | "github.com/superjcd/gocrawler/parser" 10 | "github.com/superjcd/gocrawler/request" 11 | "github.com/superjcd/gocrawler/scheduler" 12 | "github.com/superjcd/gocrawler/store" 13 | "github.com/superjcd/gocrawler/visit" 14 | "golang.org/x/time/rate" 15 | ) 16 | 17 | type Options struct { 18 | Scheduler scheduler.Scheduler 19 | Limiter *rate.Limiter 20 | UseVisit bool 21 | Visiter visit.Visit 22 | VisiterTTL time.Duration 23 | Fetcher fetcher.Fetcher 24 | Parser parser.Parser 25 | Store store.Storage 26 | Duration time.Duration 27 | AddtionalHashKeys []string 28 | 29 | BeforeRequestHook BeforeRequestHook 30 | AfterRequestHook AfterRequestHook 31 | BeforeSaveHook BeforeSaveHook 32 | AfterSaveHook AfterSaveHook 33 | } 34 | 35 | // lifecycle hooks 36 | 37 | type BeforeRequestHook func(context.Context, *request.Request) (Signal, error) 38 | 39 | type AfterRequestHook func(context.Context, *http.Response) (Signal, error) 40 | 41 | type BeforeSaveHook func(context.Context, *parser.ParseResult) (Signal, error) 42 | 43 | type AfterSaveHook func(context.Context, *parser.ParseResult) (Signal, error) 44 | 45 | type Option func(opts *Options) 46 | 47 | func WithScheduler(s scheduler.Scheduler) Option { 48 | return func(opts *Options) { 49 | opts.Scheduler = s 50 | } 51 | } 52 | 53 | func WithStore(store store.Storage) Option { 54 | return func(opts *Options) { 55 | opts.Store = store 56 | } 57 | } 58 | 59 | func WithFetcher(fetcher fetcher.Fetcher) Option { 60 | return func(opts *Options) { 61 | opts.Fetcher = fetcher 62 | } 63 | } 64 | 65 | func WithLimiter(limiter *rate.Limiter) Option { 66 | return func(opts *Options) { 67 | opts.Limiter = limiter 68 | } 69 | } 70 | 71 | func WithVisiter(v visit.Visit, ttl time.Duration) Option { 72 | return func(opts *Options) { 73 | opts.Visiter = v 74 | opts.UseVisit = true 75 | opts.VisiterTTL = ttl 76 | 77 | } 78 | } 79 | 80 | func WithDuration(duration time.Duration) Option { 81 | return func(opts *Options) { 82 | opts.Duration = duration 83 | } 84 | } 85 | 86 | func WithParser(p parser.Parser) Option { 87 | return func(opts *Options) { 88 | opts.Parser = p 89 | } 90 | } 91 | 92 | func WithAddtionalHashKeys(keys []string) Option { 93 | return func(opts *Options) { 94 | opts.AddtionalHashKeys = keys 95 | } 96 | } 97 | 98 | func WithBeforeRequestHook(h BeforeRequestHook) Option { 99 | return func(opts *Options) { 100 | opts.BeforeRequestHook = h 101 | } 102 | } 103 | 104 | func WithAfterRequestHook(h AfterRequestHook) Option { 105 | return func(opts *Options) { 106 | opts.AfterRequestHook = h 107 | } 108 | } 109 | 110 | func WithBeforeSaveHook(h BeforeSaveHook) Option { 111 | return func(opts *Options) { 112 | opts.BeforeSaveHook = h 113 | } 114 | } 115 | 116 | func WithAfterSaveHook(h AfterSaveHook) Option { 117 | return func(opts *Options) { 118 | 
opts.AfterSaveHook = h 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /worker/signal_test.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestContinueSignalOperation(t *testing.T) { 8 | var sig Signal 9 | sig |= ContinueWithRetrySignal 10 | 11 | if sig&ContinueWithRetrySignal == 0 { 12 | t.Errorf("wrong signal operation") 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /worker/signals.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | type Signal int8 4 | 5 | const ( 6 | DummySignal = 1 << iota 7 | ContinueWithRetrySignal 8 | ContinueWithoutRetrySignal 9 | BreakWithPanicSignal 10 | BreakWithoutPanicSignal 11 | ) 12 | 13 | const ( 14 | KeepGoing = iota 15 | ContinueLoop 16 | BreakLoop 17 | ) 18 | -------------------------------------------------------------------------------- /worker/worker.go: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "net/http" 7 | "time" 8 | 9 | "github.com/superjcd/gocrawler/health" 10 | "github.com/superjcd/gocrawler/parser" 11 | "github.com/superjcd/gocrawler/request" 12 | "github.com/superjcd/gocrawler/scheduler" 13 | ) 14 | 15 | type Worker interface { 16 | Name() string 17 | Run() 18 | BeforeRequest(context.Context, *request.Request) (Signal, error) 19 | AfterRequest(context.Context, *http.Response) (Signal, error) 20 | BeforeSave(context.Context, *parser.ParseResult) (Signal, error) 21 | AfterSave(context.Context, *parser.ParseResult) (Signal, error) 22 | } 23 | 24 | type worker struct { 25 | health.HealthChecker 26 | name string 27 | Workers int 28 | MaxRetries int 29 | SaveRequestData bool 30 | MaxRunTime time.Duration 31 | Options 32 | } 33 | 34 | var _ Worker = (*worker)(nil) 35 | 36 | func NewWorker(name string, workers, retries int, saveRequestData bool, maxRunTime time.Duration, opts ...Option) *worker { 37 | Options := Options{} 38 | 39 | for _, opt := range opts { 40 | opt(&Options) 41 | } 42 | w := &worker{name: name, Workers: workers, MaxRetries: retries, SaveRequestData: saveRequestData, MaxRunTime: maxRunTime} 43 | w.Options = Options 44 | 45 | go w.Scheduler.Schedule() 46 | 47 | return w 48 | } 49 | 50 | func (w *worker) BeforeRequest(ctx context.Context, req *request.Request) (Signal, error) { 51 | var sig Signal 52 | if w.BeforeRequestHook != nil { 53 | return w.BeforeRequestHook(ctx, req) 54 | } 55 | sig |= DummySignal 56 | return sig, nil 57 | } 58 | 59 | func (w *worker) AfterRequest(ctx context.Context, resp *http.Response) (Signal, error) { 60 | var sig Signal 61 | if w.AfterRequestHook != nil { 62 | return w.AfterRequestHook(ctx, resp) 63 | } 64 | sig |= DummySignal 65 | return sig, nil 66 | } 67 | 68 | func (w *worker) BeforeSave(ctx context.Context, par *parser.ParseResult) (Signal, error) { 69 | var sig Signal 70 | if w.BeforeSaveHook != nil { 71 | return w.BeforeSaveHook(ctx, par) 72 | } 73 | sig |= DummySignal 74 | return sig, nil 75 | } 76 | 77 | func (w *worker) AfterSave(ctx context.Context, par *parser.ParseResult) (Signal, error) { 78 | var sig Signal 79 | 80 | if w.AfterSaveHook != nil { 81 | return w.AfterSaveHook(ctx, par) 82 | } 83 | sig |= DummySignal 84 | return sig, nil 85 | } 86 | 87 | func (w *worker) Run() { 88 | ctx, cancel := context.WithTimeout( 89 
| context.Background(), 90 | w.MaxRunTime, 91 | ) 92 | defer cancel() 93 | for i := 0; i < w.Workers; i++ { 94 | go singleRun(w) 95 | } 96 | 97 | <-ctx.Done() 98 | } 99 | 100 | func singleRun(w *worker) { 101 | var sig Signal 102 | var err error 103 | 104 | Loop: 105 | for { 106 | w.Limiter.Wait(context.TODO()) 107 | req := w.Scheduler.Pull() 108 | if req == nil { 109 | continue 110 | } 111 | 112 | var reqKey string 113 | if w.UseVisit { 114 | if w.AddtionalHashKeys == nil { 115 | reqKey = req.Hash() 116 | } else { 117 | reqKey = req.Hash(w.AddtionalHashKeys...) 118 | } 119 | 120 | if w.Visiter.IsVisited(reqKey) { 121 | continue Loop 122 | } 123 | } 124 | originReq := req 125 | 126 | ctx := context.Background() 127 | ctx = context.WithValue(ctx, request.RequestDataCtxKey{}, req.Data) 128 | 129 | sig, err = w.BeforeRequest(ctx, req) 130 | 131 | switch w.dealSignal(sig, err, req, originReq) { 132 | case ContinueLoop: 133 | continue Loop 134 | case BreakLoop: 135 | break Loop 136 | } 137 | 138 | resp, err := w.Fetcher.Fetch(ctx, req) 139 | if err != nil { 140 | log.Printf("fetch failed: %v", err) 141 | w.retry(req, originReq) 142 | continue 143 | } 144 | 145 | if w.AfterRequestHook != nil { 146 | sig, err = w.AfterRequestHook(ctx, resp) 147 | switch w.dealSignal(sig, err, req, originReq) { 148 | case ContinueLoop: 149 | continue Loop 150 | case BreakLoop: 151 | break Loop 152 | } 153 | } else { 154 | if resp.StatusCode != http.StatusOK { 155 | w.retry(req, originReq) 156 | continue 157 | } 158 | } 159 | 160 | // Parse 161 | parseResult, err := w.Parser.Parse(ctx, resp) 162 | if err != nil { 163 | log.Printf("parse failed for request: %s, error: %v", req.URL, err) 164 | w.retry(req, originReq) 165 | continue 166 | } 167 | 168 | // New Requests 169 | if parseResult.Requests != nil && len(parseResult.Requests) > 0 { 170 | for _, req := range parseResult.Requests { 171 | if !req.IsSecondary { 172 | w.Scheduler.Push(scheduler.TYP_PUSH_SCHEDULER, req) 173 | continue 174 | } 175 | if secondScheduler := w.Scheduler.SecondScheduler(); secondScheduler != nil { 176 | secondScheduler.Push(scheduler.TYP_PUSH_SCHEDULER, req) 177 | } 178 | } 179 | } 180 | 181 | // Save 182 | if parseResult.Items != nil && len(parseResult.Items) > 0 { 183 | if w.SaveRequestData { 184 | for _, p_item := range parseResult.Items { 185 | for dk, dv := range req.Data { 186 | p_item[dk] = dv 187 | } 188 | } 189 | } 190 | 191 | sig, err = w.BeforeSave(ctx, parseResult) 192 | switch w.dealSignal(sig, err, req, originReq) { 193 | case ContinueLoop: 194 | continue Loop 195 | case BreakLoop: 196 | break Loop 197 | } 198 | 199 | if err := w.Store.Save(ctx, parseResult.Items...); err != nil { 200 | log.Printf("item saved failed err: %v;items: ", err) 201 | continue 202 | } 203 | 204 | sig, err = w.AfterSave(ctx, parseResult) 205 | switch w.dealSignal(sig, err, req, originReq) { 206 | case ContinueLoop: 207 | continue Loop 208 | case BreakLoop: 209 | break Loop 210 | } 211 | } 212 | if w.UseVisit { 213 | w.Visiter.SetVisitted(reqKey, w.VisiterTTL) 214 | } 215 | 216 | } 217 | } 218 | 219 | func (w *worker) Name() string { 220 | return w.name 221 | } 222 | 223 | func (w *worker) retry(req, originReq *request.Request) { 224 | if req.Retry < w.MaxRetries { 225 | originReq.Retry += 1 226 | w.Scheduler.Push(scheduler.TYP_PUSH_SCHEDULER, originReq) 227 | } else { 228 | log.Printf("too many retries for request:%s, exceed max retries: %d", req.URL, w.MaxRetries) 229 | } 230 | } 231 | 232 | func (w *worker) dealSignal(sig Signal, err error, req, 
originReq *request.Request) int { 233 | if sig&DummySignal != 0 { 234 | return KeepGoing 235 | } 236 | if sig&ContinueWithRetrySignal != 0 { 237 | w.retry(req, originReq) 238 | return ContinueLoop 239 | } 240 | if sig&ContinueWithoutRetrySignal != 0 { 241 | return ContinueLoop 242 | } 243 | if sig&BreakWithPanicSignal != 0 { 244 | panic(err) 245 | } 246 | if sig&BreakWithoutPanicSignal != 0 { 247 | return BreakLoop 248 | } 249 | return KeepGoing 250 | } 251 | --------------------------------------------------------------------------------
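
The worker loop drives everything through the `Signal` values defined in `worker/signals.go`: whatever a lifecycle hook returns is handed to `dealSignal`, which decides whether to keep going, skip the current request (with or without a retry), or stop the worker. Below is a minimal, illustrative sketch of an `AfterRequestHook` that uses those flags; the package name, function name and status-code policy are assumptions for the example, not behaviour shipped with gocrawler.

```go
package hooks

import (
	"context"
	"net/http"

	"github.com/superjcd/gocrawler/worker"
)

// RetryOnThrottleSkipOnNotFound maps HTTP status codes onto the Signal flags
// consumed by the worker's dealSignal: 429 responses are re-queued through the
// retry path, 404 responses are skipped without a retry, and everything else
// continues on to the parse and save steps.
func RetryOnThrottleSkipOnNotFound(ctx context.Context, resp *http.Response) (worker.Signal, error) {
	switch resp.StatusCode {
	case http.StatusTooManyRequests:
		return worker.ContinueWithRetrySignal, nil
	case http.StatusNotFound:
		return worker.ContinueWithoutRetrySignal, nil
	default:
		return worker.DummySignal, nil
	}
}
```

Passing such a function via `worker.WithAfterRequestHook` when constructing the worker replaces the default behaviour in `singleRun`, which otherwise retries any response whose status code is not 200.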