├── .gitignore
├── html
│   ├── parser.go
│   ├── html_test.go
│   └── element.go
├── json
│   ├── parser.go
│   ├── json.go
│   └── json_test.go
├── methods.go
├── errors.go
├── go.mod
├── proxy
│   ├── socks5.go
│   ├── http.go
│   └── errors.go
├── proxy.go
├── cache.go
├── example
│   └── multipart
│       └── main.go
├── context
│   ├── context_test.go
│   ├── read.go
│   ├── api.go
│   └── write.go
├── async_test.go
├── pool.go
├── status.go
├── options.go
├── response.go
├── go.sum
├── README.md
├── README.zh_CN.md
├── request.go
├── LICENSE
├── craw_test.go
└── craw.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 |
--------------------------------------------------------------------------------
/html/parser.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: parser.go
5 | * @Created: 2021-07-27 20:35:31
6 | * @Modified: 2021-10-12 09:44:32
7 | */
8 |
9 | package html
10 |
11 | import (
12 | "bytes"
13 |
14 | "github.com/PuerkitoBio/goquery"
15 | )
16 |
17 | // ParseHTML parses an HTML document from a byte slice
18 | func ParseHTML(body []byte) (*goquery.Document, error) {
19 | reader := bytes.NewReader(body)
20 | doc, err := goquery.NewDocumentFromReader(reader)
21 | if err != nil {
22 | return nil, err
23 | }
24 | return doc, nil
25 | }
26 |
--------------------------------------------------------------------------------
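
A minimal usage sketch for `ParseHTML`: parse a fetched body once, then query it with ordinary goquery selectors. The input HTML is illustrative, and the import path is assumed from go.mod.

```go
package main

import (
	"fmt"

	"github.com/PuerkitoBio/goquery"
	"github.com/go-predator/predator/html"
)

func main() {
	body := []byte(`<ul><li><a href="/a">A</a></li><li><a href="/b">B</a></li></ul>`)

	doc, err := html.ParseHTML(body)
	if err != nil {
		panic(err)
	}

	// Collect the href attribute of every link in the document.
	doc.Find("a").Each(func(_ int, s *goquery.Selection) {
		if href, ok := s.Attr("href"); ok {
			fmt.Println(href)
		}
	})
}
```
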
/json/parser.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: parser.go
5 | * @Created: 2021-07-27 20:41:02
6 | * @Modified: 2022-05-11 09:34:28
7 | */
8 |
9 | package json
10 |
11 | import "github.com/tidwall/gjson"
12 |
13 | type JSONResult = gjson.Result
14 |
15 | // ParseBytesToJSON converts `[]byte` variable to JSONResult
16 | func ParseBytesToJSON(body []byte) JSONResult {
17 | return gjson.ParseBytes(body)
18 | }
19 |
20 | // ParseJSON converts a `string` variable to JSONResult
21 | func ParseJSON(body string) JSONResult {
22 | return gjson.Parse(body)
23 | }
24 |
--------------------------------------------------------------------------------
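
A short sketch of the gjson-backed helpers in use: extract individual fields by path instead of unmarshalling the whole document. The payload and paths are illustrative; the import path is assumed from go.mod.

```go
package main

import (
	"fmt"

	"github.com/go-predator/predator/json"
)

func main() {
	body := []byte(`{"user":{"name":"tom","age":10},"tags":["a","b"]}`)

	res := json.ParseBytesToJSON(body)

	// Path queries read single values without full deserialization.
	fmt.Println(res.Get("user.name").String()) // tom
	fmt.Println(res.Get("user.age").Int())     // 10
	fmt.Println(res.Get("tags.#").Int())       // 2, the array length
}
```
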
/methods.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: methods.go
5 | * @Created: 2022-03-01 12:53:41
6 | * @Modified: 2022-03-01 12:53:50
7 | */
8 |
9 | package predator
10 |
11 | // HTTP methods were copied from net/http.
12 | const (
13 | MethodGet = "GET" // RFC 7231, 4.3.1
14 | MethodHead = "HEAD" // RFC 7231, 4.3.2
15 | MethodPost = "POST" // RFC 7231, 4.3.3
16 | MethodPut = "PUT" // RFC 7231, 4.3.4
17 | MethodPatch = "PATCH" // RFC 5789
18 | MethodDelete = "DELETE" // RFC 7231, 4.3.5
19 | MethodConnect = "CONNECT" // RFC 7231, 4.3.6
20 | MethodOptions = "OPTIONS" // RFC 7231, 4.3.7
21 | MethodTrace = "TRACE" // RFC 7231, 4.3.8
22 | )
23 |
--------------------------------------------------------------------------------
/json/json.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: json.go
5 | * @Created: 2021-07-27 20:41:02
6 | * @Modified: 2022-04-18 13:30:27
7 | */
8 |
9 | package json
10 |
11 | import jsoniter "github.com/json-iterator/go"
12 |
13 | var json = jsoniter.ConfigCompatibleWithStandardLibrary
14 |
15 | func Marshal(v any) ([]byte, error) {
16 | return json.Marshal(v)
17 | }
18 |
19 | // Unmarshal: for crawlers, full deserialization is a waste of resources.
20 | // Prefer gjson for parsing JSON, and fall back to this method only when
21 | // gjson cannot handle the document.
22 | //
23 | // A higher-performance third-party library is used for deserialization here.
24 | func Unmarshal(src []byte, v any) error {
25 | return json.Unmarshal(src, v)
26 | }
27 |
28 | func UnmarshalFromString(src string, v any) error {
29 | return json.UnmarshalFromString(src, v)
30 | }
31 |
--------------------------------------------------------------------------------
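
Following the guidance in the `Unmarshal` comment above, a sketch of the intended split: gjson for point lookups, full deserialization only when a whole struct is genuinely needed. The `user` type and field names are illustrative, and the import path is assumed from go.mod.

```go
// user is an illustrative target struct.
type user struct {
	Name string `json:"name"`
	Age  int    `json:"age"`
}

func extract(body []byte) (*user, error) {
	// Cheap: a single field via gjson, no reflection involved.
	if json.ParseBytesToJSON(body).Get("name").String() == "" {
		return nil, fmt.Errorf("missing name")
	}

	// Expensive: deserialize fully only because the whole struct is needed.
	var u user
	if err := json.Unmarshal(body, &u); err != nil {
		return nil, err
	}
	return &u, nil
}
```
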
/errors.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: errors.go
5 | * @Created: 2022-02-17 15:30:54
6 | * @Modified: 2022-11-29 16:14:14
7 | */
8 |
9 | package predator
10 |
11 | import "errors"
12 |
13 | var (
14 | ErrRequestFailed = errors.New("request failed")
15 | ErrTimeout = errors.New("timeout, and it is recommended to try a new proxy if you are using a proxy pool")
16 | ErrInvalidCacheTypeCode = errors.New("invalid cache type code")
17 | ErrNotAllowedCacheFieldType = errors.New("only query parameters are allowed as cached fields in `GET` requests")
18 | ErrNoCache = errors.New("no cache configured")
19 | ErrInvalidResponseStatus = errors.New("if the http status code is `302`, there must be a valid `Location` field in the response header")
20 | )
21 |
--------------------------------------------------------------------------------
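
These are sentinel errors, so callers can branch on them with `errors.Is` when the crawler returns them (possibly wrapped). A hedged sketch; the `c.Get` call and the handling strategy are assumptions, not part of this file:

```go
err := c.Get("https://example.com") // c is a *predator.Crawler (assumed)
switch {
case err == nil:
	// success
case errors.Is(err, predator.ErrTimeout):
	// as the message suggests, switch to a fresh proxy before retrying
case errors.Is(err, predator.ErrNoCache):
	// caching APIs were used without configuring a cache; fix the options
default:
	// other failures
}
```
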
/json/json_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: json_test.go
5 | * @Created: 2021-07-29 18:53:57
6 | * @Modified: 2022-04-18 13:30:38
7 | */
8 |
9 | package json
10 |
11 | import (
12 | "testing"
13 |
14 | . "github.com/smartystreets/goconvey/convey"
15 | )
16 |
17 | func TestJSON(t *testing.T) {
18 | 	Convey("test JSON", t, func() {
19 | type S struct {
20 | Name string `json:"name"`
21 | Age int `json:"age"`
22 | M map[string]any `json:"map"`
23 | }
24 |
25 | m := map[string]any{
26 | "key1": "value1",
27 | "key2": 2,
28 | "key3": 3.1,
29 | "key4": map[string]int{
30 | "one": 1,
31 | "two": 2,
32 | },
33 | "key5": S{"tom", 10, map[string]any{"a": 1.222}},
34 | }
35 |
36 | b, e := Marshal(m)
37 | So(e, ShouldBeNil)
38 | t.Log(string(b))
39 | })
40 | }
41 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/go-predator/predator
2 |
3 | go 1.19
4 |
5 | require (
6 | github.com/PuerkitoBio/goquery v1.8.0
7 | github.com/go-predator/log v0.0.0-20220523074050-01ad78a75b3f
8 | github.com/go-predator/tools v0.0.0-20220524022058-ce749e9bf77b
9 | github.com/json-iterator/go v1.1.12
10 | github.com/smartystreets/goconvey v1.7.2
11 | github.com/tidwall/gjson v1.14.3
12 | github.com/valyala/bytebufferpool v1.0.0
13 | github.com/valyala/fasthttp v1.40.0
14 | golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458
15 | )
16 |
17 | require (
18 | github.com/andybalholm/brotli v1.0.4 // indirect
19 | github.com/andybalholm/cascadia v1.3.1 // indirect
20 | github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect
21 | github.com/jtolds/gls v4.20.0+incompatible // indirect
22 | github.com/klauspost/compress v1.15.4 // indirect
23 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 // indirect
24 | github.com/modern-go/reflect2 v1.0.2 // indirect
25 | github.com/rs/zerolog v1.26.1 // indirect
26 | github.com/smartystreets/assertions v1.2.0 // indirect
27 | github.com/tidwall/match v1.1.1 // indirect
28 | github.com/tidwall/pretty v1.2.0 // indirect
29 | golang.org/x/text v0.3.7 // indirect
30 | )
31 |
--------------------------------------------------------------------------------
/proxy/socks5.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: socks5.go
5 | * @Created: 2021-07-23 09:22:36
6 | * @Modified: 2022-03-04 10:45:38
7 | */
8 |
9 | package proxy
10 |
11 | import (
12 | "net"
13 | "net/url"
14 |
15 | "github.com/valyala/fasthttp"
16 | netProxy "golang.org/x/net/proxy"
17 | )
18 |
19 | func Socks5ProxyDialer(proxyAddr string) fasthttp.DialFunc {
20 | var (
21 | u *url.URL
22 | err error
23 | dialer netProxy.Dialer
24 | )
25 |
26 | if proxyAddr == "" {
27 | err = ProxyErr{
28 | Code: ErrInvalidProxyCode,
29 | Args: map[string]string{"addr": proxyAddr},
30 | Msg: "ip and port cannot be empty",
31 | }
32 | } else {
33 | if u, err = url.Parse(proxyAddr); err == nil {
34 | dialer, err = netProxy.FromURL(u, netProxy.Direct)
35 | }
36 | }
37 |
38 | // It would be nice if we could return the error here. But we can't
39 | // change our API so just keep returning it in the returned Dial function.
40 | // Besides the implementation of proxy.SOCKS5() at the time of writing this
41 | // will always return nil as error.
42 |
43 | return func(addr string) (net.Conn, error) {
44 | if err != nil {
45 | return nil, err
46 | }
47 | return dialer.Dial("tcp", addr)
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
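
Because `Socks5ProxyDialer` returns a `fasthttp.DialFunc`, it plugs straight into a `fasthttp.Client`. A minimal sketch with a placeholder proxy address:

```go
package main

import (
	"fmt"

	"github.com/go-predator/predator/proxy"
	"github.com/valyala/fasthttp"
)

func main() {
	client := &fasthttp.Client{
		// Route every connection through a SOCKS5 proxy (placeholder address).
		Dial: proxy.Socks5ProxyDialer("socks5://127.0.0.1:1080"),
	}

	status, body, err := client.Get(nil, "https://example.com")
	if err != nil {
		panic(err)
	}
	fmt.Println(status, len(body))
}
```
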
/proxy.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: proxy.go
5 | * @Created: 2021-07-27 12:15:35
6 | * @Modified: 2022-05-24 09:22:46
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | 	"strings"
13 | 	"time"
14 | 	"github.com/go-predator/log"
15 | 	"github.com/go-predator/predator/proxy"
16 | 	"github.com/valyala/fasthttp"
17 | )
18 |
19 | // AcquireProxies can request a specified number of proxy IPs from the APIs of proxy vendors
20 | type AcquireProxies func(n int) []string
21 |
22 | func (c *Crawler) ProxyDialerWithTimeout(proxyAddr string, timeout time.Duration) fasthttp.DialFunc {
23 | c.lock.Lock()
24 | c.proxyInUse = proxyAddr
25 | c.lock.Unlock()
26 |
27 | 	if strings.HasPrefix(proxyAddr, "http://") || strings.HasPrefix(proxyAddr, "https://") {
28 | 		return proxy.HttpProxyDialerWithTimeout(proxyAddr, timeout)
29 | 	} else if strings.HasPrefix(proxyAddr, "socks5://") {
30 | 		return proxy.Socks5ProxyDialer(proxyAddr)
31 | 	} else {
32 | err := proxy.ProxyErr{
33 | Code: proxy.ErrUnknownProtocolCode,
34 | Args: map[string]string{
35 | "proxy_addr": proxyAddr,
36 | },
37 | Msg: "only support http and socks5 protocol, but the incoming proxy address uses an unknown protocol",
38 | }
39 | if c.log != nil {
40 | c.Fatal(err, log.Arg{Key: "proxy", Value: proxyAddr})
41 | } else {
42 | panic(err)
43 | }
44 | return nil
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/html/html_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: html_test.go
5 | * @Created: 2021-10-10 14:59:49
6 | * @Modified: 2021-10-12 09:44:28
7 | */
8 |
9 | package html
10 |
11 | import (
12 | "testing"
13 |
14 | . "github.com/smartystreets/goconvey/convey"
15 | )
16 |
17 | var body = []byte(`
18 |
31 | `)
32 |
33 | func TestGetParent(t *testing.T) {
34 | Convey("test to find the parent element", t, func() {
35 | doc, err := ParseHTML(body)
36 | So(err, ShouldBeNil)
37 |
38 | imgSelection := doc.Find("#showImg02")
39 | img := NewHTMLElementFromSelectionNode(imgSelection, imgSelection.Nodes[0], 0)
40 | So(img.Name, ShouldEqual, "img")
41 | So(img.Attr("id"), ShouldEqual, "showImg02")
42 |
43 | Convey("find the immediate parent element", func() {
44 | parent := img.Parent()
45 | So(parent.Name, ShouldEqual, "li")
46 | So(parent.Attr("id"), ShouldEqual, "showImg01")
47 | })
48 |
49 | Convey("find all parent elements", func() {
50 | parents := img.Parents()
51 | So(len(parents), ShouldEqual, 7)
52 | So(parents[len(parents)-1].Name, ShouldEqual, "html")
53 | })
54 | })
55 | }
56 |
--------------------------------------------------------------------------------
/cache.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: cache.go
5 | * @Created: 2021-11-24 20:39:11
6 | * @Modified: 2022-10-30 20:01:00
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | "fmt"
13 | "net/url"
14 | )
15 |
16 | type Cache interface {
17 | 	// Compressed sets whether to compress cached data. Compression shrinks the stored
18 | 	// data but takes time: worthwhile for long payloads, where compressing costs far
19 | 	// less than querying, while for short payloads the overall cost is similar either
20 | 	// way. Benchmark your own workload to decide.
21 | 	Compressed(yes bool)
22 | 	// Init migrates the database/tables and performs other database-related preparation
23 | 	Init() error
24 | 	// IsCached reports whether the request has been cached; if so, it returns the cached response
25 | 	IsCached(key string) ([]byte, bool)
26 | 	// Cache saves the response of a request that has not been cached yet
27 | 	Cache(key string, val []byte) error
28 | 	// Clear removes all cached entries
29 | 	Clear() error
30 | }
31 |
32 | type CacheModel struct {
33 | Key string `gorm:"primaryKey"`
34 | Value []byte
35 | }
36 |
37 | func (CacheModel) TableName() string {
38 | return "predator-cache"
39 | }
40 |
41 | type cacheFieldType uint8
42 |
43 | const (
44 | // A key or field from URL query parameters
45 | queryParam cacheFieldType = iota
46 | // A key or field from request body parameters
47 | requestBodyParam
48 | )
49 |
50 | type CacheField struct {
51 | code cacheFieldType
52 | Field string
53 | }
54 |
55 | func (cf CacheField) String() string {
56 | return fmt.Sprintf("%d-%s", cf.code, cf.Field)
57 | }
58 |
59 | func addQueryParamCacheField(params url.Values, field CacheField) (string, string, error) {
60 | if val := params.Get(field.Field); val != "" {
61 | return field.String(), val, nil
62 | } else {
63 | 		// if cachedFields is set but the url query parameters lack the field, return an error
64 | return "", "", fmt.Errorf("there is no such field [%s] in the query parameters: %v", field.Field, params.Encode())
65 | }
66 | }
67 |
68 | func NewQueryParamField(field string) CacheField {
69 | return CacheField{queryParam, field}
70 | }
71 |
72 | func NewRequestBodyParamField(field string) CacheField {
73 | return CacheField{requestBodyParam, field}
74 | }
75 |
--------------------------------------------------------------------------------
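
For reference, a minimal in-memory implementation that satisfies the `Cache` interface above; a sketch for tests and examples, not part of the library. `Compressed` is recorded but not acted on here, and `sync` must be imported.

```go
// memoryCache is an illustrative Cache backed by a plain map.
type memoryCache struct {
	mu         sync.RWMutex
	data       map[string][]byte
	compressed bool
}

func (m *memoryCache) Compressed(yes bool) { m.compressed = yes }

// Init prepares the store; a database-backed cache would migrate tables here.
func (m *memoryCache) Init() error {
	m.data = make(map[string][]byte)
	return nil
}

func (m *memoryCache) IsCached(key string) ([]byte, bool) {
	m.mu.RLock()
	defer m.mu.RUnlock()
	val, ok := m.data[key]
	return val, ok
}

func (m *memoryCache) Cache(key string, val []byte) error {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.data[key] = val
	return nil
}

func (m *memoryCache) Clear() error {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.data = make(map[string][]byte)
	return nil
}
```
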
/proxy/http.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: http.go
5 | * @Created: 2021-07-23 09:22:36
6 | * @Modified: 2021-11-08 23:14:24
7 | */
8 |
9 | package proxy
10 |
11 | import (
12 | "bufio"
13 | "encoding/base64"
14 | "net"
15 | "strconv"
16 | "strings"
17 | "time"
18 |
19 | "github.com/valyala/fasthttp"
20 | )
21 |
22 | func HttpProxyDialerWithTimeout(proxyAddr string, timeout time.Duration) fasthttp.DialFunc {
23 | 	var auth string
24 | 	pAddr := strings.TrimPrefix(strings.TrimPrefix(proxyAddr, "https://"), "http://")
25 | 	// Split on the last "@" so passwords containing "@" are handled correctly.
26 | 	if i := strings.LastIndex(pAddr, "@"); i >= 0 {
27 | 		auth = base64.StdEncoding.EncodeToString([]byte(pAddr[:i]))
28 | 		pAddr = pAddr[i+1:]
29 | 	}
30 |
31 | return func(addr string) (net.Conn, error) {
32 | var conn net.Conn
33 | var err error
34 | if timeout == 0 {
35 | conn, err = fasthttp.Dial(pAddr)
36 | } else {
37 | conn, err = fasthttp.DialTimeout(pAddr, timeout)
38 | }
39 | if err != nil {
40 | return nil, ProxyErr{
41 | Code: ErrUnableToConnectCode,
42 | Args: map[string]string{
43 | "proxy": pAddr,
44 | "error": err.Error(),
45 | },
46 | Msg: "cannot connect to proxy",
47 | }
48 | }
49 |
50 | req := "CONNECT " + addr + " HTTP/1.1\r\n"
51 | if auth != "" {
52 | req += "Proxy-Authorization: Basic " + auth + "\r\n"
53 | }
54 | req += "\r\n"
55 |
56 | if _, err := conn.Write([]byte(req)); err != nil {
57 | return nil, err
58 | }
59 |
60 | res := fasthttp.AcquireResponse()
61 | defer fasthttp.ReleaseResponse(res)
62 |
63 | res.SkipBody = true
64 |
65 | if err := res.Read(bufio.NewReader(conn)); err != nil {
66 | conn.Close()
67 | return nil, err
68 | }
69 | if res.Header.StatusCode() != 200 {
70 | conn.Close()
71 | return nil, ProxyErr{
72 | Code: ErrUnableToConnectCode,
73 | Args: map[string]string{
74 | "proxy": pAddr,
75 | "status_code": strconv.Itoa(res.Header.StatusCode()),
76 | },
77 | Msg: "cannot connect to proxy",
78 | }
79 | }
80 | return conn, nil
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
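
Like the SOCKS5 dialer, the returned function slots into `fasthttp.Client.Dial`. A sketch with placeholder credentials, exercising the authenticated branch above:

```go
// user:pass is base64-encoded into a Proxy-Authorization header by the
// dialer; the address is a placeholder.
dial := proxy.HttpProxyDialerWithTimeout("http://user:pass@127.0.0.1:8080", 5*time.Second)

client := &fasthttp.Client{Dial: dial}
status, body, err := client.Get(nil, "https://example.com")
```
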
/example/multipart/main.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: main.go
5 | * @Created: 2021-07-31 11:50:11
6 | * @Modified: 2022-05-24 09:23:31
7 | */
8 |
9 | package main
10 |
11 | import (
12 | "fmt"
13 | "math/rand"
14 | "strings"
15 |
16 | "github.com/go-predator/log"
17 | "github.com/go-predator/predator"
18 | "github.com/go-predator/predator/context"
19 | "github.com/tidwall/gjson"
20 | )
21 |
22 | // randomBoundary is a custom boundary generator
23 | func randomBoundary() string {
24 | var s strings.Builder
25 | count := 29
26 | for i := 0; i < count; i++ {
27 | if i == 0 {
28 | s.WriteString(fmt.Sprint(rand.Intn(9) + 1))
29 | } else {
30 | s.WriteString(fmt.Sprint(rand.Intn(10)))
31 | }
32 | }
33 | return s.String()
34 | }
35 |
36 | func main() {
37 | c := predator.NewCrawler(
38 | 		// use cookies
39 | 		predator.WithCookies(map[string]string{
40 | 			"PHPSESSID": "7ijqglcno1cljiqs76t2vo5oh2",
41 | 		}),
42 | 		// enable logging
43 | 		predator.WithLogger(log.NewLogger(log.DEBUG, log.ToConsole())),
44 | 		predator.WithCache(nil, false, nil),
45 | )
46 |
47 | 	// create a multipart/form-data form
48 | 	form := predator.NewMultipartForm(
49 | 		// the dashes preceding the boundary
50 | 		"-------------------",
51 | 		// pass in the custom boundary generator
52 | 		randomBoundary,
53 | 	)
54 |
55 | var err error
56 |
57 | 	// add the form fields
58 | form.AppendString("type", "file")
59 | form.AppendString("action", "upload")
60 | form.AppendString("timestamp", "1627871450610")
61 | form.AppendString("auth_token", "f43cdc8a537eff5169dfddb946c2365d1f897b0c")
62 | form.AppendString("nsfw", "0")
63 | err = form.AppendFile("source", "/Users/thepoy/Pictures/Nginx.png")
64 | if err != nil {
65 | panic(err)
66 | }
67 |
68 | c.AfterResponse(func(r *predator.Response) {
69 | 		// read from the context
70 | val := r.Ctx.Get("foo")
71 | fmt.Println("value from context:", val)
72 |
73 | status := gjson.ParseBytes(r.Body).Get("status_code").Int()
74 | fmt.Println("status_code:", status)
75 | })
76 |
77 | 	// create a context and put some key-value pairs into it
78 | ctx, err := context.NewContext()
79 | if err != nil {
80 | panic(err)
81 | }
82 | ctx.Put("foo", "bar")
83 |
84 | 	// send the multipart/form-data POST request
85 | err = c.PostMultipart("https://imgtu.com/json", form, ctx)
86 | if err != nil {
87 | panic(err)
88 | }
89 |
90 | 	// clear the cache
91 | c.ClearCache()
92 | }
93 |
--------------------------------------------------------------------------------
/context/context_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: context_test.go
5 | * @Created: 2021-07-24 12:18:30
6 | * @Modified: 2022-04-18 13:29:16
7 | */
8 |
9 | package context
10 |
11 | import (
12 | "bytes"
13 | "testing"
14 |
15 | . "github.com/smartystreets/goconvey/convey"
16 | )
17 |
18 | func putSomeCtx(ctx Context) {
19 | ctx.Put("four", 4)
20 | ctx.Put("five", "5")
21 | ctx.Put("six", '6')
22 | ctx.Put("seven", []int{7})
23 | ctx.Put("eight", [1]int{8})
24 | }
25 |
26 | func ctxTest(ctx Context) {
27 | Convey("添加整数", func() {
28 | ctx.Put("one", 1)
29 | Convey("获取整数", func() {
30 | val := ctx.GetAny("one").(int)
31 | So(val, ShouldEqual, 1)
32 | Convey("删除添加的整数", func() {
33 | ctx.Delete("one")
34 | val := ctx.GetAny("one")
35 | So(val, ShouldBeNil)
36 | })
37 | })
38 | })
39 | Convey("添加字符串", func() {
40 | ctx.Put("two", "2")
41 | Convey("获取字符串", func() {
42 | val := ctx.Get("two")
43 | So(val, ShouldEqual, "2")
44 | Convey("获取并删除添加的字符串", func() {
45 | deleted := ctx.GetAndDelete("two")
46 | val := ctx.Get("two")
47 | So(deleted.(string), ShouldEqual, "2")
48 | So(val, ShouldEqual, "")
49 | })
50 | })
51 | })
52 | Convey("添加字节切片", func() {
53 | ctx.Put("three", []byte("3"))
54 | Convey("获取字节切片", func() {
55 | val := ctx.GetAny("three").([]byte)
56 | So(bytes.Equal(val, []byte("3")), ShouldBeTrue)
57 | })
58 | Convey("遍历上下文", func() {
59 | ctx.Put("four", 4)
60 | ctx.Put("five", "5")
61 | ctx.Put("six", '6')
62 | ctx.Put("seven", []int{7})
63 | ctx.Put("eight", [1]int{8})
64 | val := ctx.ForEach(func(key string, val any) any {
65 | return val
66 | })
67 | So(len(val), ShouldEqual, 6)
68 | })
69 | })
70 | Convey("上下文长度", func() {
71 | putSomeCtx(ctx)
72 | So(ctx.Length(), ShouldEqual, 5)
73 | })
74 | Convey("清空上下文", func() {
75 | putSomeCtx(ctx)
76 | ctx.Clear()
77 | So(ctx.Length(), ShouldEqual, 0)
78 | })
79 | }
80 |
81 | func TestContext(t *testing.T) {
82 | Convey("上下文测试", t, func() {
83 | Convey("读上下文测试", func() {
84 | ctx, _ := NewContext(ReadOp)
85 | ctxTest(ctx)
86 | })
87 | Convey("写上下文测试", func() {
88 | ctx, _ := NewContext(WriteOp)
89 | ctxTest(ctx)
90 | })
91 | })
92 | }
93 |
--------------------------------------------------------------------------------
/proxy/errors.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: errors.go
5 | * @Created: 2021-11-05 12:11:41
6 | * @Modified: 2022-11-29 16:14:26
7 | */
8 |
9 | package proxy
10 |
11 | import (
12 | "fmt"
13 | "regexp"
14 | "sort"
15 | "strings"
16 | )
17 |
18 | type ErrCode uint8
19 |
20 | const (
21 | ErrWrongFormatCode ErrCode = iota
22 | ErrUnknownProtocolCode
23 | ErrProxyExpiredCode
24 | ErrOnlyOneProxyIPCode
25 | 	ErrUnknownProxyIPCode
26 | ErrIPOrPortIsNullCode
27 | ErrEmptyProxyPoolCode
28 | ErrUnableToConnectCode
29 | ErrInvalidProxyCode
30 | )
31 |
32 | func (ec ErrCode) String() string {
33 | return fmt.Sprintf("proxy error [ %d ]", ec)
34 | }
35 |
36 | type ProxyErr struct {
37 | Code ErrCode
38 | Args map[string]string
39 | Msg string
40 | }
41 |
42 | func (pe ProxyErr) Error() string {
43 | var s strings.Builder
44 | s.WriteString(pe.Code.String())
45 |
46 | if len(pe.Msg) > 0 || len(pe.Args) > 0 {
47 | s.WriteByte(' ')
48 | }
49 |
50 | if pe.Msg != "" {
51 | s.WriteString("err=")
52 | s.WriteString(pe.Msg)
53 | s.WriteByte(',')
54 | s.WriteByte(' ')
55 | }
56 |
57 | keys := make([]string, 0, len(pe.Args))
58 | for k := range pe.Args {
59 | keys = append(keys, k)
60 | }
61 | sort.Strings(keys)
62 |
63 | for i, k := range keys {
64 | if i > 0 {
65 | s.WriteByte(',')
66 | s.WriteByte(' ')
67 | }
68 | s.WriteString(k)
69 | s.WriteByte('=')
70 | s.WriteString(pe.Args[k])
71 | }
72 |
73 | return s.String()
74 | }
75 |
76 | func IsProxyError(err error) (string, bool) {
77 | if e, ok := err.(ProxyErr); ok {
78 | switch e.Code {
79 | case ErrProxyExpiredCode, ErrUnableToConnectCode, ErrInvalidProxyCode:
80 | return e.Args["proxy"], true
81 | }
82 | return "", false
83 | }
84 |
85 | if len(err.Error()) < 26 {
86 | return "", false
87 | }
88 |
89 | // http proxy expired or invalid
90 | if err.Error()[:26] == "cannot connect to proxy ip" {
91 | re := regexp.MustCompile(`cannot connect to proxy ip \[ (.+?) \] -> .+?`)
92 | return re.FindAllStringSubmatch(err.Error(), 1)[0][1], true
93 | }
94 |
95 | // socks5 proxy expired or invalid
96 | if err.Error()[:17] == "socks connect tcp" {
97 | re := regexp.MustCompile("socks connect tcp (.+?)->.+?")
98 | return re.FindAllStringSubmatch(err.Error(), 1)[0][1], true
99 | }
100 |
101 | return "", false
102 | }
103 |
--------------------------------------------------------------------------------
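
`IsProxyError` exists for proxy-pool maintenance: when it reports a dead proxy, the caller can evict that address. A hedged sketch; the pool representation is illustrative:

```go
// evictIfDead removes a proxy reported dead by IsProxyError from the pool.
func evictIfDead(pool []string, err error) []string {
	addr, dead := proxy.IsProxyError(err)
	if !dead {
		return pool
	}
	kept := pool[:0]
	for _, p := range pool {
		// addr comes back without the scheme prefix, so match by substring.
		if !strings.Contains(p, addr) {
			kept = append(kept, p)
		}
	}
	return kept
}
```
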
/context/read.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: read.go
5 | * @Created: 2021-07-24 08:56:04
6 | * @Modified: 2022-04-18 13:29:10
7 | */
8 |
9 | package context
10 |
11 | import (
12 | "bytes"
13 | "fmt"
14 | "strings"
15 | "sync"
16 | )
17 |
18 | type rcontext struct {
19 | sync.Map
20 | }
21 |
22 | func (r *rcontext) GetAny(key string) any {
23 | val, ok := r.Load(key)
24 | if ok {
25 | return val
26 | }
27 | return nil
28 | }
29 |
30 | func (r *rcontext) Get(key string) string {
31 | val := r.GetAny(key)
32 | if val == nil {
33 | return ""
34 | }
35 | return val.(string)
36 | }
37 |
38 | func (r *rcontext) Put(key string, val any) {
39 | r.Store(key, val)
40 | }
41 |
42 | func (r *rcontext) GetAndDelete(key string) any {
43 | val, ok := r.LoadAndDelete(key)
44 | if ok {
45 | return val
46 | }
47 | return nil
48 | }
49 |
50 | func (r *rcontext) Delete(key string) {
51 | r.Map.Delete(key)
52 | }
53 |
54 | func (r *rcontext) ForEach(f func(key string, val any) any) []any {
55 | result := make([]any, 0, r.Length())
56 | r.Range(func(key, value any) bool {
57 | result = append(result, f(key.(string), value))
58 | return true
59 | })
60 | return result
61 | }
62 |
63 | func (r *rcontext) Clear() {
64 | r.Range(func(key, value any) bool {
65 | r.Map.Delete(key)
66 | return true
67 | })
68 | }
69 |
70 | func (r *rcontext) Length() int {
71 | l := 0
72 | r.Range(func(key, value any) bool {
73 | l++
74 | return true
75 | })
76 | return l
77 | }
78 |
79 | func (r *rcontext) Bytes() []byte {
80 | var b bytes.Buffer
81 | b.WriteByte('{')
82 | i := 0
83 | r.Range(func(key, value any) bool {
84 | if i > 0 {
85 | b.WriteString(`, `)
86 | }
87 | b.WriteByte('"')
88 | b.WriteString(key.(string))
89 | b.WriteString(`": "`)
90 | b.WriteString(fmt.Sprint(value))
91 | b.WriteByte('"')
92 | i++
93 | return true
94 | })
95 | b.WriteByte('}')
96 | return b.Bytes()
97 | }
98 |
99 | func (r *rcontext) String() string {
100 | var s strings.Builder
101 | s.WriteByte('{')
102 | i := 0
103 | r.Range(func(key, value any) bool {
104 | if i > 0 {
105 | s.WriteString(`, `)
106 | }
107 | s.WriteByte('"')
108 | s.WriteString(key.(string))
109 | s.WriteString(`": "`)
110 | s.WriteString(fmt.Sprint(value))
111 | s.WriteByte('"')
112 | i++
113 | return true
114 | })
115 | s.WriteByte('}')
116 | 	return s.String()
117 | }
118 |
--------------------------------------------------------------------------------
/context/api.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: api.go
5 | * @Created: 2021-07-24 08:55:30
6 | * @Modified: 2022-04-18 13:29:17
7 | */
8 |
9 | package context
10 |
11 | import (
12 | "fmt"
13 | "sync"
14 | )
15 |
16 | type Context interface {
17 | 	// Get retrieves a string from the context by key
18 | 	Get(key string) string
19 | 	// GetAny retrieves a value of any type from the context by key
20 | 	GetAny(key string) any
21 | 	// Put stores a key: value pair in the context
22 | 	Put(key string, val any)
23 | 	// GetAndDelete retrieves a key's value and deletes the key
24 | 	GetAndDelete(key string) any
25 | 	// Delete removes the given key from the context
26 | 	Delete(key string)
27 | 	// ForEach applies the given function to every key and value
28 | 	// in the context and returns a slice of the results
29 | 	ForEach(func(key string, val any) any) []any
30 | 	// Clear empties the context
31 | 	Clear()
32 | 	// Length returns the number of entries in the context
33 | 	Length() int
34 | 	// String renders the context as a (non-standard) json string
35 | 	String() string
36 | 	// Bytes renders the context as json-like bytes
37 | 	Bytes() []byte
38 | }
39 |
40 | // CtxOp selects the context implementation
41 | type CtxOp int
42 | 
43 | const (
44 | 	// A read-mostly context, suited to scenarios where
45 | 	// reads far outnumber writes
46 | 	ReadOp CtxOp = iota
47 | 	// Suited to balanced or write-heavy scenarios
48 | 	WriteOp
49 | )
50 |
51 | var ctxPool sync.Pool
52 |
53 | // AcquireCtx returns an empty Context instance from context pool.
54 | //
55 | // The returned Context instance may be passed to ReleaseCtx when it is
56 | // no longer needed. This allows Context recycling, reduces GC pressure
57 | // and usually improves performance.
58 | func AcquireCtx(ops ...CtxOp) (Context, error) {
59 | if len(ops) > 1 {
60 | return nil, fmt.Errorf("only 1 op can be passed in as most, but you passed %d ops", len(ops))
61 | }
62 | v := ctxPool.Get()
63 | if v == nil {
64 | return NewContext(ops...)
65 | }
66 | return v.(Context), nil
67 | }
68 |
69 | // ReleaseCtx returns ctx acquired via AcquireCtx to Context pool.
70 | //
71 | // It is forbidden to access ctx and/or its members after returning
72 | // it to Context pool.
73 | func ReleaseCtx(ctx Context) {
74 | ctx.Clear()
75 | ctxPool.Put(ctx)
76 | }
77 |
78 | // NewContext returns a new Context instance
79 | func NewContext(ops ...CtxOp) (Context, error) {
80 | if len(ops) > 1 {
81 | return nil, fmt.Errorf("only 1 op can be passed in as most, but you passed %d ops", len(ops))
82 | }
83 |
84 | var op CtxOp
85 | if len(ops) == 0 {
86 | op = WriteOp
87 | } else {
88 | op = ops[0]
89 | }
90 |
91 | switch op {
92 | case ReadOp:
93 | return &rcontext{}, nil
94 | case WriteOp:
95 | return &wcontext{
96 | m: make(map[string]any),
97 | l: &sync.RWMutex{},
98 | }, nil
99 | default:
100 | return nil, fmt.Errorf("unkown op: %d", op)
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
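
A short sketch of the pooled lifecycle: acquire, use, release. `WriteOp` is the default; `ReadOp` suits read-heavy sharing. The keys and values are placeholders.

```go
// Acquire a context from the pool (WriteOp is the default implementation).
c, err := context.AcquireCtx()
if err != nil {
	panic(err)
}
// Return it to the pool when done; it must not be touched afterwards.
defer context.ReleaseCtx(c)

c.Put("qid", "123")
c.Put("page", 2)
fmt.Println(c.Get("qid"), c.Length()) // 123 2
```
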
/context/write.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: write.go
5 | * @Created: 2021-07-24 08:56:16
6 | * @Modified: 2022-04-18 13:28:52
7 | */
8 |
9 | package context
10 |
11 | import (
12 | "bytes"
13 | "fmt"
14 | "strings"
15 | "sync"
16 | )
17 |
18 | type wcontext struct {
19 | m map[string]any
20 | l *sync.RWMutex
21 | }
22 |
23 | func (w *wcontext) GetAny(key string) any {
24 | w.l.RLock()
25 | defer w.l.RUnlock()
26 |
27 | if v, ok := w.m[key]; ok {
28 | return v
29 | }
30 | return nil
31 | }
32 |
33 | func (w *wcontext) Get(key string) string {
34 | val := w.GetAny(key)
35 | if val == nil {
36 | return ""
37 | }
38 | return val.(string)
39 | }
40 |
41 | func (w *wcontext) Put(key string, val any) {
42 | w.l.Lock()
43 | w.m[key] = val
44 | w.l.Unlock()
45 | }
46 |
47 | func (w *wcontext) GetAndDelete(key string) any {
48 | w.l.Lock()
49 | defer w.l.Unlock()
50 |
51 | v, ok := w.m[key]
52 | if !ok {
53 | return nil
54 | }
55 |
56 | delete(w.m, key)
57 |
58 | return v
59 | }
60 |
61 | func (w *wcontext) Delete(key string) {
62 | w.GetAndDelete(key)
63 | }
64 |
65 | // ForEach applies the given function to every key and value in the context and returns a slice of the results
66 | func (w *wcontext) ForEach(f func(key string, val any) any) []any {
67 | w.l.RLock()
68 | defer w.l.RUnlock()
69 |
70 | result := make([]any, 0, len(w.m))
71 | for k, v := range w.m {
72 | result = append(result, f(k, v))
73 | }
74 | return result
75 | }
76 |
77 | func (w *wcontext) Clear() {
78 | w.l.Lock()
79 | 	// Do not free the memory; reuse it instead. Frequent reallocation is unnecessary.
80 | for k := range w.m {
81 | delete(w.m, k)
82 | }
83 | w.l.Unlock()
84 | }
85 |
86 | func (w *wcontext) Length() int {
87 | w.l.RLock()
88 | defer w.l.RUnlock()
89 |
90 | return len(w.m)
91 | }
92 |
93 | func (w *wcontext) Bytes() []byte {
94 | w.l.RLock()
95 | defer w.l.RUnlock()
96 |
97 | var b bytes.Buffer
98 | b.WriteByte('{')
99 | i := 0
100 | for k, v := range w.m {
101 | if i > 0 {
102 | b.WriteString(`, `)
103 | }
104 | b.WriteByte('"')
105 | b.WriteString(k)
106 | b.WriteString(`": "`)
107 | b.WriteString(fmt.Sprint(v))
108 | b.WriteByte('"')
109 | i++
110 | }
111 | b.WriteByte('}')
112 | return b.Bytes()
113 | }
114 |
115 | func (w *wcontext) String() string {
116 | w.l.RLock()
117 | defer w.l.RUnlock()
118 |
119 | var s strings.Builder
120 | s.WriteByte('{')
121 | i := 0
122 | for k, v := range w.m {
123 | if i > 0 {
124 | s.WriteString(`, `)
125 | }
126 | s.WriteByte('"')
127 | s.WriteString(k)
128 | s.WriteString(`": "`)
129 | s.WriteString(fmt.Sprint(v))
130 | s.WriteByte('"')
131 | i++
132 | }
133 | s.WriteByte('}')
134 | return s.String()
135 | }
136 |
--------------------------------------------------------------------------------
/async_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: async_test.go
5 | * @Created: 2021-07-31 13:14:09
6 | * @Modified: 2021-11-17 11:38:16
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | "fmt"
13 | "math/rand"
14 | "strings"
15 | "testing"
16 |
17 | "github.com/go-predator/predator/context"
18 | . "github.com/smartystreets/goconvey/convey"
19 | )
20 |
21 | func buildRequestBody(queryID string, page int) map[string]string {
22 | return map[string]string{
23 | "id": queryID,
24 | "page": fmt.Sprint(page),
25 | "key1": "value1",
26 | "key2": "value2",
27 | "key3": "value3",
28 | "key4": "",
29 | }
30 | }
31 |
32 | func randomBoundary() string {
33 | var s strings.Builder
34 | count := 29
35 | for i := 0; i < count; i++ {
36 | if i == 0 {
37 | s.WriteString(fmt.Sprint(rand.Intn(9) + 1))
38 | } else {
39 | s.WriteString(fmt.Sprint(rand.Intn(10)))
40 | }
41 | }
42 | return s.String()
43 | }
44 |
45 | func parsePerPage(c *Crawler, u, queryID string, page int) error {
46 | 	// build the request body
47 | body := buildRequestBody(queryID, page)
48 | form := NewMultipartForm(
49 | "-------------------",
50 | randomBoundary,
51 | )
52 | for k, v := range body {
53 | form.AppendString(k, v)
54 | }
55 |
56 | 	// put the key request-body parameters into the context
57 | ctx, _ := context.NewContext()
58 | ctx.Put("qid", queryID)
59 | ctx.Put("page", page)
60 |
61 | return c.PostMultipart(u, form, ctx)
62 | }
63 |
64 | func testAsync(crawler *Crawler, t *testing.T) {
65 | crawler.BeforeRequest(func(r *Request) {
66 | headers := map[string]string{
67 | "Accept": "*/*",
68 | "Accept-Language": "zh-CN",
69 | "Accept-Encoding": "gzip, deflate",
70 | }
71 |
72 | r.SetHeaders(headers)
73 |
74 | })
75 |
76 | crawler.AfterResponse(func(r *Response) {
77 | qid := r.Ctx.Get("qid")
78 | page := r.Ctx.GetAny("page").(int)
79 | t.Logf("qid=%s page=%d", qid, page)
80 | })
81 |
82 | 	// request one page from each of several categories
83 | for i := 0; i < 100; i++ {
84 | err := parsePerPage(crawler, "https://httpbin.org/post", fmt.Sprint(i+100), i+1)
85 | if err != nil {
86 | t.Error("爬取失败:", err)
87 | }
88 | }
89 | }
90 |
91 | func TestAsync(t *testing.T) {
92 | Convey("同步耗时", t, func() {
93 | defer timeCost()()
94 | crawler := NewCrawler(
95 | WithCache(nil, true, nil),
96 | WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"),
97 | )
98 |
99 | testAsync(crawler, t)
100 | crawler.ClearCache()
101 | })
102 |
103 | Convey("异步耗时", t, func() {
104 | defer timeCost()()
105 | crawler := NewCrawler(
106 | WithCache(nil, true, nil),
107 | WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"),
108 | WithConcurrency(30, false),
109 | )
110 |
111 | testAsync(crawler, t)
112 |
113 | crawler.Wait()
114 | crawler.ClearCache()
115 | })
116 | }
117 |
--------------------------------------------------------------------------------
/pool.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: pool.go
5 | * @Created: 2021-07-29 22:30:37
6 | * @Modified: 2022-05-24 09:23:16
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | "errors"
13 | "fmt"
14 | "runtime/debug"
15 | "sync"
16 | "sync/atomic"
17 | "time"
18 |
19 | "github.com/go-predator/log"
20 | )
21 |
22 | // errors
23 | var (
24 | // return if pool size <= 0
25 | ErrInvalidPoolCap = errors.New("invalid pool cap")
26 | // put task but pool already closed
27 | ErrPoolAlreadyClosed = errors.New("pool already closed")
28 | 	// only the error type can be captured and processed
29 | 	ErrUnknownType = errors.New("recover only allows the error type, but an unknown type was received")
30 | )
31 |
32 | // running status
33 | const (
34 | 	RUNNING = 1
35 | 	STOPPED = 0
36 | )
37 |
38 | // Task is a unit of work to be executed by the pool
39 | type Task struct {
40 | crawler *Crawler
41 | req *Request
42 | isChained bool
43 | }
44 |
45 | // Pool task pool
46 | type Pool struct {
47 | capacity uint64
48 | runningWorkers uint64
49 | status int64
50 | chTask chan *Task
51 | log *log.Logger
52 | blockPanic bool
53 | sync.Mutex
54 | }
55 |
56 | // NewPool initializes a pool with the given capacity
57 | func NewPool(capacity uint64) (*Pool, error) {
58 | 	if capacity == 0 {
59 | return nil, ErrInvalidPoolCap
60 | }
61 | p := &Pool{
62 | capacity: capacity,
63 | status: RUNNING,
64 | chTask: make(chan *Task, capacity),
65 | }
66 |
67 | return p, nil
68 | }
69 |
70 | func (p *Pool) checkWorker() {
71 | p.Lock()
72 | defer p.Unlock()
73 |
74 | if p.runningWorkers == 0 && len(p.chTask) > 0 {
75 | p.run()
76 | }
77 | }
78 |
79 | // GetCap returns the pool's capacity
80 | func (p *Pool) GetCap() uint64 {
81 | return p.capacity
82 | }
83 |
84 | // GetRunningWorkers returns the number of currently running workers
85 | func (p *Pool) GetRunningWorkers() uint64 {
86 | return atomic.LoadUint64(&p.runningWorkers)
87 | }
88 |
89 | func (p *Pool) incRunning() {
90 | atomic.AddUint64(&p.runningWorkers, 1)
91 | }
92 |
93 | func (p *Pool) decRunning() {
94 | atomic.AddUint64(&p.runningWorkers, ^uint64(0))
95 | }
96 |
97 | // Put submits a task to the pool
98 | func (p *Pool) Put(task *Task) error {
99 | p.Lock()
100 | defer p.Unlock()
101 |
102 | 	if p.status == STOPPED {
103 | return ErrPoolAlreadyClosed
104 | }
105 |
106 | // run worker
107 | if p.GetRunningWorkers() < p.GetCap() {
108 | p.run()
109 | }
110 |
111 | // send task
112 | if p.status == RUNNING {
113 | p.chTask <- task
114 | }
115 |
116 | return nil
117 | }
118 |
119 | func (p *Pool) run() {
120 | p.incRunning()
121 |
122 | go func() {
123 | defer func() {
124 | p.decRunning()
125 |
126 | if r := recover(); r != nil {
127 | if p.blockPanic {
128 | 				// print the panic's stack trace
129 | debug.PrintStack()
130 |
131 | p.log.Error(fmt.Errorf("worker panic: %s", r))
132 | } else {
133 | 				// only error values are allowed to be re-panicked
134 | if e, ok := r.(error); ok {
135 | panic(e)
136 | } else {
137 | 					panic(fmt.Sprintf("%s: %v", ErrUnknownType, r))
138 | }
139 | }
140 | }
141 |
142 | p.checkWorker() // check worker avoid no worker running
143 | }()
144 |
145 | for task := range p.chTask {
146 | task.crawler.prepare(task.req, task.isChained)
147 | }
148 | }()
149 |
150 | }
151 |
152 | func (p *Pool) setStatus(status int64) bool {
153 | p.Lock()
154 | defer p.Unlock()
155 |
156 | if p.status == status {
157 | return false
158 | }
159 |
160 | p.status = status
161 |
162 | return true
163 | }
164 |
165 | // Close closes the pool gracefully
166 | func (p *Pool) Close() {
167 | 
168 | 	if !p.setStatus(STOPPED) { // stop accepting new tasks
169 | 		return
170 | 	}
171 | 
172 | 	for len(p.chTask) > 0 { // wait for all queued tasks to be consumed
173 | 		time.Sleep(time.Millisecond) // reduce CPU load
174 | }
175 |
176 | close(p.chTask)
177 | }
178 |
--------------------------------------------------------------------------------
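
A sketch of the pool's lifecycle from inside the `predator` package (Task's fields are unexported, so this only compiles in-package; `c` and `reqs` are assumed to be in scope):

```go
p, err := NewPool(8) // capacity bounds both workers and the task queue
if err != nil {
	panic(err)
}

for _, req := range reqs { // reqs: []*Request, assumed
	if err := p.Put(&Task{crawler: c, req: req}); err != nil {
		break // ErrPoolAlreadyClosed once the pool has been stopped
	}
}

p.Close() // waits for queued tasks to drain, then closes the channel
```
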
/status.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: status.go
5 | * @Created: 2022-03-01 12:48:11
6 | * @Modified: 2022-03-04 10:48:00
7 | */
8 |
9 | package predator
10 |
11 | import "github.com/valyala/fasthttp"
12 |
13 | // HTTP status codes were copied from net/http.
14 | const (
15 | StatusContinue = 100 // RFC 7231, 6.2.1
16 | StatusSwitchingProtocols = 101 // RFC 7231, 6.2.2
17 | StatusProcessing = 102 // RFC 2518, 10.1
18 | StatusEarlyHints = 103 // RFC 8297
19 |
20 | StatusOK = 200 // RFC 7231, 6.3.1
21 | StatusCreated = 201 // RFC 7231, 6.3.2
22 | StatusAccepted = 202 // RFC 7231, 6.3.3
23 | StatusNonAuthoritativeInfo = 203 // RFC 7231, 6.3.4
24 | StatusNoContent = 204 // RFC 7231, 6.3.5
25 | StatusResetContent = 205 // RFC 7231, 6.3.6
26 | StatusPartialContent = 206 // RFC 7233, 4.1
27 | StatusMultiStatus = 207 // RFC 4918, 11.1
28 | StatusAlreadyReported = 208 // RFC 5842, 7.1
29 | StatusIMUsed = 226 // RFC 3229, 10.4.1
30 |
31 | StatusMultipleChoices = 300 // RFC 7231, 6.4.1
32 | StatusMovedPermanently = 301 // RFC 7231, 6.4.2
33 | StatusFound = 302 // RFC 7231, 6.4.3
34 | StatusSeeOther = 303 // RFC 7231, 6.4.4
35 | StatusNotModified = 304 // RFC 7232, 4.1
36 | StatusUseProxy = 305 // RFC 7231, 6.4.5
37 | _ = 306 // RFC 7231, 6.4.6 (Unused)
38 | StatusTemporaryRedirect = 307 // RFC 7231, 6.4.7
39 | StatusPermanentRedirect = 308 // RFC 7538, 3
40 |
41 | StatusBadRequest = 400 // RFC 7231, 6.5.1
42 | StatusUnauthorized = 401 // RFC 7235, 3.1
43 | StatusPaymentRequired = 402 // RFC 7231, 6.5.2
44 | StatusForbidden = 403 // RFC 7231, 6.5.3
45 | StatusNotFound = 404 // RFC 7231, 6.5.4
46 | StatusMethodNotAllowed = 405 // RFC 7231, 6.5.5
47 | StatusNotAcceptable = 406 // RFC 7231, 6.5.6
48 | StatusProxyAuthRequired = 407 // RFC 7235, 3.2
49 | StatusRequestTimeout = 408 // RFC 7231, 6.5.7
50 | StatusConflict = 409 // RFC 7231, 6.5.8
51 | StatusGone = 410 // RFC 7231, 6.5.9
52 | StatusLengthRequired = 411 // RFC 7231, 6.5.10
53 | StatusPreconditionFailed = 412 // RFC 7232, 4.2
54 | StatusRequestEntityTooLarge = 413 // RFC 7231, 6.5.11
55 | StatusRequestURITooLong = 414 // RFC 7231, 6.5.12
56 | StatusUnsupportedMediaType = 415 // RFC 7231, 6.5.13
57 | StatusRequestedRangeNotSatisfiable = 416 // RFC 7233, 4.4
58 | StatusExpectationFailed = 417 // RFC 7231, 6.5.14
59 | StatusTeapot = 418 // RFC 7168, 2.3.3
60 | StatusMisdirectedRequest = 421 // RFC 7540, 9.1.2
61 | StatusUnprocessableEntity = 422 // RFC 4918, 11.2
62 | StatusLocked = 423 // RFC 4918, 11.3
63 | StatusFailedDependency = 424 // RFC 4918, 11.4
64 | StatusUpgradeRequired = 426 // RFC 7231, 6.5.15
65 | StatusPreconditionRequired = 428 // RFC 6585, 3
66 | StatusTooManyRequests = 429 // RFC 6585, 4
67 | StatusRequestHeaderFieldsTooLarge = 431 // RFC 6585, 5
68 | StatusUnavailableForLegalReasons = 451 // RFC 7725, 3
69 |
70 | StatusInternalServerError = 500 // RFC 7231, 6.6.1
71 | StatusNotImplemented = 501 // RFC 7231, 6.6.2
72 | StatusBadGateway = 502 // RFC 7231, 6.6.3
73 | StatusServiceUnavailable = 503 // RFC 7231, 6.6.4
74 | StatusGatewayTimeout = 504 // RFC 7231, 6.6.5
75 | StatusHTTPVersionNotSupported = 505 // RFC 7231, 6.6.6
76 | StatusVariantAlsoNegotiates = 506 // RFC 2295, 8.1
77 | StatusInsufficientStorage = 507 // RFC 4918, 11.5
78 | StatusLoopDetected = 508 // RFC 5842, 7.2
79 | StatusNotExtended = 510 // RFC 2774, 7
80 | StatusNetworkAuthenticationRequired = 511 // RFC 6585, 6
81 | )
82 |
83 | // StatusMessage returns HTTP status message for the given status code.
84 | func StatusMessage(statusCode int) string {
85 | return fasthttp.StatusMessage(statusCode)
86 | }
87 |
--------------------------------------------------------------------------------
/options.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: options.go
5 | * @Created: 2021-07-23 08:58:31
6 | * @Modified: 2022-05-24 09:22:54
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | "crypto/tls"
13 | "strings"
14 | "sync"
15 |
16 | "github.com/go-predator/log"
17 | )
18 |
19 | type CrawlerOption func(*Crawler)
20 |
21 | // SkipVerification will skip verifying the certificate when
22 | // you access the `https` protocol
23 | func SkipVerification() CrawlerOption {
24 | return func(c *Crawler) {
25 | c.client.TLSConfig = &tls.Config{InsecureSkipVerify: true}
26 | }
27 | }
28 |
29 | func WithLogger(logger *log.Logger) CrawlerOption {
30 | if logger == nil {
31 | logger = log.NewLogger(log.WARNING, log.ToConsole(), 2)
32 | }
33 |
34 | return func(c *Crawler) {
35 | c.log = log.NewLogger(log.Level(logger.L.GetLevel()), logger.Out(), 2)
36 | }
37 | }
38 |
39 | func WithConsoleLogger(level log.Level) CrawlerOption {
40 | return func(c *Crawler) {
41 | c.log = log.NewLogger(level, log.ToConsole(), 2)
42 | }
43 | }
44 |
45 | func WithFileLogger(level log.Level, filename string) CrawlerOption {
46 | return func(c *Crawler) {
47 | c.log = log.NewLogger(level, log.MustToFile(filename, -1), 2)
48 | }
49 | }
50 |
51 | func WithConsoleAndFileLogger(level log.Level, filename string) CrawlerOption {
52 | return func(c *Crawler) {
53 | c.log = log.NewLogger(level, log.MustToConsoleAndFile(filename, -1), 2)
54 | }
55 | }
56 |
57 | func WithDefaultLogger() CrawlerOption {
58 | return WithLogger(nil)
59 | }
60 |
61 | func WithUserAgent(ua string) CrawlerOption {
62 | return func(c *Crawler) {
63 | c.UserAgent = ua
64 | }
65 | }
66 |
67 | func WithRawCookie(cookie string) CrawlerOption {
68 | 	cookies := make(map[string]string)
69 | 	for _, c := range strings.Split(cookie, "; ") {
70 | 		if k, v, ok := strings.Cut(c, "="); ok { // skip malformed pairs instead of panicking
71 | 			cookies[k] = v
72 | 		}
73 | 	}
74 | 	return WithCookies(cookies)
75 | }
76 |
77 | func WithCookies(cookies map[string]string) CrawlerOption {
78 | return func(c *Crawler) {
79 | c.cookies = cookies
80 | }
81 | }
82 |
83 | // WithConcurrency enables concurrency; count is the size of the goroutine pool to create
84 | func WithConcurrency(count uint64, blockPanic bool) CrawlerOption {
85 | return func(c *Crawler) {
86 | p, err := NewPool(count)
87 | if err != nil {
88 | panic(err)
89 | }
90 | p.blockPanic = blockPanic
91 |
92 | c.goPool = p
93 | c.wg = new(sync.WaitGroup)
94 | }
95 | }
96 |
97 | type RetryCondition func(r *Response) bool
98 |
99 | // WithRetry sets how many times to retry a failed request and which responses count as failures
100 | func WithRetry(count uint32, cond RetryCondition) CrawlerOption {
101 | return func(c *Crawler) {
102 | c.retryCount = count
103 | c.retryCondition = cond
104 | }
105 | }
106 |
107 | // WithProxy uses a single proxy
108 | func WithProxy(proxyURL string) CrawlerOption {
109 | return func(c *Crawler) {
110 | c.proxyURLPool = []string{proxyURL}
111 | }
112 | }
113 |
114 | // WithProxyPool uses a proxy pool
115 | func WithProxyPool(proxyURLs []string) CrawlerOption {
116 | return func(c *Crawler) {
117 | c.proxyURLPool = proxyURLs
118 | }
119 | }
120 |
121 | // WithComplementProxyPool replenishes the proxy pool when the proxy pool is empty
122 | func WithComplementProxyPool(f ComplementProxyPool) CrawlerOption {
123 | return func(c *Crawler) {
124 | c.complementProxyPool = f
125 | }
126 | }
127 |
128 | // WithCache enables caching and optional compression of cached responses. For
129 | // POST requests it is best to pass cache fields (zero, one, or more) that
130 | // uniquely identify the request body. Note: without cache fields the whole
131 | // request body becomes the cache key; since maps are unordered, the same body
132 | // may not always yield the same key, so a request may be cached repeatedly or
133 | // a cached response may be missed.
134 | func WithCache(cc Cache, compressed bool, cacheCondition CacheCondition, cacheFields ...CacheField) CrawlerOption {
135 | 	return func(c *Crawler) {
136 | 		if cc == nil { // a nil cache disables caching instead of panicking
137 | 			return
138 | 		}
139 | 		cc.Compressed(compressed)
140 | 		err := cc.Init()
141 | 		if err != nil {
142 | 			panic(err)
143 | 		}
144 | 		c.cache = cc
145 | 		if cacheCondition == nil {
146 | 			cacheCondition = func(r *Response) bool {
147 | 				return r.StatusCode/100 == 2
148 | 			}
149 | 		}
150 | 		c.cacheCondition = cacheCondition
151 | 		if len(cacheFields) > 0 {
152 | 			c.cacheFields = cacheFields
153 | 		}
154 | 	}
155 | }
155 |
156 | func EnableIPv6() CrawlerOption {
157 | return func(c *Crawler) {
158 | c.client.DialDualStack = true
159 | }
160 | }
161 |
--------------------------------------------------------------------------------
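
The options compose as functional options in `NewCrawler`. A sketch combining several of the options above; all values are placeholders:

```go
c := predator.NewCrawler(
	predator.WithUserAgent("Mozilla/5.0 (placeholder)"),
	predator.WithRawCookie("PHPSESSID=abc123; theme=dark"),
	predator.WithConcurrency(16, true),
	// Retry up to 3 times whenever the response is not a 2xx.
	predator.WithRetry(3, func(r *predator.Response) bool {
		return r.StatusCode/100 != 2
	}),
	predator.WithConsoleLogger(log.WARNING),
)
```
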
/response.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: response.go
5 | * @Created: 2021-07-24 13:34:44
6 | * @Modified: 2022-11-29 16:14:21
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | "errors"
13 | "net"
14 | "os"
15 | "sync"
16 |
17 | ctx "github.com/go-predator/predator/context"
18 | "github.com/go-predator/predator/json"
19 | "github.com/valyala/bytebufferpool"
20 | "github.com/valyala/fasthttp"
21 | )
22 |
23 | var (
24 | ErrIncorrectResponse = errors.New("the response status code is not 20X")
25 | )
26 |
27 | type Response struct {
28 | 	// response status code
29 | 	StatusCode int
30 | 	// binary response body
31 | 	Body []byte
32 | 	// context shared between the request and the response
33 | 	Ctx ctx.Context `json:"-"`
34 | 	// the request that produced this response
35 | 	Request *Request `json:"-"`
36 | 	// response headers
37 | 	Headers fasthttp.ResponseHeader
38 | 	// whether the response was fetched from the cache
39 | 	FromCache bool
40 | 	// the client's public ip
41 | 	clientIP net.Addr
42 | 	// the local (LAN) ip
43 | 	localIP net.Addr
44 | timeout bool
45 | // Whether the response is valid,
46 | // html for invalid responses will not be parsed
47 | invalid bool
48 | }
49 |
50 | // Save writes response body to disk
51 | func (r *Response) Save(fileName string) error {
52 | return os.WriteFile(fileName, r.Body, 0644)
53 | }
54 |
55 | // Invalidate marks the current response as invalid and skips the html parsing process
56 | func (r *Response) Invalidate() {
57 | r.invalid = true
58 | }
59 |
60 | func (r *Response) GetSetCookie() string {
61 | return string(r.Headers.Peek("Set-Cookie"))
62 | }
63 |
64 | func (r *Response) ContentType() string {
65 | return string(r.Headers.Peek("Content-Type"))
66 | }
67 |
68 | // BodyGunzip returns un-gzipped body data.
69 | //
70 | // This method may be used if the response header contains
71 | // 'Content-Encoding: gzip' for reading un-gzipped body.
72 | // Use Body for reading gzipped response body.
73 | func (r *Response) BodyGunzip() ([]byte, error) {
74 | var bb bytebufferpool.ByteBuffer
75 | _, err := fasthttp.WriteGunzip(&bb, r.Body)
76 | if err != nil {
77 | return nil, err
78 | }
79 | return bb.B, nil
80 | }
81 |
82 | func (r *Response) String() string {
83 | return string(r.Body)
84 | }
85 |
86 | func (r *Response) Reset(releaseCtx bool) {
87 | r.StatusCode = 0
88 | if r.Body != nil {
89 | 		// Truncate the body to zero length; the reference to the underlying
90 | 		// array is kept, the GC does not reclaim it, and the body is reused.
91 | r.Body = r.Body[:0]
92 | }
93 |
94 | 	// To pass the context through chained requests, it must not be released after every response.
95 | if releaseCtx {
96 | ctx.ReleaseCtx(r.Ctx)
97 | }
98 |
99 | ReleaseRequest(r.Request)
100 | r.Headers.Reset()
101 | r.FromCache = false
102 | r.invalid = false
103 | r.localIP = nil
104 | r.clientIP = nil
105 | }
106 |
107 | type cachedHeaders struct {
108 | StatusCode int
109 | ContentType []byte // this is the most important field
110 | ContentLength int
111 | Server []byte
112 | Location []byte
113 | }
114 |
115 | type cachedResponse struct {
116 | Body []byte
117 | Headers *cachedHeaders
118 | }
119 |
120 | func (r *Response) convertHeaders() (*cachedHeaders, error) {
121 | ch := &cachedHeaders{}
122 | ch.StatusCode = r.StatusCode
123 | ch.ContentType = r.Headers.ContentType()
124 | ch.ContentLength = r.Headers.ContentLength()
125 | ch.Server = r.Headers.Server()
126 |
127 | 	if ch.StatusCode == StatusFound {
128 | 		ch.Location = r.Headers.Peek("Location")
129 | 		if ch.Location == nil { // a 302 response must carry a Location header
130 | 			return nil, ErrInvalidResponseStatus
131 | 		}
132 | 	}
133 |
134 | return ch, nil
135 | }
136 |
137 | func (r *Response) Marshal() ([]byte, error) {
138 | // The cached response does not need to save all the response headers,
139 | // so the following code is not used to convert the response headers to bytes
140 | // var buf bytes.Buffer
141 | // b := bufio.NewWriter(&buf)
142 | // r.Headers.Write(b)
143 | // b.Flush()
144 |
145 | var (
146 | cr cachedResponse
147 | err error
148 | )
149 | cr.Body = r.Body
150 | cr.Headers, err = r.convertHeaders()
151 | if err != nil {
152 | return nil, err
153 | }
154 |
155 | return json.Marshal(cr)
156 | }
157 |
158 | func (r *Response) Unmarshal(cachedBody []byte) error {
159 | var (
160 | cr cachedResponse
161 | err error
162 | )
163 | err = json.Unmarshal(cachedBody, &cr)
164 | if err != nil {
165 | return err
166 | }
167 |
168 | r.Body = cr.Body
169 | r.StatusCode = cr.Headers.StatusCode
170 | r.Headers.SetStatusCode(r.StatusCode)
171 | r.Headers.SetContentTypeBytes(cr.Headers.ContentType)
172 | r.Headers.SetContentLength(cr.Headers.ContentLength)
173 | r.Headers.SetServerBytes(cr.Headers.Server)
174 |
175 | return nil
176 | }
177 |
178 | func (r *Response) LocalIP() string {
179 | if r.localIP != nil {
180 | return r.localIP.String()
181 | }
182 | return ""
183 | }
184 |
185 | func (r *Response) ClientIP() string {
186 | if r.clientIP != nil {
187 | return r.clientIP.String()
188 | }
189 | return ""
190 | }
191 |
192 | func (r *Response) IsTimeout() bool {
193 | return r.timeout
194 | }
195 |
196 | var (
197 | responsePool sync.Pool
198 | )
199 |
200 | // AcquireResponse returns an empty Response instance from response pool.
201 | //
202 | // The returned Response instance may be passed to ReleaseResponse when it is
203 | // no longer needed. This allows Response recycling, reduces GC pressure
204 | // and usually improves performance.
205 | func AcquireResponse() *Response {
206 | v := responsePool.Get()
207 | if v == nil {
208 | return &Response{}
209 | }
210 | return v.(*Response)
211 | }
212 |
213 | // ReleaseResponse returns resp acquired via AcquireResponse to response pool.
214 | //
215 | // It is forbidden to access resp and/or its members after returning
216 | // it to response pool.
217 | func ReleaseResponse(resp *Response, releaseCtx bool) {
218 | resp.Reset(releaseCtx)
219 | responsePool.Put(resp)
220 | }
221 |
--------------------------------------------------------------------------------
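
`Marshal` and `Unmarshal` round-trip a response through the compact cached form, which keeps only the handful of headers in `cachedHeaders`. A sketch; `resp` is assumed to be a populated *predator.Response:

```go
raw, err := resp.Marshal()
if err != nil {
	panic(err)
}

restored := &predator.Response{}
if err := restored.Unmarshal(raw); err != nil {
	panic(err)
}
// Body, status code, Content-Type, Content-Length and Server survive the
// round trip; all other headers are dropped by design.
```
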
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
2 | github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
3 | github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
4 | github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
5 | github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
6 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
7 | github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
8 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
9 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
10 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
11 | github.com/go-predator/log v0.0.0-20220523074050-01ad78a75b3f h1:0Nt/e0By/ClBnnJq+jbGsUWNVIVSbfqYNCE9Z6p7TY0=
12 | github.com/go-predator/log v0.0.0-20220523074050-01ad78a75b3f/go.mod h1:TdZqX+mXzn9Xb+7QnjpCFZLYU3poUG64+Ct2+DnlRDU=
13 | github.com/go-predator/tools v0.0.0-20220524022058-ce749e9bf77b h1:MMUBfyosVuLCa4k0iDpq+3mOHIQAsBSZt/fJ/nN++js=
14 | github.com/go-predator/tools v0.0.0-20220524022058-ce749e9bf77b/go.mod h1:xG4JX2Eyw5NgKkSXVmKu6u90nPklN0M4fv7jLbzP3TY=
15 | github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
16 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
17 | github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
18 | github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
19 | github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
20 | github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
21 | github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
22 | github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
23 | github.com/klauspost/compress v1.15.0/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
24 | github.com/klauspost/compress v1.15.4 h1:1kn4/7MepF/CHmYub99/nNX8az0IJjfSOU/jbnTVfqQ=
25 | github.com/klauspost/compress v1.15.4/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
26 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc=
27 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
28 | github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
29 | github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
30 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
31 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
32 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
33 | github.com/rs/xid v1.3.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
34 | github.com/rs/zerolog v1.26.1 h1:/ihwxqH+4z8UxyI70wM1z9yCvkWcfz/a3mj48k/Zngc=
35 | github.com/rs/zerolog v1.26.1/go.mod h1:/wSSJWX7lVrsOwlbyTRSOJvqRlc+WjWlfes+CiJ+tmc=
36 | github.com/smartystreets/assertions v1.2.0 h1:42S6lae5dvLc7BrLu/0ugRtcFVjoJNMC/N3yZFZkDFs=
37 | github.com/smartystreets/assertions v1.2.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo=
38 | github.com/smartystreets/goconvey v1.7.2 h1:9RBaZCeXEQ3UselpuwUQHltGVXvdwm6cv1hgR6gDIPg=
39 | github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3Pg9vgXWeJpQFMM=
40 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
41 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
42 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
43 | github.com/tidwall/gjson v1.14.3 h1:9jvXn7olKEHU1S9vwoMGliaT8jq1vJ7IH/n9zD9Dnlw=
44 | github.com/tidwall/gjson v1.14.3/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
45 | github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
46 | github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
47 | github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
48 | github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
49 | github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
50 | github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
51 | github.com/valyala/fasthttp v1.40.0 h1:CRq/00MfruPGFLTQKY8b+8SfdK60TxNztjRMnH0t1Yc=
52 | github.com/valyala/fasthttp v1.40.0/go.mod h1:t/G+3rLek+CyY9bnIE+YlMRddxVAAGjhxndDB4i4C0I=
53 | github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
54 | github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
55 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
56 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
57 | golang.org/x/crypto v0.0.0-20211215165025-cf75a172585e/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8=
58 | golang.org/x/crypto v0.0.0-20220214200702-86341886e292/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
59 | golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
60 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
61 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
62 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
63 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
64 | golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
65 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
66 | golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
67 | golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
68 | golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458 h1:MgJ6t2zo8v0tbmLCueaCbF1RM+TtB0rs3Lv8DGtOIpY=
69 | golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
70 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
71 | golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
72 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
73 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
74 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
75 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
76 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
77 | golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
78 | golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
79 | golang.org/x/sys v0.0.0-20220227234510-4e6760a101f9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
80 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
81 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
82 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
83 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
84 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
85 | golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
86 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
87 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
88 | golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
89 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
90 | golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo=
91 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
92 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
93 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
94 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Predator
2 |
3 | A high-performance (maybe) crawler framework based on fasthttp.
4 |
5 | ## Usage
6 |
7 | ### 1 Create a new `Crawler`
8 |
9 | ```go
10 | import "github.com/go-predator/predator"
11 |
12 |
13 | func main() {
14 | c := predator.NewCrawler(
15 | predator.WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"),
16 | predator.WithCookies(map[string]string{"JSESSIONID": cookie}), // or use predator.WithRawCookie(cookie string)
17 | predator.WithProxy(ip), // or use a proxy pool -> predator.WithProxyPool([]string)
18 | )
19 | }
20 | ```
21 |
22 | Please refer to [predator/options.go](https://github.com/go-predator/predator/blob/main/options.go) for all options.
23 |
24 | ### 2 Send request with GET method
25 |
26 | ```go
27 | // BeforeRequest can do some patching on the request before sending it
28 | c.BeforeRequest(func(r *predator.Request) {
29 | headers := map[string]string{
30 | "Accept": "*/*",
31 | "Accept-Language": "zh-CN",
32 | "Accept-Encoding": "gzip, deflate",
33 | "X-Requested-With": "XMLHttpRequest",
34 | "Origin": "http://example.com",
35 | }
36 |
37 | r.SetHeaders(headers)
38 | })
39 |
40 | c.AfterResponse(func(r *predator.Response) {
41 | // Get the required parameters from the context
42 | id := r.Ctx.GetAny("id").(int)
43 | name := r.Ctx.Get("name")
44 | page := r.Ctx.Get("page")
45 |
46 | fmt.Println(r.String())
47 | })
48 |
49 | // Send a request
50 | c.Get("http://www.example.com")
51 |
52 | // Or send a request with context
53 | ctx, _ := context.AcquireCtx()
54 | ctx.Put("page", 1)
55 | ctx.Put("id", 10)
56 | ctx.Put("name", "Tom")
57 | c.GetWithCtx("http://www.example.com", ctx)
58 | ```
59 |
60 | ### 3 Send request with POST method
61 |
62 | #### 3.1 Request body's media-type is `application/x-www-form-urlencoded`
63 |
64 | ```go
65 | // BeforeRequest can do some patching on the request before sending it
66 | c.BeforeRequest(func(r *predator.Request) {
67 | headers := map[string]string{
68 | "Accept": "*/*",
69 | "Accept-Language": "zh-CN",
70 | "Accept-Encoding": "gzip, deflate",
71 | "X-Requested-With": "XMLHttpRequest",
72 | "Origin": "http://example.com",
73 | }
74 |
75 | r.SetHeaders(headers)
76 | })
77 |
78 | c.AfterResponse(func(r *predator.Response) {
79 | // Get the required parameters from the context
80 | id := r.Ctx.GetAny("id").(int)
81 | name := r.Ctx.Get("name")
82 |
83 | fmt.Println(r.String())
84 | })
85 |
86 |
87 | body := map[string]string{"foo": "bar"}
88 |
89 | // Send a request with context
90 | ctx, _ := context.AcquireCtx()
91 | ctx.Put("id", 10)
92 | ctx.Put("name", "Tom")
93 |
94 | c.Post("http://www.example.com", body, ctx)
95 | ```
96 |
97 | If you don't need to pass a context, you can pass `nil`:
98 |
99 | ```go
100 | c.Post("http://www.example.com", body, nil)
101 | ```
102 |
103 | #### 3.2 Request body's media-type is `multipart/form-data`
104 |
105 | Please refer to the complete example: https://github.com/go-predator/predator/blob/main/example/multipart/main.go
106 |
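    | As a quick orientation, here is a hedged sketch of the multipart flow. `NewMultipartForm`, `AppendString` and `AppendFile` exist in this repository's `request.go`; the dash prefix, the boundary generator, and the exact `PostMultipart` signature are assumptions, so follow the linked example for authoritative usage.
    |
    | ```go
    | // NewMultipartForm takes the dashes a site prepends to the boundary and a
    | // boundary generator (assumed to be a func() string).
    | form := predator.NewMultipartForm(
    | 	"--------------------",                   // hypothetical dash prefix
    | 	func() string { return "N9oza1Kj4Xk0" }, // hypothetical boundary generator
    | )
    |
    | form.AppendString("name", "Tom")
    | if err := form.AppendFile("avatar", "./avatar.png"); err != nil {
    | 	panic(err)
    | }
    |
    | // Assumed to mirror Post/PostJSON: pass nil when no context is needed.
    | c.PostMultipart("http://www.example.com/upload", form, nil)
    | ```
    |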
107 | #### 3.3 Request body's media-type is `application/json`
108 |
109 | ```go
110 | import (
111 | ...
112 |
113 | "github.com/go-predator/predator"
114 | "github.com/go-predator/predator/context"
115 | "github.com/go-predator/predator/json"
116 | )
117 |
118 | type User struct {
119 | Name string `json:"name"`
120 | Age int `json:"age"`
121 | }
122 |
123 | func main() {
124 | c := predator.NewCrawler()
125 |
126 | c.ParseJSON(true, func(j json.JSONResult) {
127 | fmt.Println(j.Get("json"))
128 | })
129 |
130 | body := map[string]any{
131 | "time": 156546535,
132 | "cid": "10_18772100220-1625540144276-302919",
133 | "args": []int{1, 2, 3, 4, 5},
134 | "dict": map[string]string{
135 | "mod": "1592215036_002", "t": "1628346994", "eleTop": "778",
136 | },
137 | "user": User{"Tom", 13},
138 | }
139 |
140 | c.PostJSON("https://httpbin.org/post", body, nil)
141 | }
142 | ```
143 |
144 | #### 3.4 Request body's media-type is others
145 |
146 | If the three request methods above cannot meet your needs, build your own raw request body and send it via `PostRaw`, as sketched after the signature below.
147 |
148 | ```go
149 | func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error
150 | ```
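    |
    | For instance, a hedged sketch that posts a hand-built XML body (the endpoint and payload are invented; `SetContentType` is the `Request` method defined in `request.go`):
    |
    | ```go
    | c.BeforeRequest(func(r *predator.Request) {
    | 	// PostRaw does not guess the media type, so set it yourself if the server cares
    | 	r.SetContentType("application/xml")
    | })
    |
    | body := []byte(`<request><id>10</id><name>Tom</name></request>`)
    | if err := c.PostRaw("http://www.example.com/xml", body, nil); err != nil {
    | 	// handle the error
    | }
    | ```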
151 |
152 | ### 4 Allow Redirects
153 |
154 | Redirection is disabled by default.
155 |
156 | If you need redirects, set the maximum number allowed via `AllowRedirect` in `BeforeRequest`.
157 |
158 | ```go
159 | c.BeforeRequest(func(r *predator.Request) {
160 | if r.URL()[8:12] == "abcd" {
161 | r.AllowRedirect(1)
162 | } else if r.URL()[8:12] == "efgh" {
163 | r.AllowRedirect(3)
164 | }
165 | })
166 | ```
167 |
168 | A global redirect setting is not supported; redirects are configured per request.
169 |
170 | ### 5 Context
171 |
172 | The context is an interface, and the following two contexts are currently implemented:
173 |
174 | - _ReadOp_: based on `sync.Map`; suitable for read-heavy scenarios
175 |
176 | ```go
177 | ctx, err := AcquireCtx(context.ReadOp)
178 | ```
179 |
180 | - _WriteOp_ (default): based on a plain `map`; suitable when reads and writes occur at similar rates, or writes outnumber reads. This is the default context
181 |
182 | ```go
183 | ctx, err := AcquireCtx()
184 | ```
185 |
186 | If you implement the `Context` interface yourself:
187 |
188 | ```go
189 | ctx := YourContext()
190 | ```
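    |
    | Whichever implementation you pick, usage is the same. A short sketch, relying only on the methods this README already uses (`Put`, `Get`, `GetAny`; see the `context` package for the full interface):
    |
    | ```go
    | ctx, _ := context.AcquireCtx() // default WriteOp context
    | ctx.Put("id", 10)              // store any value
    | ctx.Put("name", "Tom")
    |
    | id := ctx.GetAny("id").(int) // GetAny returns any; assert the concrete type
    | name := ctx.Get("name")      // Get returns the value as a string
    | fmt.Println(id, name)
    | ```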
191 |
192 | ### 6 Parse the HTML response
193 |
194 | Responses to web requests are mostly **HTML** and **JSON**.
195 |
196 | You can use the `ParseHTML` method together with **CSS selectors** to find HTML elements.
197 |
198 | > :warning: The `Content-Type` of the response header must be `text/html`.
199 |
200 | ```go
201 | crawl := NewCrawler()
202 |
203 | crawl.ParseHTML("#main", func(he *html.HTMLElement) {
204 | he.String()
205 |
206 | h, err := he.InnerHTML()
207 |
208 | h, err := he.OuterHTML()
209 |
210 | he.Text()
211 |
212 | he.ChildText("#title")
213 |
214 | he.ChildrenText("li>a")
215 |
216 | he.Attr("class")
217 |
218 | he.FirstChild("p")
219 |
220 | he.LastChild("p")
221 |
222 | he.Child("p", 2)
223 |
224 | he.Children("p")
225 |
226 | he.ChildAttr("p", "class")
227 |
228 | he.ChildrenAttr("p", "class")
229 |
230 | he.Parent()
231 |
232 | he.Parents()
233 |
234 | 	he.Each("li>a", func(i int, h *html.HTMLElement) bool {
235 | 		if i < 10 {
236 | 			fmt.Println(h.Attr("href"))
237 | 			return false // keep iterating
238 | 		} else {
239 | 			return true // stop after the first ten links
240 | 		}
241 | 	})
242 |
243 | he.FindChildByText("span.addr", "New York")
244 |
245 | he.FindChildByStripedText("span.addr", "New York") // if addr like ' New York '
246 | })
247 | ```
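    |
    | Putting it together, a hedged end-to-end sketch (the URL and selectors are placeholders):
    |
    | ```go
    | crawl := NewCrawler()
    |
    | crawl.ParseHTML("#main", func(he *html.HTMLElement) {
    | 	// print the href of the first ten links; Each stops once the callback returns true
    | 	he.Each("li>a", func(i int, h *html.HTMLElement) bool {
    | 		fmt.Println(h.Attr("href"))
    | 		return i >= 9
    | 	})
    | })
    |
    | crawl.Get("http://www.example.com")
    | ```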
248 |
249 | ### 7 Goroutine pool
250 |
251 | ```go
252 | c := NewCrawler(
253 | // Use a goroutine pool with a capacity of 30 for web requests
254 | predator.WithConcurrency(30),
255 | )
256 |
257 | c.AfterResponse(func(r *predator.Response) {
258 | // handle response
259 | })
260 |
261 | for i := 0; i < 10; i++ {
262 | c.Post("http://www.example.com", map[string]string{
263 | "id": fmt.Sprint(i + 1),
264 | }, nil)
265 | }
266 |
267 | c.Wait()
268 | ```
269 |
270 | ### 8 Cache
271 |
272 | By default no cache is used.
273 |
274 | [`Cache`](https://github.com/go-predator/predator/blob/main/cache.go) is an interface.
275 |
276 | SQLite-based caching is currently implemented.
277 |
278 | If responses are long, you can enable cache compression to reduce storage space.
279 |
280 | ```go
281 | import (
282 | "github.com/go-predator/cache"
283 | )
284 |
285 | // SQLite3
286 | c := NewCrawler(
287 | predator.WithCache(&cache.SQLiteCache{
288 | URI: "test.sqlite",
289 | }, true), // enable compression
290 | )
291 | ```
292 |
293 | ### 9 Proxy
294 |
295 | You can use proxy pool:
296 |
297 | ```go
298 | predator.WithProxyPool([]string{"http://ip:port", "socks5://ip:port"})
299 | ```
300 |
301 | A proxy is randomly selected from the proxy pool before each request.
302 |
303 | When a proxy fails, it is automatically removed from the pool; once the pool is empty, the crawler panics.
304 |
305 | To avoid the panic, use `WithComplementProxyPool` to replenish the pool when it runs empty.
306 |
307 | ```go
308 | func GetProxyIPs() []string {
309 | api := "http://proxy.api"
310 | client := &fasthttp.Client{}
311 | body := make([]byte, 0)
312 | _, body, err := client.Get(body, api)
313 | if err != nil {
314 | panic(err)
315 | }
316 |
317 | return strings.Split(string(body), "\r\n")
318 | }
319 |
320 | predator.WithComplementProxyPool(GetProxyIPs)
321 | ```
322 |
323 | ### 10 Logging
324 |
325 | Based on [`zerolog`](https://github.com/rs/zerolog).
326 |
327 | Logging is off by default.
328 |
329 | Use the `WithLogger` option to enable logging:
330 |
331 | ```go
332 | func WithLogger(logger *log.Logger) CrawlerOption
333 | ```
334 |
335 | If `logger` is nil, logs of level WARNING and above will be printed to the console.
336 |
337 | ```go
338 | crawler := predator.NewCrawler(
339 | predator.WithLogger(nil), // equal to predator.WithDefaultLogger()
340 | )
341 | ```
342 |
343 | If you want to print lower-level logs, refer to the following code:
344 |
345 | ```go
346 | import "github.com/go-predator/predator/log"
347 |
348 | func main() {
349 | // print to console
350 | logger := log.NewLogger(log.DEBUG, log.ToConsole(), 1)
351 | 	// or save to a file:
352 | 	// logger := log.NewLogger(log.DEBUG, log.MustToFile("demo.log", -1), 1)
353 | 	// or print to the console and save to a file:
354 | 	// logger := log.NewLogger(log.DEBUG, log.MustToConsoleAndFile("demo.log", -1), 1)
355 |
356 | crawler := predator.NewCrawler(
357 | predator.WithLogger(logger),
358 | )
359 | }
360 | ```
361 |
362 | ### 11 Other considerations
363 |
364 | If you need to serialize some data structures into JSON strings, or deserialize JSON strings, it is recommended to use `github.com/go-predator/predator/json` instead of `encoding/json`.
365 |
366 | ```go
367 | import "github.com/go-predator/predator/json"
368 |
369 | json.Marshal(any) ([]byte, error)
370 | json.Unmarshal([]byte, any) error
371 | json.UnmarshalFromString(string, any) error
372 | ```
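    |
    | A brief round-trip sketch using only the three functions listed above:
    |
    | ```go
    | type User struct {
    | 	Name string `json:"name"`
    | 	Age  int    `json:"age"`
    | }
    |
    | data, err := json.Marshal(User{Name: "Tom", Age: 13})
    | // data: {"name":"Tom","age":13}
    |
    | var u User
    | err = json.UnmarshalFromString(`{"name":"Jerry","age":14}`, &u)
    | ```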
373 |
--------------------------------------------------------------------------------
/README.zh_CN.md:
--------------------------------------------------------------------------------
1 | # Predator
2 | A high-performance crawler framework built on fasthttp.
3 |
4 | ## Usage
5 |
6 | The examples below cover nearly all the features implemented so far; usage can be gleaned from the comments.
7 |
8 | ### 1 Create a Crawler
9 |
10 | ```go
11 | import "github.com/go-predator/predator"
12 |
13 |
14 | func main() {
15 | crawler := predator.NewCrawler(
16 | predator.WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"),
17 | predator.WithCookies(map[string]string{"JSESSIONID": cookie}),
18 | 		predator.WithProxy(ip), // or use a proxy pool: predator.WithProxyPool([]string)
19 | )
20 | }
21 | ```
22 |
23 | There are a few options to enhance the `Crawler` when creating it. See [predator/options.go](https://github.com/go-predator/predator/blob/main/options.go) for the full list.
24 |
25 | ### 2 Send a GET request
26 |
27 | ```go
28 | crawler.Get("http://www.baidu.com")
29 | ```
30 |
31 | Request and response handling is modeled on colly; I find colly's approach very comfortable to work with.
32 |
33 | ```go
34 | // BeforeRequest can patch the request before it is sent
35 | crawler.BeforeRequest(func(r *predator.Request) {
36 | headers := map[string]string{
37 | "Accept": "*/*",
38 | "Accept-Language": "zh-CN",
39 | "Accept-Encoding": "gzip, deflate",
40 | "X-Requested-With": "XMLHttpRequest",
41 | "Origin": "http://example.com",
42 | }
43 |
44 | r.SetHeaders(headers)
45 |
46 | 	// Pass values between the request and the response via the context; see the context section below
47 | r.Ctx.Put("id", 10)
48 | r.Ctx.Put("name", "tom")
49 | })
50 |
51 | crawler.AfterResponse(func(r *predator.Response) {
52 | 	// Read the values from the context that were set when the request was sent
53 | id := r.Ctx.GetAny("id").(int)
54 | name := r.Ctx.Get("name")
55 |
56 | 	// For JSON responses, gjson is recommended
57 | body := gjson.ParseBytes(r.Body)
58 | amount := body.Get("amount").Int()
59 | types := body.Get("types").Array()
60 | })
61 |
62 | // Requests must be issued after BeforeRequest and AfterResponse are registered
63 | crawler.Get("http://www.baidu.com")
64 | ```
65 |
66 | ### 3 Send a POST request
67 |
68 | POST differs slightly from GET. The parameters of each POST request usually vary, and they all live in the request body; re-parsing the body in `BeforeRequest` to recover key parameters works, but is far from the best choice. So when constructing a POST request you can pass a context directly, to carry information over to the response.
69 |
70 | #### 3.1 Plain POST form (application/x-www-form-urlencoded)
71 |
72 | ```go
73 | // BeforeRequest can patch the request before it is sent
74 | crawler.BeforeRequest(func(r *predator.Request) {
75 | headers := map[string]string{
76 | "Accept": "*/*",
77 | "Accept-Language": "zh-CN",
78 | "Accept-Encoding": "gzip, deflate",
79 | "X-Requested-With": "XMLHttpRequest",
80 | "Origin": "http://example.com",
81 | }
82 |
83 | r.SetHeaders(headers)
84 | })
85 |
86 | crawler.AfterResponse(func(r *predator.Response) {
87 | 	// Read the values from the context that were set when the request was sent
88 | id := r.Ctx.GetAny("id").(int)
89 | name := r.Ctx.Get("name")
90 |
91 | 	// For JSON responses, gjson is recommended
92 | body := gjson.ParseBytes(r.Body)
93 | amount := body.Get("amount").Int()
94 | types := body.Get("types").Array()
95 | })
96 |
97 |
98 | body := map[string]string{"foo": "bar"}
99 |
100 | // For POST requests, put the key parameters into the context like this
101 | ctx, _ := context.AcquireCtx()
102 | ctx.Put("id", 10)
103 | ctx.Put("name", "tom")
104 |
105 | crawler.Post("http://www.baidu.com", body, ctx)
106 | ```
107 |
108 | If you don't need a context, simply pass `nil` instead:
109 |
110 | ```go
111 | crawler.Post("http://www.baidu.com", body, nil)
112 | ```
113 |
114 | #### 3.2 Complex POST request (multipart/form-data)
115 |
116 | `multipart/form-data` requests need the dedicated `PostMultipart` method. The example is rather long, so it is not reproduced here.
117 |
118 | See this example for usage: https://github.com/go-predator/predator/blob/main/example/multipart/main.go
119 |
120 | #### 3.3 JSON request
121 |
122 | JSON requests have their own method, `PostJSON`. It automatically adds `Content-Type: application/json` to the request headers, so you don't need to add it again. You still can, though, in which case the `Content-Type` you set wins.
123 |
124 | Example:
125 |
126 | ```go
127 | func main() {
128 | c := NewCrawler()
129 |
130 | c.AfterResponse(func(r *Response) {
131 | 		fmt.Println(r)
132 | })
133 |
134 | type User struct {
135 | Name string `json:"name"`
136 | Age int `json:"age"`
137 | }
138 |
139 | body := map[string]any{
140 | "time": 156546535,
141 | "cid": "10_18772100220-1625540144276-302919",
142 | "args": []int{1, 2, 3, 4, 5},
143 | "dict": map[string]string{
144 | "mod": "1592215036_002", "extend1": "关注", "t": "1628346994", "eleTop": "778",
145 | },
146 | "user": User{"Tom", 13},
147 | }
148 |
149 | c.PostJSON("https://httpbin.org/post", body, nil)
150 | }
151 | ```
152 |
153 | #### 3.4 Other POST requests
154 |
155 | The three approaches above cover most sites, but a small number are special and need the `PostRaw` method:
156 |
157 | ```go
158 | func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error
159 | ```
160 |
161 | You construct the request body yourself. It can take any form; once built, serialize it to `[]byte` and send it, as sketched below.
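    |
    | A brief, hedged sketch (the endpoint and payload are invented):
    |
    | ```go
    | body := []byte(`{"raw":"payload"}`) // any bytes you construct yourself
    | crawler.PostRaw("http://www.example.com/raw", body, nil)
    | ```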
162 |
163 | ### 4 Allow redirects
164 |
165 | For crawling efficiency, redirects are disallowed by default.
166 |
167 | Normal crawling inevitably runs into redirects, though, so you can set a different maximum number of redirects for each request.
168 |
169 | ```go
170 | crawler.BeforeRequest(func(r *predator.Request) {
171 | 	// For a GET request you can branch on r.URL; for a POST request, on the request body. This is only an example
172 | 	if r.URL == caseOne {
173 | 		// allow 1 redirect
174 | 		r.AllowRedirect(1)
175 | 	} else if r.URL == caseTwo {
176 | 		// allow 3 redirects
177 | 		r.AllowRedirect(3)
178 | 	}
179 | })
180 | ```
181 |
182 | Global redirects cannot be configured; they can only be patched per request.
183 |
184 | Of course, if demand for a global redirect option is high enough, it may be added later.
185 |
186 | ### 5 Context
187 |
188 | The context is an interface; two implementations are provided:
189 |
190 | - *ReadOp*: backed by `sync.Map`; suited to scenarios where the context is mostly read
191 | - *WriteOp*: backed by a plain `map`; suited when reads and writes occur at similar rates, or writes outnumber reads. This is the default context
192 |
193 | If your crawler reads the context far more often than it writes, switch to `ReadOp`, as shown below:
194 |
195 | ```go
196 | ctx, err := AcquireCtx(context.ReadOp)
197 | ```
198 |
199 | ### 6 Handling HTML
200 |
201 | Crawler results fall roughly into two kinds: HTML responses and JSON responses.
202 |
203 | Compared with JSON, HTML takes more code to process.
204 |
205 | This framework wraps HTML handling in a set of helpers, so elements can easily be located via CSS selectors and their attributes and text extracted.
206 |
207 | The response `Content-Type` must be `text/html`, or an extension of it such as `text/html; charset=utf-8`, for this method to run.
208 |
209 | ```go
210 | crawl := NewCrawler()
211 |
212 | crawl.ParseHTML("body", func(he *html.HTMLElement) {
213 | 	// inner HTML of the element
214 | 	h, err := he.InnerHTML()
215 | 	// outer HTML of the element
216 | 	h, err := he.OuterHTML()
217 | 	// text of the element (including descendants)
218 | 	he.Text()
219 | 	// an attribute of the element
220 | 	he.Attr("class")
221 | 	// first matching child element
222 | 	he.FirstChild("p")
223 | 	// last matching child element
224 | 	he.LastChild("p")
225 | 	// the 2nd matching child element
226 | 	he.Child("p", 2)
227 | 	// attribute of the first matching child element
228 | 	he.ChildAttr("p", "class")
229 | 	// attributes of all matching child elements, as a slice
230 | 	he.ChildrenAttr("p", "class")
231 | })
232 | ```
233 |
234 | ### 7 Asynchronous / concurrent requests
235 |
236 | ```go
237 | c := NewCrawler(
238 | 	// With this option, requests are issued from a goroutine pool of the given size; without it, requests are sent synchronously.
239 | 	// The size should be neither too small nor too large; benchmark different sizes yourself.
240 | WithConcurrency(30),
241 | )
242 |
243 | c.AfterResponse(func(r *predator.Response) {
244 | // handle response
245 | })
246 |
247 | for i := 0; i < 10; i++ {
248 | c.Post(ts.URL+"/post", map[string]string{
249 | "id": fmt.Sprint(i + 1),
250 | }, nil)
251 | }
252 |
253 | c.Wait()
254 | ```
255 |
256 | ### 8 Using a cache
257 |
258 | By default the cache is disabled and every request goes straight out.
259 |
260 | Implemented caches:
261 |
262 | - MySQL
263 | - PostgreSQL
264 | - Redis
265 | - SQLite3
266 |
267 | The cache interface has a `Compressed(yes bool)` method for compressing responses; after all, responses can be very long, and storing them uncompressed hurts insert and query performance.
268 |
269 | Usage examples for the four backends:
270 |
271 | ```go
272 | // MySQL
273 | c := NewCrawler(
274 | WithCache(&cache.MySQLCache{
275 | Host: "127.0.0.1",
276 | Port: "3306",
277 | Database: "predator",
278 | Username: "root",
279 | Password: "123456",
280 | 	}, false), // false disables compression, true enables it; same below
281 | )
282 |
283 | // PostgreSQL
284 | c := NewCrawler(
285 | WithCache(&cache.PostgreSQLCache{
286 | Host: "127.0.0.1",
287 | Port: "54322",
288 | Database: "predator",
289 | Username: "postgres",
290 | Password: "123456",
291 | }, false),
292 | )
293 |
294 | // Redis
295 | c := NewCrawler(
296 | WithCache(&cache.RedisCache{
297 | Addr: "localhost:6379",
298 | }, true),
299 | )
300 |
301 | // SQLite3
302 | c := NewCrawler(
303 | WithCache(&cache.SQLiteCache{
304 | 		URI: uri, // uri is where the database file lives; prefer a .sqlite extension
305 | }, true),
306 | )
307 | // You can also use the defaults: when the first argument of WithCache is nil,
308 | // SQLite is used as the cache, and the data is stored in predator-cache.sqlite
309 | // in the current directory
310 | c := NewCrawler(WithCache(nil, true))
311 | ```
312 |
313 | ### 9 Proxy
314 |
315 | HTTP and SOCKS5 proxies are supported.
316 |
317 | Include the scheme when specifying a proxy, e.g.:
318 |
319 | ```go
320 | WithProxyPool([]string{"http://ip:port", "socks5://ip:port"})
321 | ```
322 |
323 | ### 10 Logging
324 |
325 | Logging uses the popular [`zerolog`](https://github.com/rs/zerolog) library.
326 |
327 | Logging is disabled by default and must be enabled manually.
328 |
329 | The `WithLogger` option takes a `*predator.LogOp`. When `nil` is passed, logs are pretty-printed to the terminal at the `INFO` level.
330 |
331 | ```go
332 | crawler := predator.NewCrawler(
333 | predator.WithLogger(nil),
334 | )
335 | ```
336 |
337 | `predator.LogOp` exposes four methods:
338 |
339 | - *SetLevel*: sets the log level. Available levels: `DEBUG`, `INFO`, `WARNING`, `ERROR`, `FATAL`
340 |
341 | ```go
342 | logOp := new(predator.LogOp)
343 |   // set the level to INFO
344 | logOp.SetLevel(log.INFO)
345 | ```
346 |
347 | - *ToConsole*: pretty-prints to the terminal.
348 |
349 | - *ToFile*: writes JSON-formatted logs to a file.
350 |
351 | - *ToConsoleAndFile*: pretty-prints to the terminal and writes JSON to a file at the same time.
352 |
353 | A complete logging example:
354 |
355 | ```go
356 | import "github.com/go-predator/predator/log"
357 |
358 | func main() {
359 | logOp := new(predator.LogOp)
360 | logOp.SetLevel(log.INFO)
361 | logOp.ToConsoleAndFile("test.log")
362 |
363 | crawler := predator.NewCrawler(
364 | predator.WithLogger(logOp),
365 | )
366 | }
367 | ```
368 |
369 | ### 11 About JSON
370 |
371 | I originally meant to wrap a JSON package for quickly processing JSON responses, but after a day or two of thought I had no better ideas: everything I could come up with, [gjson](https://github.com/tidwall/gjson) has already solved.
372 |
373 | For JSON responses, process them with `gjson` instead of reaching for deserialization; for a crawler, deserialization is rarely the wise choice.
374 |
375 | If you really do need deserialization, don't use the standard library either: the (de)serialization methods in the wrapped JSON package outperform it.
376 |
377 | ```go
378 | import "github.com/go-predator/predator/json"
379 |
380 | json.Marshal()
381 | json.Unmarshal()
382 | json.UnmarshalFromString()
383 | ```
384 |
385 | For JSON responses, this is enough for now.
386 |
387 | ## Goals
388 |
389 | - [x] Retry failed requests; a request only counts as failed once the configured number of retries is exhausted
390 | - [x] Detect request failures caused by dead proxies. With a proxy pool, the dead proxy is removed from the pool; when the pool is empty, the whole crawler terminates
391 |   - A proxy is used precisely because you don't want to expose your local IP to the target site or server, so once proxies are in use and all of them have failed, no further requests are sent
392 | - [x] HTML page parsing, for convenient element lookup
393 | - [x] A json extension for processing and filtering json response data; the native json library is a poor fit for crawlers
394 |   - I haven't yet found a good way to wrap a truly convenient json helper; the current json package is little more than a usage example
395 | - [x] A goroutine pool that reuses goroutines under concurrency instead of creating them repeatedly
396 | - [x] Define a cache interface and implement one or more caches. Temporary caches are of little use in crawling, so predator uses persistent caches.
397 |   - SQLite3 is the default cache; you can use the other implemented backends or implement the cache interface yourself
398 |   - Available cache stores: SQLite3, MySQL, PostgreSQL, Redis
399 |   - Because caching is persistent, no in-memory cache is provided; implement one against the cache interface if you need it
400 | - [x] A database management interface for saving crawled data, with one or more databases implemented
401 |   - The SQL interface is implemented. NoSQL differs too much from SQL to cover here; implement it yourself if you need it
402 |   - The database interface is not wrapped into Crawler methods; use it as needed. It suffices for common scenarios, while complex ones still require custom database management
403 | - [x] Add logging
404 |   - Possibly still incomplete
405 | - [x] Pool the `Body` of `Request` and `Response` to reduce GC pressure
406 |   - body is a `[]byte`; as a reference type, its memory is not reclaimed as long as the reference is kept
407 |   - Truncating a body that is not already `nil` to `body[:0]` is enough; no pool is needed
408 | - [x] For chained or mixed requests, `POST` and `GET` can use different cache fields
409 | - [x] In chained requests, each request can have its own cache parameters
410 | - [x] Declare a proxy-API handler that takes an integer, requests that many proxies, and returns a slice of proxies to form the pool. It can later be called for one proxy at a time to replenish the pool in real time. Users implement this method themselves.
411 | - [ ] Add robots.txt handling: obey robots.txt by default, with an option to ignore it
412 |
--------------------------------------------------------------------------------
/html/element.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: element.go
5 | * @Created: 2021-07-27 20:35:31
6 | * @Modified: 2022-05-26 20:23:43
7 | */
8 |
9 | package html
10 |
11 | import (
12 | "errors"
13 | "strings"
14 |
15 | "github.com/PuerkitoBio/goquery"
16 | "github.com/go-predator/tools"
17 | "golang.org/x/net/html"
18 | )
19 |
20 | var (
21 | ErrNilElement = errors.New("the current element is nil")
22 | )
23 |
24 | // HTMLElement is the representation of a HTML tag.
25 | type HTMLElement struct {
26 | // Name is the name of the tag
27 | Name string
28 |
29 | // DOM is the goquery parsed DOM object of the page. DOM is relative
30 | // to the current HTMLElement
31 | DOM *goquery.Selection
32 |
33 | // Index stores the position of the current element within
34 | // all the elements matched by an OnHTML callback
35 | Index int
36 |
37 | Node *html.Node
38 | }
39 |
40 | func (he HTMLElement) String() string {
41 | var s strings.Builder
42 |
43 | s.WriteByte('<')
44 | s.WriteString(he.Name)
45 |
46 | for _, attr := range he.Node.Attr {
47 | s.WriteByte(' ')
48 | s.WriteString(attr.Key)
49 |
50 | if len(attr.Val) > 0 {
51 | s.WriteByte('=')
52 | s.WriteByte('"')
53 | s.WriteString(attr.Val)
54 | s.WriteByte('"')
55 | }
56 | }
57 |
58 | s.WriteByte('>')
59 |
60 | if fc := he.Node.FirstChild; fc != nil {
61 | if fc.Type == html.TextNode {
62 | text := strings.TrimSpace(fc.Data)
63 | runes := []rune(text)
64 | if len(runes) == 0 {
65 | s.WriteString("...")
66 | } else if len(runes) > 10 {
67 | s.WriteString(string(runes[:10]))
68 | s.WriteString("...")
69 | } else {
70 | s.WriteString(text)
71 | }
72 | } else {
73 | s.WriteString("...")
74 | }
75 | }
76 |
77 | s.WriteString("")
78 | s.WriteString(he.Name)
79 | s.WriteByte('>')
80 |
81 | return s.String()
82 | }
83 |
84 | // NewHTMLElementFromSelectionNode creates a HTMLElement from a goquery.Selection Node.
85 | func NewHTMLElementFromSelectionNode(s *goquery.Selection, n *html.Node, index int) *HTMLElement {
86 | return &HTMLElement{
87 | Name: n.Data,
88 | DOM: s,
89 | Index: index,
90 | Node: n,
91 | }
92 | }
93 |
94 | // Attr returns the selected attribute of a HTMLElement or empty string
95 | // if no attribute found
96 | func (he *HTMLElement) Attr(key string) string {
97 | for _, attr := range he.Node.Attr {
98 | if attr.Key == key {
99 | return attr.Val
100 | }
101 | }
102 | return ""
103 | }
104 |
105 | // OuterHtml returns the outer HTML rendering of the first item in
106 | // the selection - that is, the HTML including the first element's
107 | // tag and attributes.
108 | func (he *HTMLElement) OuterHTML() (string, error) {
109 | return goquery.OuterHtml(he.DOM)
110 | }
111 |
112 | // InnerHTML gets the HTML contents of the first element in the set of matched
113 | // elements. It includes text and comment nodes.
114 | func (he *HTMLElement) InnerHTML() (string, error) {
115 | return he.DOM.Html()
116 | }
117 |
118 | // Text gets the combined text contents of each element in the set of matched
119 | // elements, including their descendants.
120 | func (he *HTMLElement) Text() string {
121 | return he.DOM.Text()
122 | }
123 |
124 | // Texts Gets all child text elements in the current element and returns a []string
125 | func (he *HTMLElement) Texts() []string {
126 | if he == nil {
127 | return nil
128 | }
129 |
130 | var texts []string
131 |
132 | // Slightly optimized vs calling Each: no single selection object created
133 | var f func(*html.Node)
134 | f = func(n *html.Node) {
135 | if n.Type == html.TextNode {
136 | text := tools.Strip(n.Data)
137 | if text != "" {
138 | 				// When Selection.ReplaceWithHtml replaces the original node with a TextNode,
139 | 				// adjacent text nodes can appear, which is impossible in a real DOM. Because
140 | 				// ReplaceWithHtml is imperfect in this respect, check whether the previous
141 | 				// node is also a text node and, if so, merge the two texts.
142 | if n.PrevSibling != nil && n.PrevSibling.Type == html.TextNode {
143 | if len(texts) > 0 {
144 | texts[len(texts)-1] += text
145 | } else {
146 | texts = append(texts, text)
147 | }
148 | } else {
149 | texts = append(texts, text)
150 | }
151 | }
152 | }
153 | if n.FirstChild != nil {
154 | for c := n.FirstChild; c != nil; c = c.NextSibling {
155 | f(c)
156 | }
157 | }
158 | }
159 | for _, n := range he.DOM.Nodes {
160 | f(n)
161 | }
162 |
163 | return texts
164 | }
165 |
166 | // ChildText returns the concatenated and stripped text content of the matching
167 | // elements.
168 | func (he *HTMLElement) ChildText(selector string) string {
169 | return strings.TrimSpace(he.DOM.Find(selector).Text())
170 | }
171 |
172 | // ChildrenText returns the stripped text content of all the matching
173 | // elements.
174 | func (he *HTMLElement) ChildrenText(selector string) []string {
175 | var res []string
176 | he.Each(selector, func(_ int, h *HTMLElement) bool {
177 | text := h.Text()
178 | if text == "" {
179 | return false
180 | }
181 |
182 | res = append(res, strings.TrimSpace(text))
183 | return false
184 | })
185 | return res
186 | }
187 |
188 | // ChildAttr returns the stripped text content of the first matching
189 | // element's attribute.
190 | func (he *HTMLElement) ChildAttr(selector, attrName string) string {
191 | if attr, ok := he.DOM.Find(selector).Attr(attrName); ok {
192 | return strings.TrimSpace(attr)
193 | }
194 | return ""
195 | }
196 |
197 | // ChildrenAttr returns the stripped text content of all the matching
198 | // element's attributes.
199 | func (he *HTMLElement) ChildrenAttr(selector, attrName string) []string {
200 | var res []string
201 | he.Each(selector, func(_ int, h *HTMLElement) bool {
202 | if attr := h.Attr(attrName); attr != "" {
203 | res = append(res, strings.TrimSpace(attr))
204 | }
205 | return false
206 | })
207 | return res
208 | }
209 |
210 | // Each iterates over the elements matched by the first argument
211 | // and calls the callback function on every HTMLElement match.
212 | //
213 | // The for loop will break when the `callback` returns `true`.
214 | func (he *HTMLElement) Each(selector string, callback func(int, *HTMLElement) bool) {
215 | i := 0
216 | if he == nil {
217 | panic(ErrNilElement)
218 | }
219 |
220 | found := he.DOM.Find(selector)
221 | if found == nil {
222 | return
223 | }
224 |
225 | found.Each(func(_ int, s *goquery.Selection) {
226 | for _, n := range s.Nodes {
227 | if callback(i, NewHTMLElementFromSelectionNode(s, n, i)) {
228 | break
229 | }
230 | i++
231 | }
232 | })
233 | }
234 |
235 | // Child returns the numth matched child element.
236 | // num starts at 1, not at 0.
237 | func (he *HTMLElement) Child(selector string, num int) *HTMLElement {
238 | if he == nil {
239 | panic(ErrNilElement)
240 | }
241 |
242 | s := he.DOM.Find(selector)
243 | nodes := s.Nodes
244 | if len(nodes) == 0 {
245 | return nil
246 | }
247 |
248 | if num == -1 {
249 | num = s.Length()
250 | }
251 |
252 | return NewHTMLElementFromSelectionNode(
253 | goquery.NewDocumentFromNode(nodes[num-1]).Selection,
254 | nodes[num-1],
255 | num-1,
256 | )
257 | }
258 |
259 | // FirstChild returns the first child element that matches the selector.
260 | func (he *HTMLElement) FirstChild(selector string) *HTMLElement {
261 | return he.Child(selector, 1)
262 | }
263 |
264 | // LastChild returns the last child element that matches the selector.
265 | func (he *HTMLElement) LastChild(selector string) *HTMLElement {
266 | return he.Child(selector, -1)
267 | }
268 |
269 | // Parent returns the direct parent element.
270 | func (he *HTMLElement) Parent() *HTMLElement {
271 | 	// If the current element is the root <html> element, return nil
272 | if he.Name == "html" {
273 | return nil
274 | }
275 |
276 | s := he.DOM.Parent()
277 | return NewHTMLElementFromSelectionNode(s, s.Nodes[0], 0)
278 | }
279 |
280 | // Parents returns all parent elements.
281 | func (he *HTMLElement) Parents() []*HTMLElement {
282 | parents := make([]*HTMLElement, 0)
283 |
284 | for {
285 | var parent = he.Parent()
286 | if parent == nil {
287 | break
288 | }
289 | parents = append(parents, parent)
290 | he = parent
291 | }
292 |
293 | return parents
294 | }
295 |
296 | // FindChildByText returns the first child element matching the target text.
297 | func (he *HTMLElement) FindChildByText(selector, text string) *HTMLElement {
298 | var target *HTMLElement
299 | he.Each(selector, func(i int, h *HTMLElement) bool {
300 | if h.Node.FirstChild != nil && h.Node.FirstChild.Type == html.TextNode && h.Node.FirstChild.Data == text {
301 | target = h
302 | return true
303 | }
304 | return false
305 | })
306 | return target
307 | }
308 |
309 | // FindChildByStripedText returns the first child element matching the stripped text.
310 | func (he *HTMLElement) FindChildByStripedText(selector, text string) *HTMLElement {
311 | var target *HTMLElement
312 | he.Each(selector, func(i int, h *HTMLElement) bool {
313 | if h.Node.FirstChild != nil && h.Node.FirstChild.Type == html.TextNode && tools.Strip(h.Node.FirstChild.Data) == text {
314 | target = h
315 | return true
316 | }
317 | return false
318 | })
319 | return target
320 | }
321 |
322 | // Children returns all child elements matching the selector
323 | func (he *HTMLElement) Children(selector string) []*HTMLElement {
324 | children := make([]*HTMLElement, 0, 3)
325 | he.Each(selector, func(i int, h *HTMLElement) bool {
326 | children = append(children, h)
327 | return false
328 | })
329 | return children
330 | }
331 |
332 | // FindChildrenByText returns all the child elements matching the target text.
333 | func (he *HTMLElement) FindChildrenByText(selector, text string) []*HTMLElement {
334 | targets := make([]*HTMLElement, 0, 3)
335 | he.Each(selector, func(i int, h *HTMLElement) bool {
336 | if h.Node.FirstChild != nil && h.Node.FirstChild.Type == html.TextNode && h.Node.FirstChild.Data == text {
337 | targets = append(targets, h)
338 | }
339 | return false
340 | })
341 | return targets
342 | }
343 |
344 | // FindChildrenByStripedText returns all the child elements matching the stripped text.
345 | func (he *HTMLElement) FindChildrenByStripedText(selector, text string) []*HTMLElement {
346 | targets := make([]*HTMLElement, 0, 3)
347 | he.Each(selector, func(i int, h *HTMLElement) bool {
348 | if h.Node.FirstChild != nil && h.Node.FirstChild.Type == html.TextNode && tools.Strip(h.Node.FirstChild.Data) == text {
349 | targets = append(targets, h)
350 | }
351 | return false
352 | })
353 | return targets
354 | }
355 |
--------------------------------------------------------------------------------
/request.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: request.go
5 | * @Created: 2021-07-24 13:29:11
6 | * @Modified: 2022-11-05 22:05:51
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | "bytes"
13 | "crypto/sha1"
14 | "fmt"
15 | "time"
16 |
17 | "net/http"
18 | "net/url"
19 | "os"
20 | "path/filepath"
21 | "sort"
22 | "strings"
23 | "sync"
24 |
25 | pctx "github.com/go-predator/predator/context"
26 | "github.com/go-predator/predator/json"
27 | "github.com/valyala/fasthttp"
28 | )
29 |
30 | type Request struct {
31 | uri *fasthttp.URI
32 | 	// Request headers
33 | Headers *fasthttp.RequestHeader
34 | 	// Context shared between the request and the response
35 | Ctx pctx.Context
36 | 	// Request body
37 | Body []byte
38 | 	// Key-value pairs to be cached
39 | cachedMap map[string]string
40 | 	// Unique identifier
41 | ID uint32
42 | 	// Abort this request
43 | abort bool
44 | 	// Retry, or issue new requests, based on the original crawler
45 | crawler *Crawler
46 | 	// Retry counter
47 | retryCounter uint32
48 | 	// Number of redirects allowed. Defaults to 0, i.e. no redirects.
49 | 	// When greater than 0, at most that many redirects are followed.
50 | 	// Redirects reduce crawling efficiency.
51 | maxRedirectsCount uint
52 | timeout time.Duration
53 | }
54 |
55 | func (r Request) IsCached() (bool, error) {
56 | if r.crawler.cache == nil {
57 | return false, ErrNoCache
58 | }
59 |
60 | hash, err := r.Hash()
61 | if err != nil {
62 | return false, err
63 | }
64 |
65 | _, ok := r.crawler.cache.IsCached(hash)
66 | return ok, nil
67 | }
68 |
69 | func (r *Request) Abort() {
70 | r.abort = true
71 | }
72 |
73 | func (r *Request) SetContentType(contentType string) {
74 | r.Headers.Set("Content-Type", contentType)
75 | }
76 |
77 | // AllowRedirect allows up to `maxRedirectsCount` times to be redirected.
78 | func (r *Request) AllowRedirect(maxRedirectsCount uint) {
79 | r.maxRedirectsCount = maxRedirectsCount
80 | }
81 |
82 | // SetTimeout sets the waiting time for each request before
83 | // the remote end returns a response.
84 | //
85 | // The function doesn't follow redirects.
86 | func (r *Request) SetTimeout(t time.Duration) {
87 | r.timeout = t
88 | }
89 |
90 | func (r *Request) SetHeaders(headers map[string]string) {
91 | for k, v := range headers {
92 | r.Headers.Set(k, v)
93 | }
94 | }
95 |
96 | func (r *Request) SetNewHeaders(headers map[string]string, disableNormalizing bool) {
97 | r.Headers.Reset()
98 |
99 | if disableNormalizing {
100 | r.Headers.DisableNormalizing()
101 | }
102 |
103 | for k, v := range headers {
104 | r.Headers.Set(k, v)
105 | }
106 | }
107 |
108 | func (r Request) headers() map[string]string {
109 | h := make(map[string]string)
110 | r.Headers.VisitAll(func(key, value []byte) {
111 | h[string(key)] = string(value)
112 | })
113 | return h
114 | }
115 |
116 | func (r Request) URL() string {
117 | return r.uri.String()
118 | }
119 |
120 | func (r Request) Method() string {
121 | return string(r.Headers.Method())
122 | }
123 |
124 | func (r Request) NumberOfRetries() uint32 {
125 | return r.retryCounter
126 | }
127 |
128 | func (r Request) Get(u string) error {
129 | return r.Request(MethodGet, u, nil, nil)
130 | }
131 |
132 | func (r Request) GetWithCache(URL string, cacheFields ...CacheField) error {
133 | return r.crawler.get(URL, r.headers(), r.Ctx, true, cacheFields...)
134 | }
135 |
136 | func (r Request) Post(URL string, requestData map[string]string) error {
137 | return r.crawler.post(URL, requestData, r.headers(), r.Ctx, true)
138 | }
139 |
140 | func (r Request) PostWithCache(URL string, requestData map[string]string, cacheFields ...CacheField) error {
141 | return r.crawler.post(URL, requestData, r.headers(), r.Ctx, true, cacheFields...)
142 | }
143 | func (r Request) PostJSON(URL string, requestData map[string]any) error {
144 | return r.crawler.postJSON(URL, requestData, r.headers(), r.Ctx, true)
145 | }
146 |
147 | func (r Request) PostJSONWithCache(URL string, requestData map[string]any, cacheFields ...CacheField) error {
148 | return r.crawler.postJSON(URL, requestData, r.headers(), r.Ctx, true, cacheFields...)
149 | }
150 | func (r Request) PostMultipart(URL string, form *MultipartForm) error {
151 | return r.crawler.postMultipart(URL, form, r.headers(), r.Ctx, true)
152 | }
153 |
154 | func (r Request) PostMultipartWithCache(URL string, form *MultipartForm, cacheFields ...CacheField) error {
155 | return r.crawler.postMultipart(URL, form, r.headers(), r.Ctx, true, cacheFields...)
156 | }
157 |
158 | func (r Request) Request(method, URL string, cachedMap map[string]string, body []byte) error {
159 | return r.crawler.request(method, URL, body, cachedMap, r.Headers, r.Ctx, true)
160 | }
161 |
162 | // AbsoluteURL returns with the resolved absolute URL of an URL chunk.
163 | // AbsoluteURL returns empty string if the URL chunk is a fragment or
164 | // could not be parsed
165 | func (r Request) AbsoluteURL(src string) string {
166 | if strings.HasPrefix(src, "#") {
167 | return ""
168 | }
169 |
170 | u, err := url.Parse(r.URL())
171 | if err != nil {
172 | return ""
173 | }
174 |
175 | absoluteURL, err := u.Parse(src)
176 | if err != nil {
177 | return ""
178 | }
179 | absoluteURL.Fragment = ""
180 | if absoluteURL.Scheme == "//" {
181 | absoluteURL.Scheme = u.Scheme
182 | }
183 | return absoluteURL.String()
184 | }
185 |
186 | type cacheRequest struct {
187 | 	// The URL requested
188 | URL string
189 | 	// The request method
190 | Method string
191 | 	// The map to be cached
192 | CacheKey []byte
193 | }
194 |
195 | func marshalCachedMap(cachedMap map[string]string) []byte {
196 | keys := make([]string, 0, len(cachedMap))
197 | for k := range cachedMap {
198 | keys = append(keys, k)
199 | }
200 | sort.Strings(keys)
201 |
202 | var b bytes.Buffer
203 |
204 | b.WriteByte('{')
205 | for i, k := range keys {
206 | if i > 0 {
207 | b.WriteString(`, `)
208 | }
209 | b.WriteByte('"')
210 | b.WriteString(k)
211 | b.WriteString(`": `)
212 | b.WriteByte('"')
213 | b.WriteString(cachedMap[k])
214 | b.WriteByte('"')
215 | }
216 | b.WriteByte('}')
217 |
218 | return b.Bytes()
219 | }
220 |
221 | func (r Request) marshal() ([]byte, error) {
222 | cr := &cacheRequest{
223 | URL: r.URL(),
224 | Method: r.Method(),
225 | }
226 |
227 | if r.cachedMap != nil {
228 | cr.CacheKey = marshalCachedMap(r.cachedMap)
229 | } else {
230 | cr.CacheKey = r.Body
231 | }
232 |
233 | if r.Method() == MethodGet {
234 | 		// If cachedFields are set for a GET request, the URL must be changing, so the
235 | 		// whole URL cannot serve as the cache marker; CacheKey is the best choice here
236 | if cr.CacheKey != nil {
237 | return cr.CacheKey, nil
238 | } else {
239 | return []byte(r.URL()), nil
240 | }
241 | }
242 |
243 | return json.Marshal(cr)
244 | }
245 |
246 | func (r Request) Hash() (string, error) {
247 | cacheBody, err := r.marshal()
248 | if err != nil {
249 | return "", err
250 | }
251 | return fmt.Sprintf("%x", sha1.Sum(cacheBody)), nil
252 | }
253 |
254 | func (r *Request) Reset() {
255 | ReleaseRequestHeader(r.Headers)
256 | fasthttp.ReleaseURI(r.uri)
257 |
258 | if r.Body != nil {
259 | 		// Truncate the body to length 0. The reference is kept alive, so the
260 | 		// GC does not reclaim the memory and the body can be reused
261 | r.Body = r.Body[:0]
262 | }
263 | for k := range r.cachedMap {
264 | delete(r.cachedMap, k)
265 | }
266 | r.ID = 0
267 | r.abort = false
268 | r.crawler = nil
269 | r.retryCounter = 0
270 | r.maxRedirectsCount = 0
271 | }
272 |
273 | var (
274 | requestPool sync.Pool
275 | requestHeaderPool sync.Pool
276 | )
277 |
278 | // AcquireRequest returns an empty Request instance from request pool.
279 | //
280 | // The returned Request instance may be passed to ReleaseRequest when it is
281 | // no longer needed. This allows Request recycling, reduces GC pressure
282 | // and usually improves performance.
283 | func AcquireRequest() *Request {
284 | v := requestPool.Get()
285 | if v == nil {
286 | return &Request{}
287 | }
288 | return v.(*Request)
289 | }
290 |
291 | // AcquireRequestHeader returns an empty Request Header instance from request-header pool.
292 | //
293 | // The returned Request Header instance may be passed to ReleaseRequestHeader when it is
294 | // no longer needed. This allows Request Header recycling, reduces GC pressure
295 | // and usually improves performance.
296 | func AcquireRequestHeader() *fasthttp.RequestHeader {
297 | v := requestHeaderPool.Get()
298 | if v == nil {
299 | return &fasthttp.RequestHeader{}
300 | }
301 | return v.(*fasthttp.RequestHeader)
302 | }
303 |
304 | // ReleaseRequest returns req acquired via AcquireRequest to request pool.
305 | //
306 | // It is forbidden accessing req and/or its members after returning
307 | // it to request pool.
308 | func ReleaseRequest(req *Request) {
309 | req.Reset()
310 | requestPool.Put(req)
311 | }
312 |
313 | // ReleaseRequestHeader returns request header acquired via AcquireRequestHeader to
314 | // request-header pool.
315 | //
316 | // It is forbidden accessing request-header and/or its members after returning
317 | // it to request-header pool.
318 | func ReleaseRequestHeader(rh *fasthttp.RequestHeader) {
319 | rh.Reset()
320 | requestHeaderPool.Put(rh)
321 | }
322 |
323 | // MultipartForm builds a multipart/form-data request body
324 | type MultipartForm struct {
325 | buf *bytes.Buffer
326 | 	// The number of dashes before the boundary is fixed per site; assign it to this field directly
327 | boundary string
328 | bodyMap map[string]string
329 | }
330 |
331 | func NewMultipartForm(dash string, f CustomRandomBoundary) *MultipartForm {
332 | return &MultipartForm{
333 | buf: &bytes.Buffer{},
334 | boundary: dash + f(),
335 | bodyMap: make(map[string]string),
336 | }
337 | }
338 |
339 | // Boundary returns the Writer's boundary.
340 | func (mf *MultipartForm) Boundary() string {
341 | return mf.boundary
342 | }
343 |
344 | // FormDataContentType returns the Content-Type for an HTTP
345 | // multipart/form-data with this Writer's Boundary.
346 | func (mf *MultipartForm) FormDataContentType() string {
347 | b := mf.boundary
348 | // We must quote the boundary if it contains any of the
349 | // tspecials characters defined by RFC 2045, or space.
350 | if strings.ContainsAny(b, `()<>@,;:\"/[]?= `) {
351 | b = `"` + b + `"`
352 | }
353 | return "multipart/form-data; boundary=" + b
354 | }
355 |
356 | func (mf *MultipartForm) appendHead() {
357 | bodyBoundary := "--" + mf.boundary
358 | mf.buf.WriteString(bodyBoundary + "\r\n")
359 | }
360 |
361 | func (mf *MultipartForm) appendTail() {
362 | mf.buf.WriteString("\r\n")
363 | }
364 |
365 | func (mf *MultipartForm) AppendString(name, value string) {
366 | mf.appendHead()
367 | mf.buf.WriteString(`Content-Disposition: form-data; name="`)
368 | mf.buf.WriteString(name)
369 | mf.buf.WriteByte('"')
370 | mf.buf.WriteString("\r\n\r\n")
371 | mf.buf.WriteString(value)
372 | mf.appendTail()
373 |
374 | mf.bodyMap[name] = value
375 | }
376 |
377 | func getMimeType(buf []byte) string {
378 | return http.DetectContentType(buf)
379 | }
380 |
381 | func (mf *MultipartForm) AppendFile(name, filePath string) error {
382 | _, filename := filepath.Split(filePath)
383 |
384 | mf.appendHead()
385 | mf.buf.WriteString(`Content-Disposition: form-data; name="`)
386 | mf.buf.WriteString(name)
387 | mf.buf.WriteString(`"; filename="`)
388 | mf.buf.WriteString(filename)
389 | mf.buf.WriteByte('"')
390 | mf.buf.WriteString("\r\nContent-Type: ")
391 |
392 | fileBytes, err := os.ReadFile(filePath)
393 | if err != nil {
394 | return err
395 | }
396 |
397 | 	// http.DetectContentType only considers the first 512 bytes, so pass the whole slice
398 | 	contentType := getMimeType(fileBytes) // (slicing fileBytes[:512] would panic on files shorter than 512 bytes)
399 |
400 | mf.buf.WriteString(contentType)
401 | mf.buf.WriteString("\r\n\r\n")
402 |
403 | mf.buf.Write(fileBytes)
404 |
405 | mf.appendTail()
406 |
407 | mf.bodyMap[filename] = filePath
408 |
409 | return nil
410 | }
411 |
412 | func (mf *MultipartForm) Bytes() []byte {
413 | bodyBoundary := "--" + mf.boundary + "--"
414 | mf.buf.WriteString(bodyBoundary)
415 | return mf.buf.Bytes()
416 | }
417 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/craw_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: craw_test.go
5 | * @Created: 2021-07-23 09:22:36
6 | * @Modified: 2022-05-24 09:23:24
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | "bufio"
13 | "bytes"
14 | "encoding/json"
15 | "fmt"
16 | "net/http"
17 | "net/http/httptest"
18 | "reflect"
19 | "strings"
20 | "testing"
21 | "time"
22 |
23 | "github.com/go-predator/log"
24 | "github.com/go-predator/predator/html"
25 | "github.com/go-predator/predator/proxy"
26 |
27 | . "github.com/smartystreets/goconvey/convey"
28 | "github.com/tidwall/gjson"
29 | "github.com/valyala/fasthttp"
30 | )
31 |
32 | func TestNewCrawler(t *testing.T) {
33 | 	Convey("Test setting the User-Agent", t, func() {
34 | for _, ua := range []string{"foo", "bar"} {
35 | c := NewCrawler(WithUserAgent(ua))
36 | So(c.UserAgent, ShouldEqual, ua)
37 | }
38 | })
39 | 	Convey("Test setting cookies", t, func() {
40 | cookie := map[string]string{"foo": "bar"}
41 | c := NewCrawler(WithCookies(cookie))
42 | So(c.cookies, ShouldEqual, cookie)
43 | })
44 | 	Convey("Test setting a specified concurrency", t, func() {
45 | count := 10
46 | c := NewCrawler(WithConcurrency(uint64(count), false))
47 | So(c.goPool.GetCap(), ShouldEqual, count)
48 | })
49 | 	Convey("Test setting the retry count", t, func() {
50 | count := 5
51 | c := NewCrawler(WithRetry(uint32(count), func(r *Response) bool { return true }))
52 | So(c.retryCount, ShouldEqual, count)
53 | })
54 |
55 | 	Convey("Test setting the proxy pool", t, func() {
56 | pp := make([]string, 0, 5)
57 | for i := 1; i <= 5; i++ {
58 | pp = append(pp, fmt.Sprintf("http://localhost:%d000", i))
59 | }
60 | c := NewCrawler(WithProxyPool(pp))
61 | So(reflect.DeepEqual(c.proxyURLPool, pp), ShouldBeTrue)
62 | })
63 | }
64 |
65 | var serverIndexResponse = []byte("hello world\n")
66 |
67 | func server() *httptest.Server {
68 | mux := http.NewServeMux()
69 |
70 | mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
71 | w.WriteHeader(200)
72 | w.Write(serverIndexResponse)
73 | })
74 |
75 | mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) {
76 | if r.Method == "POST" {
77 | w.Header().Set("Content-Type", "text/html")
78 | w.Write([]byte(r.FormValue("name")))
79 | }
80 | })
81 |
82 | mux.HandleFunc("/set_cookie", func(w http.ResponseWriter, r *http.Request) {
83 | c := &http.Cookie{Name: "test", Value: "testv", HttpOnly: false}
84 | http.SetCookie(w, c)
85 | w.WriteHeader(200)
86 | w.Write([]byte("ok"))
87 | })
88 |
89 | mux.HandleFunc("/check_cookie", func(w http.ResponseWriter, r *http.Request) {
90 | cs := r.Cookies()
91 | if len(cs) != 1 || r.Cookies()[0].Value != "testv" {
92 | w.WriteHeader(500)
93 | w.Write([]byte("nok"))
94 | return
95 | }
96 | w.WriteHeader(200)
97 | w.Write([]byte("ok"))
98 | })
99 |
100 | mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
101 | w.Header().Set("Content-Type", "text/html")
102 | 		w.Write([]byte(`
103 | <html>
104 | <head>
105 | <title>Test Page</title>
106 | </head>
107 | <body>
108 | <h1>Hello World</h1>
109 | <p class="description">This is a 1</p>
110 | <p class="description">This is a 2</p>
111 | <p class="description">This is a 3</p>
112 | </body>
113 | </html>
114 | `))
115 | })
116 |
117 | mux.HandleFunc("/redirect", func(w http.ResponseWriter, r *http.Request) {
118 | c := &http.Cookie{Name: "test", Value: "testv", HttpOnly: false}
119 | http.SetCookie(w, c)
120 | http.Redirect(w, r, "/html", http.StatusMovedPermanently)
121 | })
122 |
123 | mux.HandleFunc("/json", func(w http.ResponseWriter, r *http.Request) {
124 | w.Header().Set("Content-Type", "application/json; charset=UTF-8")
125 | if r.Method != "POST" {
126 | w.WriteHeader(403)
127 | w.Write([]byte(`{"msg": "only allow access with post method"}`))
128 | return
129 | }
130 |
131 | ct := r.Header.Get("Content-Type")
132 | if ct != "application/json" {
133 | w.WriteHeader(400)
134 | w.Write([]byte(`{"msg": "unknown content type"}`))
135 | return
136 | }
137 |
138 | w.WriteHeader(200)
139 | w.Write([]byte(`{"msg": "ok"}`))
140 | })
141 |
142 | mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) {
143 | w.Header().Set("Content-Type", "application/octet-stream")
144 | ww := bufio.NewWriter(w)
145 | defer ww.Flush()
146 | for {
147 | // have to check error to detect client aborting download
148 | if _, err := ww.Write([]byte{0x41}); err != nil {
149 | return
150 | }
151 | }
152 | })
153 |
154 | mux.HandleFunc("/post", func(w http.ResponseWriter, r *http.Request) {
155 | if r.Method == "POST" {
156 | w.Header().Set("Content-Type", "text/html")
157 | w.Write([]byte(r.FormValue("id")))
158 |
159 | // Sleep a random number of seconds to test concurrency
160 | // rand.Seed(time.Now().UnixNano())
161 | // time.Sleep(time.Duration(rand.Intn(5)) * time.Second)
162 | return
163 | }
164 | })
165 |
166 | return httptest.NewServer(mux)
167 | }
168 |
169 | func TestRequest(t *testing.T) {
170 | ts := server()
171 | defer ts.Close()
172 |
173 | Convey("Test context passing between request and response, and the response result", t, func() {
174 | c := NewCrawler()
175 |
176 | c.BeforeRequest(func(r *Request) {
177 | r.Ctx.Put("k", "v")
178 | })
179 |
180 | c.AfterResponse(func(r *Response) {
181 | v := r.Ctx.Get("k")
182 | So(v, ShouldEqual, "v")
183 | So(bytes.Equal(serverIndexResponse, r.Body), ShouldBeTrue)
184 | })
185 |
186 | c.Get(ts.URL)
187 | })
188 |
189 | Convey("Test POST", t, func() {
190 | requestData := map[string]string{
191 | "name": "tom",
192 | "password": "123456",
193 | }
194 |
195 | c := NewCrawler()
196 |
197 | c.BeforeRequest(func(r *Request) {
198 | r.Ctx.Put("k", 2)
199 | })
200 |
201 | c.AfterResponse(func(r *Response) {
202 | v := r.Ctx.GetAny("k").(int)
203 | So(v, ShouldEqual, 2)
204 | So(string(r.Body), ShouldEqual, requestData["name"])
205 | So(string(r.Headers.Peek("Content-Type")), ShouldEqual, "text/html")
206 |
207 | })
208 |
209 | c.Post(ts.URL+"/login", requestData, nil)
210 | })
211 |
212 | // To run this example, update the cookie and auth_token yourself
213 | Convey("Test PostMultipart", t, func() {
214 | c := NewCrawler(
215 | WithCookies(map[string]string{
216 | "PHPSESSID": "7ijqglcno1cljiqs76t2vo5oh2",
217 | }))
218 | form := NewMultipartForm(
219 | "-------------------",
220 | randomBoundary,
221 | )
222 |
223 | var err error
224 |
225 | form.AppendString("type", "file")
226 | form.AppendString("action", "upload")
227 | form.AppendString("timestamp", "1627871450610")
228 | form.AppendString("auth_token", "f43cdc8a537eff5169dfddb946c2365d1f897b0c")
229 | form.AppendString("nsfw", "0")
230 | err = form.AppendFile("source", "/Users/thepoy/Pictures/Nginx.png")
231 | So(err, ShouldBeNil)
232 |
233 | c.AfterResponse(func(r *Response) {
234 | status := gjson.ParseBytes(r.Body).Get("status_code").Int()
235 | So(status, ShouldEqual, fasthttp.StatusOK)
236 | })
237 |
238 | err = c.PostMultipart("https://imgtu.com/json", form, nil)
239 | So(err, ShouldBeNil)
240 | })
241 |
242 | }
243 |
244 | func TestHTTPProxy(t *testing.T) {
245 | ts := server()
246 | defer ts.Close()
247 |
248 | u := "https://api.bilibili.com/x/web-interface/zone?jsonp=jsonp"
249 | validIP := "http://123.73.209.237:46603"
250 | Convey("Test a valid proxy", t, func() {
251 | c := NewCrawler(
252 | WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"),
253 | WithProxy(validIP),
254 | WithLogger(nil),
255 | )
256 |
257 | c.AfterResponse(func(r *Response) {
258 | ip := gjson.ParseBytes(r.Body).Get("data.addr").String()
259 |
260 | So(ip, ShouldEqual, strings.Split(strings.Split(validIP, "//")[1], ":")[0])
261 | })
262 |
263 | c.Get(u)
264 | })
265 |
266 | Convey("Test panic when the proxy pool is empty", t, func() {
267 | defer func() {
268 | if err := recover(); err != nil {
269 | So(err.(proxy.ProxyErr).Code, ShouldEqual, proxy.ErrEmptyProxyPoolCode)
270 | }
271 | }()
272 | ips := []string{
273 | "http://14.134.203.22:45104",
274 | "http://14.134.204.22:45105",
275 | "http://14.134.205.22:45106",
276 | "http://14.134.206.22:45107",
277 | "http://14.134.207.22:45108",
278 | "http://14.134.208.22:45109",
279 | }
280 | c := NewCrawler(WithProxyPool(ips), WithLogger(nil))
281 |
282 | c.Get(u)
283 | })
284 |
285 | Convey("Test removing one or more invalid proxies from the proxy pool", t, func() {
286 | ips := []string{
287 | "http://14.134.204.22:45105",
288 | validIP,
289 | "http://14.134.205.22:45106",
290 | "http://14.134.206.22:45107",
291 | "http://27.29.155.141:45118",
292 | "http://14.134.208.22:45109",
293 | }
294 | c := NewCrawler(WithProxyPool(ips), WithLogger(nil))
295 |
296 | c.AfterResponse(func(r *Response) {
297 | ip := gjson.ParseBytes(r.Body).Get("data.addr").String()
298 | So(c.ProxyPoolAmount(), ShouldBeLessThanOrEqualTo, len(ips))
299 | So(ip, ShouldEqual, strings.Split(strings.Split(validIP, "//")[1], ":")[0])
300 | })
301 |
302 | err := c.Get(u)
303 | So(err, ShouldBeNil)
304 | })
305 |
306 | Convey("Test random selection among multiple valid proxies", t, func() {
307 | count := 5
308 | u := "http://t.ipjldl.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&groupid=0&qty=%d&time=1&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson=&usertype=2"
309 | client := &fasthttp.Client{}
310 | body := make([]byte, 0)
311 | _, body, err := client.Get(body, fmt.Sprintf(u, count))
312 | if err != nil {
313 | panic(err)
314 | }
315 |
316 | ips := strings.Split(string(body), "\r\n")
317 | for i := 0; i < len(ips); i++ {
318 | ips[i] = "http://" + ips[i]
319 | }
320 |
321 | c := NewCrawler(WithProxyPool(ips), WithDefaultLogger())
322 |
323 | c.BeforeRequest(func(r *Request) {
324 | r.SetHeaders(map[string]string{
325 | // Avoid being unable to switch proxies because of keep-alive responses
326 | "Connection": "close",
327 | })
328 | })
329 |
330 | c.AfterResponse(func(r *Response) {
331 | ip := gjson.ParseBytes(r.Body).Get("data.addr").String()
332 | t.Log(ip)
333 | })
334 |
335 | ipu := "https://api.bilibili.com/x/web-interface/zone?jsonp=jsonp"
336 | for i := 0; i < count*2; i++ {
337 | err := c.Get(ipu)
338 | So(err, ShouldBeNil)
339 | }
340 | })
341 | }
342 |
343 | func TestSocks5Proxy(t *testing.T) {
344 | proxyIP := "socks5://222.37.211.49:46601"
345 | u := "https://api.bilibili.com/x/web-interface/zone?jsonp=jsonp"
346 |
347 | Convey("Test a valid proxy", t, func() {
348 | c := NewCrawler(
349 | WithProxy(proxyIP),
350 | )
351 |
352 | c.AfterResponse(func(r *Response) {
353 | t.Log(r)
354 |
355 | ip := gjson.ParseBytes(r.Body).Get("data.addr").String()
356 |
357 | So(ip, ShouldEqual, strings.Split(strings.Split(proxyIP, "//")[1], ":")[0])
358 | })
359 |
360 | err := c.Get(u)
361 | So(err, ShouldBeNil)
362 | })
363 | }
364 |
365 | func TestRetry(t *testing.T) {
366 | ts := server()
367 | defer ts.Close()
368 |
369 | Convey("Test retrying failed responses", t, func() {
370 | cookie := map[string]string{"test": "ha"}
371 | c := NewCrawler(
372 | WithCookies(cookie),
373 | WithRetry(5, func(r *Response) bool {
374 | return r.StatusCode != 200
375 | }),
376 | )
377 |
378 | c.AfterResponse(func(r *Response) {
379 | So(r.Request.NumberOfRetries(), ShouldEqual, 5)
380 | So(r.StatusCode, ShouldNotEqual, 200)
381 | })
382 |
383 | c.Get(ts.URL + "/check_cookie")
384 | })
385 | }
386 |
387 | func TestCookies(t *testing.T) {
388 | ts := server()
389 | defer ts.Close()
390 |
391 | Convey("Test the Set-Cookie response header", t, func() {
392 | c := NewCrawler()
393 |
394 | c.AfterResponse(func(r *Response) {
395 | So(r.StatusCode, ShouldEqual, 200)
396 | So(string(r.Headers.Peek("Set-Cookie")), ShouldEqual, "test=testv")
397 | })
398 |
399 | c.Get(ts.URL + "/set_cookie")
400 | })
401 |
402 | Convey("Test requests with cookies", t, func() {
403 | Convey("Success", func() {
404 | cookie := map[string]string{"test": "testv"}
405 | c := NewCrawler(WithCookies(cookie))
406 |
407 | c.AfterResponse(func(r *Response) {
408 | So(r.StatusCode, ShouldEqual, 200)
409 | So(r.String(), ShouldEqual, "ok")
410 | })
411 |
412 | c.Get(ts.URL + "/check_cookie")
413 | })
414 | Convey("Failure", func() {
415 | cookie := map[string]string{"test": "ha"}
416 | c := NewCrawler(WithCookies(cookie))
417 |
418 | c.AfterResponse(func(r *Response) {
419 | So(r.StatusCode, ShouldEqual, 500)
420 | So(r.String(), ShouldEqual, "nok")
421 | })
422 |
423 | c.Get(ts.URL + "/check_cookie")
424 | })
425 | })
426 | }
427 |
428 | func TestJSON(t *testing.T) {
429 | ts := server()
430 | defer ts.Close()
431 |
432 | type TestResponse struct {
433 | Msg string `json:"msg"`
434 | }
435 |
436 | Convey("Test whether the request method is correct", t, func() {
437 | Convey("Wrong", func() {
438 | c := NewCrawler()
439 |
440 | c.AfterResponse(func(r *Response) {
441 | So(r.StatusCode, ShouldEqual, 403)
442 | So(r.ContentType(), ShouldEqual, "application/json; charset=UTF-8")
443 |
444 | var j TestResponse
445 | json.Unmarshal(r.Body, &j)
446 | So(j.Msg, ShouldEqual, "only allow access with post method")
447 | })
448 |
449 | c.Get(ts.URL + "/json")
450 | })
451 | Convey("Correct", func() {
452 | c := NewCrawler()
453 |
454 | c.AfterResponse(func(r *Response) {
455 | So(r.StatusCode, ShouldEqual, 400)
456 | So(r.ContentType(), ShouldEqual, "application/json; charset=UTF-8")
457 |
458 | var j TestResponse
459 | json.Unmarshal(r.Body, &j)
460 | So(j.Msg, ShouldEqual, "unknown content type")
461 | })
462 |
463 | c.Post(ts.URL+"/json", nil, nil)
464 | })
465 | })
466 |
467 | Convey("Test the request Content-Type header", t, func() {
468 | c := NewCrawler()
469 |
470 | c.BeforeRequest(func(r *Request) {
471 | r.SetContentType("application/json")
472 | })
473 |
474 | c.AfterResponse(func(r *Response) {
475 | So(r.StatusCode, ShouldEqual, 200)
476 | So(r.ContentType(), ShouldEqual, "application/json; charset=UTF-8")
477 |
478 | var j TestResponse
479 | json.Unmarshal(r.Body, &j)
480 | So(j.Msg, ShouldEqual, "ok")
481 | })
482 |
483 | c.Post(ts.URL+"/json", nil, nil)
484 | })
485 |
486 | Convey("Test a complete JSON request and response", t, func() {
487 | c := NewCrawler()
488 |
489 | c.AfterResponse(func(r *Response) {
490 | t.Log(r)
491 | })
492 |
493 | type User struct {
494 | Name string `json:"name"`
495 | Age int `json:"age"`
496 | }
497 |
498 | body := map[string]any{
499 | "time": 156546535,
500 | "cid": "10_18772100220-1625540144276-302919",
501 | "args": []int{1, 2, 3, 4, 5},
502 | "dict": map[string]string{
503 | "mod": "1592215036_002", "extend1": "关注", "t": "1628346994", "eleTop": "778",
504 | },
505 | "user": User{"Tom", 13},
506 | }
507 |
508 | c.PostJSON("https://httpbin.org/post", body, nil)
509 | })
510 |
511 | Convey("Test a complete JSON request and response with caching", t, func() {
512 | c := NewCrawler(
513 | WithCache(nil, false, nil, CacheField{requestBodyParam, "cid"}, CacheField{requestBodyParam, "user.name"}, CacheField{requestBodyParam, "user.age"}),
514 | )
515 |
516 | c.AfterResponse(func(r *Response) {
517 | t.Log(r.FromCache)
518 | })
519 |
520 | type User struct {
521 | Name string `json:"name"`
522 | Age int `json:"age"`
523 | }
524 |
525 | body := map[string]any{
526 | "time": 156546535,
527 | "cid": "10_18772100220-1625540144276-302919",
528 | "args": []int{1, 2, 3, 4, 5},
529 | "dict": map[string]string{
530 | "mod": "1592215036_002", "extend1": "关注", "t": "1628346994", "eleTop": "778",
531 | },
532 | "user": User{"Tom", 13},
533 | }
534 |
535 | c.PostJSON("https://httpbin.org/post", body, nil)
536 | })
537 | }
538 |
539 | func TestJSONWithInvalidCacheField(t *testing.T) {
540 | c := NewCrawler(
541 | WithCache(nil, false, nil, CacheField{requestBodyParam, "id"}, CacheField{requestBodyParam, "user.name"}, CacheField{requestBodyParam, "user.age"}),
542 | WithLogger(nil),
543 | )
544 |
545 | c.AfterResponse(func(r *Response) {
546 | t.Log(r.FromCache)
547 | })
548 |
549 | type User struct {
550 | Name string `json:"name"`
551 | Age int `json:"age"`
552 | }
553 |
554 | body := map[string]any{
555 | "time": 156546535,
556 | "cid": "10_18772100220-1625540144276-302919",
557 | "args": []int{1, 2, 3, 4, 5},
558 | "dict": map[string]string{
559 | "mod": "1592215036_002", "extend1": "关注", "t": "1628346994", "eleTop": "778",
560 | },
561 | "user": User{"Tom", 13},
562 | }
563 |
564 | c.PostJSON("https://httpbin.org/post", body, nil)
565 | }
566 |
567 | func TestParseHTML(t *testing.T) {
568 | ts := server()
569 | defer ts.Close()
570 |
571 | Convey("Test HTML parsing", t, func() {
572 | crawl := NewCrawler()
573 |
574 | Convey("Test parsing the outer HTML", func() {
575 | crawl.ParseHTML("body", func(he *html.HTMLElement, r *Response) {
576 | h, err := he.OuterHTML()
577 | So(err, ShouldBeNil)
578 | So(h, ShouldEqual, `<body>
579 | <h1>Hello World</h1>
580 | <p class="description">This is a 1</p>
581 | <p class="description">This is a 2</p>
582 | <p class="description">This is a 3</p>
583 |
584 |
585 | </body>`)
586 | })
587 | })
588 |
589 | Convey("Test parsing the inner HTML", func() {
590 | crawl.ParseHTML("body", func(he *html.HTMLElement, r *Response) {
591 | h, err := he.InnerHTML()
592 | So(err, ShouldBeNil)
593 | So(h, ShouldEqual, `
594 | <h1>Hello World</h1>
595 | <p class="description">This is a 1</p>
596 | <p class="description">This is a 2</p>
597 | <p class="description">This is a 3</p>
598 |
599 |
600 | `)
601 | })
602 | })
603 |
604 | Convey("Test parsing the inner text", func() {
605 | crawl.ParseHTML("title", func(he *html.HTMLElement, r *Response) {
606 | So(he.Text(), ShouldEqual, "Test Page")
607 | })
608 | })
609 |
610 | Convey("Test getting an attribute", func() {
611 | crawl.ParseHTML("p", func(he *html.HTMLElement, r *Response) {
612 | attr := he.Attr("class")
613 | So(attr, ShouldEqual, "description")
614 | })
615 | })
616 |
617 | Convey("Test finding child elements", func() {
618 | crawl.ParseHTML("body", func(he *html.HTMLElement, r *Response) {
619 | So(he.FirstChild("p").Attr("class"), ShouldEqual, "description")
620 | So(he.Child("p", 2).Text(), ShouldEqual, "This is a 2")
621 | So(he.ChildAttr("p", "class"), ShouldEqual, "description")
622 | So(len(he.ChildrenAttr("p", "class")), ShouldEqual, 3)
623 | })
624 | })
625 |
626 | crawl.Get(ts.URL + "/html")
627 | })
628 | }
629 |
630 | func timeCost() func() {
631 | start := time.Now()
632 | return func() {
633 | tc := time.Since(start)
634 | fmt.Printf("time cost = %v\n", tc)
635 | }
636 | }
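// timeCost is meant to be called twice in one statement: the first call
// records the start time, and the deferred second call prints the elapsed
// time. A usage sketch (illustrative only):
//
//	func doWork() {
//		defer timeCost()()
//		// ... timed work ...
//	}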
637 |
638 | func TestConcurrency(t *testing.T) {
639 | ts := server()
640 | defer ts.Close()
641 |
642 | Convey("Test the time cost of concurrent vs. synchronous requests", t, func() {
643 | Convey("Concurrent", func() {
644 | start := time.Now()
645 | c := NewCrawler(
646 | WithConcurrency(30, false),
647 | )
648 |
649 | for i := 0; i < 10; i++ {
650 | err := c.Post(ts.URL+"/post", map[string]string{
651 | "id": fmt.Sprint(i + 1),
652 | }, nil)
653 | So(err, ShouldBeNil)
654 | }
655 |
656 | delta := time.Since(start)
657 | t.Log(delta)
658 | })
659 |
660 | Convey("Synchronous", func() {
661 | start := time.Now()
662 | c := NewCrawler()
663 |
664 | for i := 0; i < 10; i++ {
665 | err := c.Post(ts.URL+"/post", map[string]string{
666 | "id": fmt.Sprint(i + 1),
667 | }, nil)
668 | So(err, ShouldBeNil)
669 | }
670 |
671 | delta := time.Since(start)
672 | t.Log(delta)
673 | })
674 | })
675 | }
676 |
677 | func TestLog(t *testing.T) {
678 | ts := server()
679 | defer ts.Close()
680 |
681 | Convey("Pretty-print the INFO level in the terminal by default\n", t, func() {
682 | c := NewCrawler(
683 | WithLogger(nil),
684 | )
685 |
686 | c.Get(ts.URL)
687 | })
688 |
689 | Convey("Pretty-print the DEBUG level in the terminal\n", t, func() {
690 | c := NewCrawler(
691 | WithLogger(log.NewLogger(log.DEBUG, log.ToConsole())),
692 | )
693 |
694 | c.BeforeRequest(func(r *Request) {
695 | r.Ctx.Put("key", "value")
696 | })
697 |
698 | c.Get(ts.URL)
699 | })
700 |
701 | Convey("Save logs to a file\n", t, func() {
702 | c := NewCrawler(
703 | WithLogger(log.NewLogger(log.DEBUG, log.MustToFile("test.log", -1))),
704 | )
705 |
706 | c.Get(ts.URL)
707 | })
708 |
709 | Convey("Save logs to a file and also print to the terminal\n", t, func() {
710 | c := NewCrawler(
711 | WithLogger(log.NewLogger(log.DEBUG, log.MustToConsoleAndFile("test2.log", -1))),
712 | )
713 |
714 | c.BeforeRequest(func(r *Request) {
715 | r.Ctx.Put("key", "value")
716 | })
717 |
718 | c.Get(ts.URL)
719 | })
720 | }
721 |
722 | func TestRedirect(t *testing.T) {
723 | ts := server()
724 | defer ts.Close()
725 |
726 | Convey("Test the default behavior", t, func() {
727 | c := NewCrawler()
728 |
729 | c.AfterResponse(func(r *Response) {
730 | So(r.StatusCode, ShouldEqual, 301)
731 | })
732 |
733 | c.Get(ts.URL + "/redirect")
734 | })
735 |
736 | Convey("Test setting the allowed number of redirects", t, func() {
737 | c := NewCrawler()
738 |
739 | c.BeforeRequest(func(r *Request) {
740 | r.AllowRedirect(1)
741 | })
742 |
743 | c.AfterResponse(func(r *Response) {
744 | So(r.StatusCode, ShouldEqual, 200)
745 | })
746 |
747 | c.Get(ts.URL + "/redirect")
748 | })
749 | }
750 |
751 | func getRawCookie(c *Crawler, ts *httptest.Server) string {
752 | var rawCookie string
753 |
754 | c.AfterResponse(func(r *Response) {
755 | if r.StatusCode == 301 {
756 | rawCookie = string(r.Headers.Peek("Set-Cookie"))
757 | }
758 | })
759 |
760 | c.Post(ts.URL+"/redirect", map[string]string{"username": "test", "password": "test"}, nil)
761 | return rawCookie
762 | }
763 |
764 | func TestClone(t *testing.T) {
765 | ts := server()
766 | defer ts.Close()
767 |
768 | Convey("Test cloning", t, func() {
769 | c := NewCrawler()
770 |
771 | rawCookie := getRawCookie(c, ts)
772 |
773 | WithRawCookie(rawCookie)(c)
774 | WithConcurrency(10, false)(c)
775 |
776 | c.AfterResponse(func(r *Response) {
777 | fmt.Println(r.StatusCode)
778 | fmt.Println(r)
779 | So(r.StatusCode, ShouldEqual, 200)
780 | So(r.String(), ShouldEqual, "ok")
781 | })
782 |
783 | c.Get(ts.URL + "/check_cookie")
784 | c.Wait()
785 | })
786 | }
787 |
--------------------------------------------------------------------------------
/craw.go:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: thepoy
3 | * @Email: thepoy@163.com
4 | * @File Name: craw.go
5 | * @Created: 2021-07-23 08:52:17
6 | * @Modified: 2022-11-29 16:13:58
7 | */
8 |
9 | package predator
10 |
11 | import (
12 | "context"
13 | "errors"
14 | "fmt"
15 | "math/rand"
16 | "net"
17 | "net/url"
18 | "strings"
19 | "sync"
20 | "sync/atomic"
21 | "time"
22 |
23 | "github.com/PuerkitoBio/goquery"
24 | "github.com/go-predator/log"
25 | pctx "github.com/go-predator/predator/context"
26 | "github.com/go-predator/predator/html"
27 | "github.com/go-predator/predator/json"
28 | "github.com/go-predator/predator/proxy"
29 | "github.com/valyala/fasthttp"
30 | )
31 |
32 | // HandleRequest is used to modify the request before it is sent
33 | type HandleRequest func(r *Request)
34 |
35 | // HandleResponse is used to handle the response
36 | type HandleResponse func(r *Response)
37 |
38 | // HandleHTML is used to process html
39 | type HandleHTML func(he *html.HTMLElement, r *Response)
40 |
41 | type HandleJSON func(j json.JSONResult, r *Response)
42 |
43 | // HTMLParser is used to parse html
44 | type HTMLParser struct {
45 | Selector string
46 | Handle HandleHTML
47 | }
48 |
49 | // JSONParser is used to parse json
50 | type JSONParser struct {
51 | strict bool
52 | Handle HandleJSON
53 | }
54 |
55 | // CustomRandomBoundary generates a custom boundary
56 | type CustomRandomBoundary func() string
57 |
58 | type CacheCondition func(r *Response) bool
59 |
60 | type ProxyInvalidCondition func(r *Response) error
61 |
62 | type ComplementProxyPool func() []string
63 |
64 | // Crawler is the provider of crawlers
65 | type Crawler struct {
66 | lock *sync.RWMutex
67 | // UserAgent is the User-Agent string used by HTTP requests
68 | UserAgent string
69 | retryCount uint32
70 | // Retry condition, the crawler will retry only
71 | // if it returns true
72 | retryCondition RetryCondition
73 | client *fasthttp.Client
74 | cookies map[string]string
75 | goPool *Pool
76 | proxyURLPool []string
77 | proxyInvalidCondition ProxyInvalidCondition
78 | proxyInUse string
79 | complementProxyPool ComplementProxyPool
80 | requestCount uint32
81 | responseCount uint32
82 | // TODO: In a multi-goroutine setting, this context can be used to exit or cancel multiple goroutines
83 | Context context.Context
84 |
85 | // Cache successful response
86 | cache Cache
87 | // List of fields to be cached in the request body, and
88 | // the combination of these fields can represent the unique
89 | // request body.
90 | // The fewer fields the better.
91 | cacheFields []CacheField
92 | cacheCondition CacheCondition
93 |
94 | requestHandler []HandleRequest
95 |
96 | // Array of functions to handle the response
97 | responseHandler []HandleResponse
98 | // Array of functions to handle parsed html
99 | htmlHandler []*HTMLParser
100 | jsonHandler []*JSONParser
101 |
102 | wg *sync.WaitGroup
103 |
104 | log *log.Logger
105 | }
106 |
107 | // NewCrawler creates a new Crawler instance with some CrawlerOptions
108 | func NewCrawler(opts ...CrawlerOption) *Crawler {
109 | c := new(Crawler)
110 |
111 | c.UserAgent = "Predator"
112 |
113 | c.client = new(fasthttp.Client)
114 |
115 | for _, op := range opts {
116 | op(c)
117 | }
118 |
119 | // If there is `DEBUG` in the environment variable and `c.log` is nil,
120 | // create a logger with a level of `DEBUG`
121 | if c.log == nil && log.IsDebug() {
122 | c.log = log.NewLogger(
123 | log.DEBUG,
124 | log.ToConsole(),
125 | )
126 | }
127 |
128 | c.lock = &sync.RWMutex{}
129 |
130 | c.Context = context.Background()
131 |
132 | capacityState := c.goPool != nil
133 |
134 | if c.log != nil {
135 | if capacityState {
136 | c.Info("concurrent", log.Arg{Key: "state", Value: capacityState}, log.Arg{Key: "capacity", Value: c.goPool.capacity})
137 | } else {
138 | c.Info("concurrent", log.Arg{Key: "state", Value: capacityState})
139 | }
140 | }
141 |
142 | if c.log != nil && c.goPool != nil {
143 | c.goPool.log = c.log
144 | }
145 |
146 | return c
147 | }
148 |
149 | // Clone creates an exact copy of a Crawler without callbacks.
150 | func (c *Crawler) Clone() *Crawler {
151 | var (
152 | pool *Pool
153 | err error
154 | )
155 | if c.goPool == nil {
156 | pool = nil
157 | } else {
158 | pool, err = NewPool(c.goPool.capacity)
159 | if err != nil {
160 | c.FatalOrPanic(err)
161 | }
162 | }
163 | return &Crawler{
164 | lock: c.lock,
165 | UserAgent: c.UserAgent,
166 | retryCount: c.retryCount,
167 | retryCondition: c.retryCondition,
168 | client: c.client,
169 | cookies: c.cookies,
170 | goPool: pool,
171 | proxyURLPool: c.proxyURLPool,
172 | Context: c.Context,
173 | cache: c.cache,
174 | cacheCondition: c.cacheCondition,
175 | cacheFields: c.cacheFields,
176 | requestHandler: make([]HandleRequest, 0, 5),
177 | responseHandler: make([]HandleResponse, 0, 5),
178 | htmlHandler: make([]*HTMLParser, 0, 5),
179 | jsonHandler: make([]*JSONParser, 0, 1),
180 | wg: &sync.WaitGroup{},
181 | log: c.log,
182 | }
183 | }
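// A minimal usage sketch for Clone (the URL is illustrative): the clone keeps
// the configuration but starts with empty callback lists, so register fresh ones.
//
//	c := NewCrawler(WithUserAgent("Predator"), WithConcurrency(10, false))
//	c2 := c.Clone()
//	c2.AfterResponse(func(r *Response) { /* handle the response */ })
//	c2.Get("https://example.com")
//	c2.Wait()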
184 |
185 | /************************* HTTP request methods ****************************/
186 |
187 | func (c *Crawler) request(method, URL string, body []byte, cachedMap map[string]string, reqHeader *fasthttp.RequestHeader, ctx pctx.Context, isChained bool) error {
188 | defer func() {
189 | if c.goPool != nil {
190 | if err := recover(); err != nil {
191 | c.FatalOrPanic(fmt.Errorf("worker panic: %s", err))
192 | }
193 | }
194 | }()
195 |
196 | var err error
197 |
198 | reqHeader.SetMethod(method)
199 | if reqHeader.UserAgent() == nil {
200 | reqHeader.SetUserAgent(c.UserAgent)
201 | }
202 |
203 | if c.cookies != nil {
204 | for k, v := range c.cookies {
205 | reqHeader.SetCookie(k, v)
206 | }
207 | if c.log != nil {
208 | c.Debug("cookies is set", log.Arg{Key: "cookies", Value: reqHeader.Peek("Cookie")})
209 | }
210 | }
211 |
212 | if ctx == nil {
213 | ctx, err = pctx.AcquireCtx()
214 | if err != nil {
215 | if c.log != nil {
216 | c.log.Error(err)
217 | }
218 | return err
219 | }
220 | }
221 |
222 | u, err := url.Parse(URL)
223 | if err != nil {
224 | return err
225 | }
226 | // Convert non-ascii characters in query parameters to ascii characters
227 | u.RawQuery = u.Query().Encode()
228 |
229 | uri := fasthttp.AcquireURI()
230 | uri.Parse([]byte(u.Host), []byte(u.String()))
231 |
232 | request := AcquireRequest()
233 | request.Headers = reqHeader
234 | request.Ctx = ctx
235 | request.Body = body
236 | request.cachedMap = cachedMap
237 | request.ID = atomic.AddUint32(&c.requestCount, 1)
238 | request.crawler = c
239 | request.uri = uri
240 |
241 | if c.goPool != nil {
242 | c.wg.Add(1)
243 | task := &Task{
244 | crawler: c,
245 | req: request,
246 | isChained: isChained,
247 | }
248 | err = c.goPool.Put(task)
249 | if err != nil {
250 | if c.log != nil {
251 | c.log.Error(err)
252 | }
253 | return err
254 | }
255 | return nil
256 | }
257 |
258 | err = c.prepare(request, isChained)
259 | if err != nil {
260 | return err
261 | }
262 |
263 | return nil
264 | }
265 |
266 | func (c *Crawler) prepare(request *Request, isChained bool) (err error) {
267 | if c.goPool != nil {
268 | defer c.wg.Done()
269 | }
270 |
271 | c.processRequestHandler(request)
272 |
273 | if request.abort {
274 | if c.log != nil {
275 | c.Debug("the request is aborted", log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)})
276 | }
277 | return
278 | }
279 |
280 | if c.log != nil {
281 | c.Info(
282 | "requesting",
283 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)},
284 | log.Arg{Key: "method", Value: request.Method()},
285 | log.Arg{Key: "url", Value: request.URL()},
286 | log.Arg{Key: "timeout", Value: request.timeout.String()},
287 | )
288 | }
289 |
290 | if request.Ctx.Length() > 0 {
291 | if c.log != nil {
292 | c.Debug("using context", log.Arg{Key: "context", Value: request.Ctx.String()})
293 | }
294 | }
295 |
296 | var response *Response
297 |
298 | var key string
299 |
300 | if c.cache != nil {
301 | key, err = request.Hash()
302 | if err != nil {
303 | if c.log != nil {
304 | c.log.Error(err)
305 | }
306 | return
307 | }
308 |
309 | if c.log != nil {
310 | c.Debug(
311 | "generate cache key",
312 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)},
313 | log.Arg{Key: "cache_key", Value: key},
314 | )
315 | }
316 |
317 | response, err = c.checkCache(key)
318 | if err != nil {
319 | return
320 | }
321 |
322 | if response != nil && c.log != nil {
323 | c.log.Debug("response is in the cache",
324 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)},
325 | log.Arg{Key: "cache_key", Value: key},
326 | )
327 | }
328 | }
329 |
330 | var rawResp *fasthttp.Response
331 | // A new request is issued when there
332 | // is no response from the cache
333 | if response == nil {
334 | response, rawResp, err = c.do(request)
335 | if err != nil {
336 | return
337 | }
338 |
339 | // Cache the response from the request if the status code is 20X
340 | if c.cache != nil && c.cacheCondition(response) && key != "" {
341 | cacheVal, err := response.Marshal()
342 | if err != nil {
343 | if c.log != nil {
344 | c.log.Error(err)
345 | }
346 | return err
347 | }
348 |
349 | if cacheVal != nil {
350 | c.lock.Lock()
351 | err = c.cache.Cache(key, cacheVal)
352 | if err != nil {
353 | if c.log != nil {
354 | c.log.Error(err)
355 | }
356 | return err
357 | }
358 | c.lock.Unlock()
359 | }
360 | }
361 | } else {
362 | response.Request = request
363 | response.Ctx = request.Ctx
364 | }
365 |
366 | if response.StatusCode == fasthttp.StatusFound {
367 | location := response.Headers.Peek("location")
368 |
369 | if c.log != nil {
370 | c.log.Info("response",
371 | log.Arg{Key: "method", Value: request.Method()},
372 | log.Arg{Key: "status_code", Value: response.StatusCode},
373 | log.Arg{Key: "location", Value: string(location)},
374 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)},
375 | )
376 | }
377 | } else {
378 | if c.log != nil {
379 | l := c.log.L.Info().
380 | Str("method", request.Method()).
381 | Int("status_code", response.StatusCode)
382 |
383 | if !response.FromCache {
384 | if c.ProxyPoolAmount() > 0 {
385 | l = l.Str("proxy", response.ClientIP())
386 | } else {
387 | l = l.Str("server_addr", response.ClientIP())
388 | }
389 | }
390 |
391 | l.Bool("from_cache", response.FromCache).
392 | Uint32("request_id", atomic.LoadUint32(&request.ID)).
393 | Msg("response")
394 | }
395 | }
396 |
397 | c.processResponseHandler(response)
398 |
399 | if !response.invalid {
400 | err = c.processHTMLHandler(response)
401 | if err != nil {
402 | return
403 | }
404 |
405 | c.processJSONHandler(response)
406 | }
407 |
408 | ReleaseResponse(response, !isChained)
409 | if rawResp != nil {
410 | // The raw response should be released after the custom response, otherwise some field values will be wrong
411 | fasthttp.ReleaseResponse(rawResp)
412 | }
413 |
414 | return
415 | }
416 |
417 | func (c *Crawler) FatalOrPanic(err error) {
418 | if c.log != nil {
419 | c.Fatal(err)
420 | } else {
421 | panic(err)
422 | }
423 | }
424 |
425 | func (c *Crawler) checkCache(key string) (*Response, error) {
426 | var err error
427 | cachedBody, ok := c.cache.IsCached(key)
428 | if !ok {
429 | return nil, nil
430 | }
431 |
432 | resp := new(Response)
433 | err = resp.Unmarshal(cachedBody)
434 | if err != nil {
435 | if c.log != nil {
436 | c.log.Error(err)
437 | }
438 | return nil, err
439 | }
440 | resp.FromCache = true
441 | return resp, nil
442 | }
443 |
444 | func newFasthttpRequest(request *Request) *fasthttp.Request {
445 | req := fasthttp.AcquireRequest()
446 |
447 | request.Headers.CopyTo(&req.Header)
448 | req.SetURI(request.uri)
449 |
450 | if request.Method() == MethodPost {
451 | req.SetBody(request.Body)
452 | }
453 |
454 | if request.Method() == MethodPost && req.Header.ContentType() == nil {
455 | req.Header.SetContentType("application/x-www-form-urlencoded")
456 | }
457 |
458 | if req.Header.Peek("Accept") == nil {
459 | req.Header.Set("Accept", "*/*")
460 | }
461 |
462 | uri := req.URI()
463 | if len(req.Header.Host()) == 0 {
464 | host := uri.Host()
465 | req.Header.SetHostBytes(host)
466 | }
467 | req.Header.SetRequestURIBytes(uri.RequestURI())
468 |
469 | return req
470 | }
471 |
472 | func (c *Crawler) do(request *Request) (*Response, *fasthttp.Response, error) {
473 | req := newFasthttpRequest(request)
474 |
475 | if len(c.proxyURLPool) > 0 {
476 | rand.Seed(time.Now().UnixMicro())
477 |
478 | c.lock.Lock()
479 | c.client.Dial = func(addr string) (net.Conn, error) {
480 | return c.ProxyDialerWithTimeout(c.proxyURLPool[rand.Intn(len(c.proxyURLPool))], request.timeout)(addr)
481 | }
482 | c.lock.Unlock()
483 | c.Debug("request information", log.Arg{Key: "header", Value: req.Header.String()}, log.Arg{Key: "proxy", Value: c.ProxyInUse()})
484 | } else {
485 | c.Debug("request information", log.Arg{Key: "header", Value: req.Header.String()})
486 | }
487 |
488 | var err error
489 |
490 | resp := fasthttp.AcquireResponse()
491 |
492 | if request.maxRedirectsCount == 0 {
493 | if c.ProxyPoolAmount() > 0 {
494 | req.SetConnectionClose()
495 | }
496 |
497 | if request.timeout > 0 {
498 | err = c.client.DoTimeout(req, resp, request.timeout)
499 | } else {
500 | err = c.client.Do(req, resp)
501 | }
502 | } else {
503 | err = c.client.DoRedirects(req, resp, int(request.maxRedirectsCount))
504 | }
505 | req.Header.CopyTo(request.Headers)
506 |
507 | response := AcquireResponse()
508 | response.StatusCode = resp.StatusCode()
509 | response.Body = append(response.Body, resp.Body()...)
510 | response.Ctx = request.Ctx
511 | response.Request = request
512 | resp.Header.CopyTo(&response.Headers)
513 | response.clientIP = resp.RemoteAddr()
514 | response.localIP = resp.LocalAddr()
515 |
516 | if response.StatusCode == fasthttp.StatusOK && len(response.Body) == 0 {
517 | // fasthttp.Response sets the status code of an empty response to 200, which is unreasonable
518 | response.StatusCode = 0
519 | }
520 |
521 | if x, ok := err.(interface{ Timeout() bool }); ok && x.Timeout() {
522 | response.timeout = true
523 | err = ErrTimeout
524 | }
525 |
526 | if err == nil || err == ErrTimeout || err == fasthttp.ErrDialTimeout {
527 | if c.ProxyPoolAmount() > 0 && c.proxyInvalidCondition != nil {
528 | e := c.proxyInvalidCondition(response)
529 | if e != nil {
530 | err = e
531 | }
532 | }
533 | }
534 |
535 | if err != nil {
536 | if p, ok := proxy.IsProxyError(err); ok {
537 | c.Warning("proxy is invalid",
538 | log.Arg{Key: "proxy", Value: p},
539 | log.Arg{Key: "proxy_pool", Value: c.proxyURLPool},
540 | log.Arg{Key: "msg", Value: err},
541 | )
542 |
543 | err = c.removeInvalidProxy(p)
544 | if err != nil {
545 | c.FatalOrPanic(err)
546 | }
547 |
548 | c.Info("removed invalid proxy",
549 | log.Arg{Key: "invalid_proxy", Value: p},
550 | log.Arg{Key: "new_proxy_pool", Value: c.proxyURLPool},
551 | )
552 |
553 | fasthttp.ReleaseRequest(req)
554 | fasthttp.ReleaseResponse(resp)
555 |
556 | return c.do(request)
557 | } else {
558 | if err == ErrTimeout || err == fasthttp.ErrDialTimeout {
559 | // re-request if the request timed out.
560 | // re-request 3 times by default when the request times out.
561 |
562 | // if you are using a proxy, the timeout error is probably
563 | // because the proxy is invalid, and it is recommended
564 | // to try a new proxy
565 | if c.retryCount == 0 {
566 | c.retryCount = 3
567 | }
568 |
569 | c.Error(err, log.Arg{Key: "timeout", Value: request.timeout.String()}, log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)})
570 |
571 | if atomic.LoadUint32(&request.retryCounter) < c.retryCount {
572 | c.retryPrepare(request, req, resp)
573 | return c.do(request)
574 | }
575 | fasthttp.ReleaseRequest(req)
576 | fasthttp.ReleaseResponse(resp)
577 | ReleaseResponse(response, true)
578 |
579 | return nil, nil, ErrTimeout
580 | } else {
581 | if err == fasthttp.ErrConnectionClosed {
582 | // A known fasthttp quirk with no fix yet; retry when c.retryCount > 0, otherwise panic
583 | c.Error(err, log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)})
584 |
585 | if c.retryCount == 0 {
586 | c.retryCount = 1
587 | }
588 |
589 | if atomic.LoadUint32(&request.retryCounter) < c.retryCount {
590 | c.retryPrepare(request, req, resp)
591 | return c.do(request)
592 | }
593 | }
594 | c.FatalOrPanic(err)
595 | return nil, nil, err
596 | }
597 | }
598 | }
599 |
600 | c.Debug("response header", log.Arg{Key: "header", Value: resp.Header.String()})
601 |
602 | // Only count successful responses
603 | atomic.AddUint32(&c.responseCount, 1)
604 |
605 | if c.retryCount > 0 && atomic.LoadUint32(&request.retryCounter) < c.retryCount {
606 | if c.retryCondition != nil && c.retryCondition(response) {
607 | c.Warning("the response meets the retry condition and will be retried soon")
608 | c.retryPrepare(request, req, resp)
609 | return c.do(request)
610 | }
611 | }
612 |
613 | // release req only after the retry check, because retryPrepare releases it on the retry path
614 | fasthttp.ReleaseRequest(req)
615 | return response, resp, nil
616 | }
617 |
618 | func (c *Crawler) retryPrepare(request *Request, req *fasthttp.Request, resp *fasthttp.Response) {
619 | atomic.AddUint32(&request.retryCounter, 1)
620 | c.Info(
621 | "retrying",
622 | log.Arg{Key: "retry_count", Value: atomic.LoadUint32(&request.retryCounter)},
623 | log.Arg{Key: "method", Value: request.Method()},
624 | log.Arg{Key: "url", Value: request.URL()},
625 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)},
626 | )
627 | fasthttp.ReleaseRequest(req)
628 | fasthttp.ReleaseResponse(resp)
629 | }
630 |
631 | func createBody(requestData map[string]string) []byte {
632 | if requestData == nil {
633 | return nil
634 | }
635 | form := url.Values{}
636 | for k, v := range requestData {
637 | form.Add(k, v)
638 | }
639 | return []byte(form.Encode())
640 | }
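// For example, createBody(map[string]string{"name": "tom", "password": "123456"})
// returns the urlencoded body "name=tom&password=123456" (url.Values.Encode sorts keys).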
641 |
642 | func NewRequestHeaders(headers map[string]string) *fasthttp.RequestHeader {
643 | reqHeaders := new(fasthttp.RequestHeader)
644 |
645 | for k, v := range headers {
646 | reqHeaders.Set(k, v)
647 | }
648 |
649 | return reqHeaders
650 | }
651 |
652 | func setRequestHeaders(headers map[string]string) *fasthttp.RequestHeader {
653 | header := AcquireRequestHeader()
654 | for k, v := range headers {
655 | header.Set(k, v)
656 | }
657 |
658 | return header
659 | }
660 |
661 | func (c *Crawler) get(URL string, headers map[string]string, ctx pctx.Context, isChained bool, cacheFields ...CacheField) error {
662 | // Parse the query parameters and create a `cachedMap` based on `cacheFields`
663 | u, err := url.Parse(URL)
664 | if err != nil {
665 | c.Error(err)
666 | return err
667 | }
668 |
669 | params := u.Query()
670 | var cachedMap map[string]string
671 | if len(cacheFields) > 0 {
672 | cachedMap = make(map[string]string)
673 | for _, field := range cacheFields {
674 | if field.code != queryParam {
675 | c.FatalOrPanic(ErrNotAllowedCacheFieldType)
676 | }
677 |
678 | key, value, err := addQueryParamCacheField(params, field)
679 | if err != nil {
680 | c.FatalOrPanic(err)
681 | }
682 |
683 | cachedMap[key] = value
684 | }
685 |
686 | c.Debug("use some specified cache fields", log.Arg{Key: "cached_map", Value: cachedMap})
687 | }
688 |
689 | reqHeader := setRequestHeaders(headers)
690 |
691 | return c.request(MethodGet, URL, nil, cachedMap, reqHeader, ctx, isChained)
692 | }
693 |
694 | // Get is used to send GET requests
695 | func (c *Crawler) Get(URL string) error {
696 | return c.GetWithCtx(URL, nil)
697 | }
698 |
699 | // GetWithCtx is used to send GET requests with a context
700 | func (c *Crawler) GetWithCtx(URL string, ctx pctx.Context) error {
701 | return c.get(URL, nil, ctx, false, c.cacheFields...)
702 | }
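// A sketch of passing values from the request to the response via a context
// (the URL is illustrative):
//
//	ctx, _ := pctx.AcquireCtx()
//	ctx.Put("page", "1")
//	c.AfterResponse(func(r *Response) {
//		page := r.Ctx.Get("page") // "1"
//		_ = page
//	})
//	c.GetWithCtx("https://example.com/list", ctx)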
703 |
704 | func (c *Crawler) post(URL string, requestData, headers map[string]string, ctx pctx.Context, isChained bool, cacheFields ...CacheField) error {
705 | var cachedMap map[string]string
706 | if len(cacheFields) > 0 {
707 | cachedMap = make(map[string]string)
708 |
709 | var queryParams url.Values
710 | for _, field := range cacheFields {
711 | var (
712 | err error
713 | key, value string
714 | )
715 |
716 | switch field.code {
717 | case queryParam:
718 | if queryParams == nil {
719 | u, err := url.Parse(URL)
720 | if err != nil {
721 | c.FatalOrPanic(err)
722 | }
723 |
724 | queryParams = u.Query()
725 | }
726 |
727 | key, value, err = addQueryParamCacheField(queryParams, field)
728 | case requestBodyParam:
729 | if val, ok := requestData[field.Field]; ok {
730 | key, value = field.String(), val
731 | } else {
732 | keys := make([]string, 0, len(requestData))
733 | for k := range requestData {
734 | keys = append(keys, k)
735 | }
736 |
737 | err = fmt.Errorf("there is no such field [%s] in the request body: %v", field.Field, keys)
738 | }
739 | default:
740 | err = ErrInvalidCacheTypeCode
741 | }
742 |
743 | if err != nil {
744 | c.FatalOrPanic(err)
745 | }
746 |
747 | cachedMap[key] = value
748 | }
749 |
750 | c.Debug("use some specified cache fields", log.Arg{Key: "cached_map", Value: cachedMap})
751 | }
752 |
753 | if len(headers) == 0 {
754 | headers = make(map[string]string)
755 | }
756 | if _, ok := headers["Content-Type"]; !ok {
757 | // use default `Content-Type`
758 | headers["Content-Type"] = "application/x-www-form-urlencoded"
759 | }
760 |
761 | reqHeader := setRequestHeaders(headers)
762 |
763 | return c.request(MethodPost, URL, createBody(requestData), cachedMap, reqHeader, ctx, isChained)
764 | }
765 |
766 | // Post is used to send POST requests
767 | func (c *Crawler) Post(URL string, requestData map[string]string, ctx pctx.Context) error {
768 | return c.post(URL, requestData, nil, ctx, false, c.cacheFields...)
769 | }
770 |
771 | func (c *Crawler) createJSONBody(requestData map[string]any) []byte {
772 | if requestData == nil {
773 | return nil
774 | }
775 | body, err := json.Marshal(requestData)
776 | if err != nil {
777 | c.FatalOrPanic(err)
778 | }
779 | return body
780 | }
781 |
782 | func (c *Crawler) postJSON(URL string, requestData map[string]any, headers map[string]string, ctx pctx.Context, isChained bool, cacheFields ...CacheField) error {
783 | body := c.createJSONBody(requestData)
784 |
785 | var cachedMap map[string]string
786 | if len(cacheFields) > 0 {
787 | cachedMap = make(map[string]string)
788 | bodyJson := json.ParseBytesToJSON(body)
789 |
790 | var queryParams url.Values
791 |
792 | for _, field := range cacheFields {
793 | var (
794 | err error
795 | key, value string
796 | )
797 |
798 | switch field.code {
799 | case queryParam:
800 | if queryParams == nil {
801 | u, err := url.Parse(URL)
802 | if err != nil {
803 | c.FatalOrPanic(err)
804 | }
805 |
806 | queryParams = u.Query()
807 | }
808 |
809 | key, value, err = addQueryParamCacheField(queryParams, field)
810 | case requestBodyParam:
811 | if !bodyJson.Get(field.Field).Exists() {
812 | m := bodyJson.Map()
813 | var keys = make([]string, 0, len(m))
814 | for k := range m {
815 | keys = append(keys, k)
816 | }
817 | err = fmt.Errorf("there is no such field [%s] in the request body: %v", field, keys)
818 | } else {
819 | key, value = field.String(), bodyJson.Get(field.Field).String()
820 | }
821 | default:
822 | err = ErrInvalidCacheTypeCode
823 | }
824 |
825 | if err != nil {
826 | c.FatalOrPanic(err)
827 | }
828 |
829 | cachedMap[key] = value
830 | }
831 |
832 | c.Debug("use some specified cache fields", log.Arg{Key: "cached_map", Value: cachedMap})
833 | }
834 |
835 | if len(headers) == 0 {
836 | headers = make(map[string]string)
837 | }
838 | headers["Content-Type"] = "application/json"
839 |
840 | reqHeader := setRequestHeaders(headers)
841 |
842 | return c.request(MethodPost, URL, body, cachedMap, reqHeader, ctx, isChained)
843 | }
844 |
845 | // PostJSON is used to send POST requests whose content-type is json
846 | func (c *Crawler) PostJSON(URL string, requestData map[string]any, ctx pctx.Context) error {
847 | return c.postJSON(URL, requestData, nil, ctx, false, c.cacheFields...)
848 | }
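// A minimal sketch of a JSON POST (the URL and fields are illustrative);
// nested maps, slices and structs are marshaled as-is:
//
//	body := map[string]any{
//		"name": "Tom",
//		"tags": []int{1, 2, 3},
//	}
//	c.PostJSON("https://example.com/api", body, nil)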
849 |
850 | func (c *Crawler) postMultipart(URL string, form *MultipartForm, headers map[string]string, ctx pctx.Context, isChained bool, cacheFields ...CacheField) error {
851 | var cachedMap map[string]string
852 | if len(cacheFields) > 0 {
853 | cachedMap = make(map[string]string)
854 |
855 | var queryParams url.Values
856 |
857 | for _, field := range cacheFields {
858 | var (
859 | err error
860 | key, value string
861 | )
862 |
863 | switch field.code {
864 | case queryParam:
865 | if queryParams == nil {
866 | u, err := url.Parse(URL)
867 | if err != nil {
868 | c.FatalOrPanic(err)
869 | }
870 |
871 | queryParams = u.Query()
872 | }
873 |
874 | key, value, err = addQueryParamCacheField(queryParams, field)
875 | case requestBodyParam:
876 | if val, ok := form.bodyMap[field.Field]; ok {
877 | key, value = field.String(), val
878 | } else {
879 | var keys = make([]string, 0, len(form.bodyMap))
880 | for k := range form.bodyMap {
881 | keys = append(keys, k)
882 | }
883 | err = fmt.Errorf("there is no such field [%s] in the request body: %v", field, keys)
884 | }
885 | default:
886 | err = ErrInvalidCacheTypeCode
887 | }
888 |
889 | if err != nil {
890 | c.FatalOrPanic(err)
891 | }
892 |
893 | cachedMap[key] = value
894 | }
895 |
896 | c.Debug("use some specified cache fields", log.Arg{Key: "cached_map", Value: cachedMap})
897 | }
898 |
899 | if len(headers) == 0 {
900 | headers = make(map[string]string)
901 | }
902 | headers["Content-Type"] = form.FormDataContentType()
903 |
904 | reqHeader := setRequestHeaders(headers)
905 |
906 | return c.request(MethodPost, URL, form.Bytes(), cachedMap, reqHeader, ctx, isChained)
907 | }
908 |
909 | // PostMultipart is used to send POST requests whose content-type is `multipart/form-data`
910 | func (c *Crawler) PostMultipart(URL string, form *MultipartForm, ctx pctx.Context) error {
911 | return c.postMultipart(URL, form, nil, ctx, false, c.cacheFields...)
912 | }
913 |
914 | // PostRaw is used to send POST requests whose content-type is not in [json, `application/x-www-form-urlencoded`, `multipart/form-data`]
915 | func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error {
916 | cachedMap := map[string]string{
917 | "cache": string(body),
918 | }
919 | return c.request(MethodPost, URL, body, cachedMap, nil, ctx, false)
920 | }
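// A sketch of sending a raw body with a custom Content-Type set in a
// BeforeRequest hook (the URL and payload are illustrative):
//
//	c.BeforeRequest(func(r *Request) {
//		r.SetContentType("application/xml")
//	})
//	c.PostRaw("https://example.com/xml", []byte("<doc/>"), nil)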
921 |
922 | /************************* Public methods ****************************/
923 |
924 | // ClearCache will clear all cache
925 | func (c *Crawler) ClearCache() error {
926 | if c.cache == nil {
927 | c.Error(ErrNoCache)
928 | return ErrNoCache
929 | }
930 | if c.log != nil {
931 | c.Warning("clear all cache")
932 | }
933 | return c.cache.Clear()
934 | }
935 |
936 | func (c Crawler) ProxyInUse() string {
937 | c.lock.RLock()
938 | defer c.lock.RUnlock()
939 |
940 | if strings.Contains(c.proxyInUse, "//") {
941 | return strings.Split(c.proxyInUse, "//")[1]
942 | }
943 | return c.proxyInUse
944 | }
945 |
946 | func (c *Crawler) ConcurrencyState() bool {
947 | return c.goPool != nil
948 | }
949 |
950 | /************************* Public registration methods ****************************/
951 |
952 | // BeforeRequest used to process requests, such as
953 | // setting headers, passing context, etc.
954 | func (c *Crawler) BeforeRequest(f HandleRequest) {
955 | c.lock.Lock()
956 | if c.requestHandler == nil {
957 | // A crawler should not have too many request handlers, so the capacity
958 | // is set to 5 and grows automatically when needed
959 | c.requestHandler = make([]HandleRequest, 0, 5)
960 | }
961 | c.requestHandler = append(c.requestHandler, f)
962 | c.lock.Unlock()
963 | }
964 |
965 | // ParseHTML can parse html to find the data you need,
966 | // and process the data
967 | func (c *Crawler) ParseHTML(selector string, f HandleHTML) {
968 | c.lock.Lock()
969 | if c.htmlHandler == nil {
970 | // 一个 ccrawler 不应该有太多处理 html 的方法,这里设置为 5 个,
971 | // 当不够时自动扩容
972 | c.htmlHandler = make([]*HTMLParser, 0, 5)
973 | }
974 | c.htmlHandler = append(c.htmlHandler, &HTMLParser{selector, f})
975 | c.lock.Unlock()
976 | }
977 |
978 | // ParseJSON can parse json to find the data you need,
979 | // and process the data.
980 | //
981 | // If you set `strict` to true, responses that do not contain
982 | // `application/json` in the content-type of the response header will
983 | // not be processed.
984 | //
985 | // It is recommended to do full processing of the json response in one
986 | // call to `ParseJSON` instead of multiple calls to `ParseJSON`.
987 | func (c *Crawler) ParseJSON(strict bool, f HandleJSON) {
988 | c.lock.Lock()
989 | if c.jsonHandler == nil {
990 | c.jsonHandler = make([]*JSONParser, 0, 1)
991 | }
992 | c.jsonHandler = append(c.jsonHandler, &JSONParser{strict, f})
993 | c.lock.Unlock()
994 | }
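// A minimal sketch: with strict set to true, responses whose Content-Type
// does not contain "application/json" are skipped.
//
//	c.ParseJSON(true, func(j json.JSONResult, r *Response) {
//		msg := j.Get("msg").String()
//		_ = msg
//	})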
995 |
996 | // AfterResponse is used to process the response, this
997 | // method should be used for the response body in non-html format
998 | func (c *Crawler) AfterResponse(f HandleResponse) {
999 | c.lock.Lock()
1000 | if c.responseHandler == nil {
1001 | // A crawler should not have too many response handlers, so the capacity
1002 | // is set to 5 and grows automatically when needed
1003 | c.responseHandler = make([]HandleResponse, 0, 5)
1004 | }
1005 | c.responseHandler = append(c.responseHandler, f)
1006 | c.lock.Unlock()
1007 | }
1008 |
1009 | // ProxyPoolAmount returns the number of proxies in
1010 | // the proxy pool
1011 | func (c Crawler) ProxyPoolAmount() int {
1012 | return len(c.proxyURLPool)
1013 | }
1014 |
1015 | // Wait waits for the end of all concurrent tasks
1016 | func (c *Crawler) Wait() {
1017 | c.wg.Wait()
1018 | c.goPool.Close()
1019 | }
1020 |
1021 | func (c *Crawler) SetProxyInvalidCondition(condition ProxyInvalidCondition) {
1022 | c.proxyInvalidCondition = condition
1023 | }
1024 |
1025 | func (c *Crawler) AddProxy(newProxy string) {
1026 | c.lock.Lock()
1027 |
1028 | c.proxyURLPool = append(c.proxyURLPool, newProxy)
1029 |
1030 | c.lock.Unlock()
1031 | }
1032 |
1033 | func (c *Crawler) AddCookie(key, val string) {
1034 | c.lock.Lock()
1035 |
1036 | c.cookies[key] = val
1037 |
1038 | c.lock.Unlock()
1039 | }
1040 |
1041 | // SetConcurrency enables concurrency; the arguments are the capacity of the goroutine pool to create and whether to swallow panics
1042 | func (c *Crawler) SetConcurrency(count uint64, blockPanic bool) {
1043 | if c.goPool == nil {
1044 | p, err := NewPool(count)
1045 | if err != nil {
1046 | panic(err)
1047 | }
1048 | p.blockPanic = blockPanic
1049 | p.log = c.log
1050 |
1051 | c.goPool = p
1052 | c.wg = new(sync.WaitGroup)
1053 | } else {
1054 | c.FatalOrPanic(errors.New("`c.goPool` is not nil"))
1055 | }
1056 | }
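// A usage sketch (the URL is illustrative): enable a pool of 10 workers,
// queue requests, then wait for them to finish.
//
//	c.SetConcurrency(10, false)
//	for i := 0; i < 100; i++ {
//		c.Get(fmt.Sprintf("https://example.com/page/%d", i))
//	}
//	c.Wait()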
1057 |
1058 | func (c *Crawler) SetRetry(count uint32, cond RetryCondition) {
1059 | c.retryCount = count
1060 | c.retryCondition = cond
1061 | }
1062 |
1063 | func (c *Crawler) SetCache(cc Cache, compressed bool, cacheCondition CacheCondition, cacheFields ...CacheField) {
1064 | cc.Compressed(compressed)
1065 | err := cc.Init()
1066 | if err != nil {
1067 | panic(err)
1068 | }
1069 | c.cache = cc
1070 | if cacheCondition == nil {
1071 | cacheCondition = func(r *Response) bool {
1072 | return r.StatusCode/100 == 2
1073 | }
1074 | }
1075 | c.cacheCondition = cacheCondition
1076 | if len(cacheFields) > 0 {
1077 | c.cacheFields = cacheFields
1078 | } else {
1079 | c.cacheFields = nil
1080 | }
1081 | }
1082 |
1083 | // UnsetCache disables caching for a specific Crawler instance, for requests that must not be cached.
1084 | //
1085 | // It is usually used to disable caching on a `Clone()` instance.
1086 | func (c *Crawler) UnsetCache() {
1087 | if c.cache != nil {
1088 | c.cache = nil
1089 |
1090 | if c.cacheCondition != nil {
1091 | c.cacheCondition = nil
1092 | }
1093 |
1094 | if c.cacheFields != nil {
1095 | c.cacheFields = nil
1096 | }
1097 | }
1098 | }
1099 |
1100 | func (c Crawler) Lock() {
1101 | c.lock.Lock()
1102 | }
1103 |
1104 | func (c Crawler) Unlock() {
1105 | c.lock.Unlock()
1106 | }
1107 |
1108 | func (c Crawler) RLock() {
1109 | c.lock.RLock()
1110 | }
1111 |
1112 | func (c Crawler) RUnlock() {
1113 | c.lock.RUnlock()
1114 | }
1115 |
1116 | /************************* Private registration methods ****************************/
1117 |
1118 | func (c *Crawler) processRequestHandler(r *Request) {
1119 | for _, f := range c.requestHandler {
1120 | f(r)
1121 | }
1122 | }
1123 |
1124 | func (c *Crawler) processResponseHandler(r *Response) {
1125 | for _, f := range c.responseHandler {
1126 | if r.invalid {
1127 | break
1128 | }
1129 | f(r)
1130 | }
1131 | }
1132 |
1133 | func (c *Crawler) processJSONHandler(r *Response) {
1134 | if c.jsonHandler == nil {
1135 | return
1136 | }
1137 |
1138 | if len(c.jsonHandler) > 1 {
1139 | if c.log != nil {
1140 | c.Warning("it is recommended to do full processing of the json response in one call to `ParseJSON` instead of multiple calls to `ParseJSON`")
1141 | }
1142 | }
1143 |
1144 | result := json.ParseBytesToJSON(r.Body)
1145 | for _, parser := range c.jsonHandler {
1146 | if parser.strict {
1147 | if !strings.Contains(strings.ToLower(r.ContentType()), "application/json") {
1148 | if c.log != nil {
1149 | c.Debug(
1150 | `the "Content-Type" of the response header is not of the "json" type`,
1151 | log.Arg{Key: "Content-Type", Value: r.ContentType()},
1152 | )
1153 | }
1154 | continue
1155 | }
1156 | }
1157 | parser.Handle(result, r)
1158 | }
1159 | }
1160 |
1161 | func (c *Crawler) processHTMLHandler(r *Response) error {
1162 | if len(c.htmlHandler) == 0 {
1163 | return nil
1164 | }
1165 |
1166 | if !strings.Contains(strings.ToLower(r.ContentType()), "html") {
1167 | if c.log != nil {
1168 | c.Debug(
1169 | `the "Content-Type" of the response header is not of the "html" type`,
1170 | log.Arg{Key: "Content-Type", Value: r.ContentType()},
1171 | )
1172 | }
1173 | return nil
1174 | }
1175 |
1176 | doc, err := html.ParseHTML(r.Body)
1177 | if err != nil {
1178 | if c.log != nil {
1179 | c.log.Error(err)
1180 | }
1181 | return err
1182 | }
1183 |
1184 | for _, parser := range c.htmlHandler {
1185 | if r.invalid {
1186 | break
1187 | }
1188 |
1189 | i := 0
1190 | doc.Find(parser.Selector).Each(func(_ int, s *goquery.Selection) {
1191 | for _, n := range s.Nodes {
1192 | parser.Handle(html.NewHTMLElementFromSelectionNode(s, n, i), r)
1193 | i++
1194 | }
1195 | })
1196 | }
1197 | return nil
1198 | }
1199 |
1200 | // removeInvalidProxy actually removes an invalid proxy only when a proxy pool is in use and the proxy used by the current request came from that pool
1201 | func (c *Crawler) removeInvalidProxy(proxyAddr string) error {
1202 | c.lock.Lock()
1203 | defer c.lock.Unlock()
1204 |
1205 | if c.ProxyPoolAmount() == 0 {
1206 | return proxy.ProxyErr{
1207 | Code: proxy.ErrEmptyProxyPoolCode,
1208 | Msg: "the current proxy pool is empty",
1209 | }
1210 | }
1211 |
1212 | if c.ProxyPoolAmount() == 1 && c.complementProxyPool != nil {
1213 | newProxyPool := c.complementProxyPool()
1214 | c.proxyURLPool = append(c.proxyURLPool, newProxyPool...)
1215 | c.log.Info(
1216 | "a new proxy pool has replaced the old one",
1217 | log.Arg{Key: "new_proxy_pool", Value: newProxyPool},
1218 | )
1219 | }
1220 |
1221 | targetIndex := -1
1222 | for i, p := range c.proxyURLPool {
1223 | addr := strings.Split(p, "//")[1]
1224 | if addr == proxyAddr {
1225 | targetIndex = i
1226 | break
1227 | }
1228 | }
1229 |
1230 | if targetIndex >= 0 {
1231 | c.proxyURLPool = append(
1232 | c.proxyURLPool[:targetIndex],
1233 | c.proxyURLPool[targetIndex+1:]...,
1234 | )
1235 |
1236 | if c.log != nil {
1237 | c.Debug(
1238 | "invalid proxy has been deleted from the proxy pool",
1239 | log.Arg{Key: "proxy", Value: proxyAddr},
1240 | )
1241 | }
1242 |
1243 | if len(c.proxyURLPool) == 0 {
1244 | return proxy.ProxyErr{
1245 | Code: proxy.ErrEmptyProxyPoolCode,
1246 | Msg: "the current proxy pool is empty after removing an invalid proxy",
1247 | }
1248 | }
1249 | } else {
1250 | // Under concurrency the invalid proxy may not be found; do not return an error in that case
1251 | if c.goPool != nil {
1252 | return nil
1253 | }
1254 |
1255 | // The invalid proxy was not found in the proxy pool, so its origin is unknown; return an error as well
1256 | return &proxy.ProxyErr{
1257 | Code: proxy.ErrUnkownProxyIPCode,
1258 | Msg: "proxy address is unknown",
1259 | Args: map[string]string{
1260 | "unkown_proxy_addr": proxyAddr,
1261 | },
1262 | }
1263 | }
1264 |
1265 | return nil
1266 | }
1267 |
1268 | func (c *Crawler) Debug(msg string, args ...log.Arg) {
1269 | if c.log != nil {
1270 | c.log.Debug(msg, args...)
1271 | }
1272 | }
1273 |
1274 | func (c *Crawler) Info(msg string, args ...log.Arg) {
1275 | if c.log != nil {
1276 | c.log.Info(msg, args...)
1277 | }
1278 | }
1279 |
1280 | func (c *Crawler) Warning(msg string, args ...log.Arg) {
1281 | if c.log != nil {
1282 | c.log.Warning(msg, args...)
1283 | }
1284 | }
1285 |
1286 | func (c *Crawler) Error(err error, args ...log.Arg) {
1287 | if c.log != nil {
1288 | c.log.Error(err, args...)
1289 | }
1290 | }
1291 |
1292 | func (c *Crawler) Fatal(err error, args ...log.Arg) {
1293 | if c.log != nil {
1294 | c.log.Fatal(err, args...)
1295 | }
1296 | }
1297 |
--------------------------------------------------------------------------------