├── .gitignore ├── LICENSE ├── README.md ├── README.zh_CN.md ├── async_test.go ├── cache.go ├── context ├── api.go ├── context_test.go ├── read.go └── write.go ├── craw.go ├── craw_test.go ├── errors.go ├── example └── multipart │ └── main.go ├── go.mod ├── go.sum ├── html ├── element.go ├── html_test.go └── parser.go ├── json ├── json.go ├── json_test.go └── parser.go ├── methods.go ├── options.go ├── pool.go ├── proxy.go ├── proxy ├── errors.go ├── http.go └── socks5.go ├── request.go ├── response.go └── status.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predator 2 | 3 | A high-performance(maybe) crawler framework based on fasthttp. 
4 | 5 | ## Usage 6 | 7 | ### 1 Create a new `Crawler` 8 | 9 | ```go 10 | import "github.com/go-predator/predator" 11 | 12 | 13 | func main() { 14 | c := predator.NewCrawler( 15 | predator.WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"), 16 | predator.WithCookies(map[string]string{"JSESSIONID": cookie}), // or use predator.WithRawCookie(cookie string) 17 | predator.WithProxy(ip), // or use a proxy pool -> predator.WithProxyPool([]string) 18 | ) 19 | } 20 | ``` 21 | 22 | Please refer to [predator/options.go](https://github.com/go-predator/predator/blob/main/options.go) for all options. 23 | 24 | ### 2 Send request with GET method 25 | 26 | ```go 27 | // BeforeRequest can do some patching on the request before sending it 28 | c.BeforeRequest(func(r *predator.Request) { 29 | headers := map[string]string{ 30 | "Accept": "*/*", 31 | "Accept-Language": "zh-CN", 32 | "Accept-Encoding": "gzip, deflate", 33 | "X-Requested-With": "XMLHttpRequest", 34 | "Origin": "http://example.com", 35 | } 36 | 37 | r.SetHeaders(headers) 38 | }) 39 | 40 | c.AfterResponse(func(r *predator.Response) { 41 | // Get the required parameters from the context 42 | id := r.Ctx.GetAny("id").(int) 43 | name := r.Ctx.Get("name") 44 | page := r.Ctx.Get("page") 45 | 46 | fmt.Println(r.String()) 47 | }) 48 | 49 | // Send a request 50 | c.Get("http://www.example.com") 51 | 52 | // Or send a request with context 53 | ctx, _ := context.AcquireCtx() 54 | ctx.Put("page", 1) 55 | ctx.Put("id", 10) 56 | ctx.Put("name", "Tom") 57 | c.GetWithCtx("http://www.example.com", ctx) 58 | ``` 59 | 60 | ### 3 Send request with POST method 61 | 62 | #### 3.1 Request body's media-type is `application/x-www-form-urlencoded` 63 | 64 | ```go 65 | // BeforeRequest can do some patching on the request before sending it 66 | c.BeforeRequest(func(r *predator.Request) { 67 | headers := map[string]string{ 68 | "Accept": "*/*", 69 | "Accept-Language": "zh-CN", 70 | "Accept-Encoding": 
"gzip, deflate", 71 | "X-Requested-With": "XMLHttpRequest", 72 | "Origin": "http://example.com", 73 | } 74 | 75 | r.SetHeaders(headers) 76 | }) 77 | 78 | c.AfterResponse(func(r *predator.Response) { 79 | // Get the required parameters from the context 80 | id := r.Ctx.GetAny("id").(int) 81 | name := r.Ctx.Get("name") 82 | 83 | fmt.Println(r.String()) 84 | }) 85 | 86 | 87 | body := map[string]string{"foo": "bar"} 88 | 89 | // Send a request with context 90 | ctx, _ := context.AcquireCtx() 91 | ctx.Put("id", 10) 92 | ctx.Put("name", "Tom") 93 | 94 | c.Post("http://www.example.com", body, ctx) 95 | ``` 96 | 97 | If you don't need to pass a context, you can pass `nil`: 98 | 99 | ```go 100 | c.Post("http://www.example.com", body, nil) 101 | ``` 102 | 103 | #### 3.2 Request body's media-type is `multipart/form-data` 104 | 105 | Please refer to the complete example:https://github.com/go-predator/predator/blob/main/example/multipart/main.go 106 | 107 | #### 3.3 Request body's media-type is `application/json` 108 | 109 | ```go 110 | import ( 111 | ... 
112 | 113 | "github.com/go-predator/predator" 114 | "github.com/go-predator/predator/context" 115 | "github.com/go-predator/predator/json" 116 | ) 117 | 118 | type User struct { 119 | Name string `json:"name"` 120 | Age int `json:"age"` 121 | } 122 | 123 | func main() { 124 | c := predator.NewCrawler() 125 | 126 | c.ParseJSON(true, func(j json.JSONResult) { 127 | fmt.Println(j.Get("json")) 128 | }) 129 | 130 | body := map[string]any{ 131 | "time": 156546535, 132 | "cid": "10_18772100220-1625540144276-302919", 133 | "args": []int{1, 2, 3, 4, 5}, 134 | "dict": map[string]string{ 135 | "mod": "1592215036_002", "t": "1628346994", "eleTop": "778", 136 | }, 137 | "user": User{"Tom", 13}, 138 | } 139 | 140 | c.PostJSON("https://httpbin.org/post", body, nil) 141 | } 142 | ``` 143 | 144 | #### 3.4 Request body's media-type is others 145 | 146 | If the three request functions above cannot meet your needs, please send your own binary request body via `PostRaw`. 147 | 148 | ```go 149 | func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error 150 | ``` 151 | 152 | ### 4 Allow Redirects 153 | 154 | Redirection is disabled by default. 155 | 156 | If you need to use redirects, you need to set the maximum number of redirects allowed via `AllowRedirect` in `BeforeRequest`. 157 | 158 | ```go 159 | c.BeforeRequest(func(r *predator.Request) { 160 | if r.URL()[8:12] == "abcd" { 161 | r.AllowRedirect(1) 162 | } else if r.URL()[8:12] == "efgh" { 163 | r.AllowRedirect(3) 164 | } 165 | }) 166 | ``` 167 | 168 | Setting global redirects is not allowed. 
169 | 170 | ### 5 Context 171 | 172 | The context is an interface, and the following two contexts are currently implemented: 173 | 174 | - _ReadOp_:Based on `sync.Map`, it is suitable for scenarios with many reading contexts 175 | 176 | ```go 177 | ctx, err := AcquireCtx(context.ReadOp) 178 | ``` 179 | 180 | - _WriteOp_(Default):Based on `map`, it is suitable for scenarios where the frequency of reading and writing is not much different or there are more writes than reads. This is the default context 181 | 182 | ```go 183 | ctx, err := AcquireCtx() 184 | ``` 185 | 186 | If you implement the `Context` interface yourself: 187 | 188 | ```go 189 | ctx := YourContext() 190 | ``` 191 | 192 | ### 6 Parse the HTML response 193 | 194 | Responses to web requests are mostly **HTML** and **JSON**. 195 | 196 | You can use the `ParseHTML` method to find html elements in combination with **CSS selector**. 197 | 198 | > :warning: The `Content-Type` of the response header must be `text/html`. 199 | 200 | ```go 201 | crawl := NewCrawler() 202 | 203 | crawl.ParseHTML("#main", func(he *html.HTMLElement) { 204 | he.String() 205 | 206 | h, err := he.InnerHTML() 207 | 208 | h, err := he.OuterHTML() 209 | 210 | he.Text() 211 | 212 | he.ChildText("#title") 213 | 214 | he.ChildrenText("li>a") 215 | 216 | he.Attr("class") 217 | 218 | he.FirstChild("p") 219 | 220 | he.LastChild("p") 221 | 222 | he.Child("p", 2) 223 | 224 | he.Children("p") 225 | 226 | he.ChildAttr("p", "class") 227 | 228 | he.ChildrenAttr("p", "class") 229 | 230 | he.Parent() 231 | 232 | he.Parents() 233 | 234 | he.Each("li>a", func (i, h) { 235 | if i < 10 { 236 | fmt.Println(h.Attr("href")) 237 | return false 238 | } else { 239 | return true 240 | } 241 | }) 242 | 243 | he.FindChildByText("span.addr", "New York") 244 | 245 | he.FindChildByStripedText("span.addr", "New York") // if addr like ' New York ' 246 | } 247 | ``` 248 | 249 | ### 7 Goroutine pool 250 | 251 | ```go 252 | c := NewCrawler( 253 | // Use a goroutine pool 
with a capacity of 30 for web requests 254 | predator.WithConcurrency(30), 255 | ) 256 | 257 | c.AfterResponse(func(r *predator.Response) { 258 | // handle response 259 | }) 260 | 261 | for i := 0; i < 10; i++ { 262 | c.Post("http://www.example.com", map[string]string{ 263 | "id": fmt.Sprint(i + 1), 264 | }, nil) 265 | } 266 | 267 | c.Wait() 268 | ``` 269 | 270 | ### 8 Cache 271 | 272 | By default no cache is used. 273 | 274 | [`Cache`](https://github.com/go-predator/predator/blob/main/cache.go) is an interface. 275 | 276 | SQLite-based caching is currently implemented. 277 | 278 | If the response length is too long, in order to reduce the space usage, you can enable cache compression. 279 | 280 | ```go 281 | import ( 282 | "github.com/go-predator/cache" 283 | ) 284 | 285 | // SQLite3 286 | c := NewCrawler( 287 | predator.WithCache(&cache.SQLiteCache{ 288 | URI: "test.sqlite", 289 | }, true), // enable compression 290 | ) 291 | ``` 292 | 293 | ### 9 Proxy 294 | 295 | You can use proxy pool: 296 | 297 | ```go 298 | predator.WithProxyPool([]string{"http://ip:port", "socks5://ip:port"}) 299 | ``` 300 | 301 | A proxy is randomly selected from the proxy pool before each request. 302 | 303 | When a proxy fails it is automatically removed from the proxy pool, and panic when the proxy pool is empty. 304 | 305 | To avoid panic, you can use `WithComplementProxyPool` to supplement the proxy pool when the proxy pool is empty. 306 | 307 | ```go 308 | func GetProxyIPs() []string { 309 | api := "http://proxy.api" 310 | client := &fasthttp.Client{} 311 | body := make([]byte, 0) 312 | _, body, err := client.Get(body, api) 313 | if err != nil { 314 | panic(err) 315 | } 316 | 317 | return strings.Split(string(body), "\r\n") 318 | } 319 | 320 | predator.WithComplementProxyPool(GetProxyIPs) 321 | ``` 322 | 323 | ### 10 Logging 324 | 325 | Based on [`zerolog`](https://github.com/rs/zerolog). 326 | 327 | Logging is off by default. 
328 | 329 | Use the `WithLogger` option to enable logging: 330 | 331 | ```go 332 | func WithLogger(logger *log.Logger) CrawlerOption 333 | ``` 334 | 335 | If `logger` is nil, logs of level WARNING and above will be printed to the console. 336 | 337 | ```go 338 | crawler := predator.NewCrawler( 339 | predator.WithLogger(nil), // equal to predator.WithDefaultLogger() 340 | ) 341 | ``` 342 | 343 | If you want to print lower level logs, refer to the following code: 344 | 345 | ```go 346 | import "github.com/go-predator/predator/log" 347 | 348 | func main() { 349 | // print to console 350 | logger := log.NewLogger(log.DEBUG, log.ToConsole(), 1) 351 | // save to file 352 | logger := log.NewLogger(log.DEBUG, log.MustToFile("demo.log", -1), 1) 353 | // print to console and save to file 354 | logger := log.NewLogger(log.DEBUG, log.MustToConsoleAndFile("demo.log", -1), 1) 355 | 356 | crawler := predator.NewCrawler( 357 | predator.WithLogger(logger), 358 | ) 359 | } 360 | ``` 361 | 362 | ### 11 Other considerations 363 | 364 | If you need to serialize some data structures into json strings, or deserialize json strings, it is recommended to use `github.com/go-predator/predator/json` instead of `encoding/json`. 
365 | 366 | ```GO 367 | import "github.com/go-predator/predator/json" 368 | 369 | json.Marshal(any) ([]byte, error) 370 | json.Unmarshal([]byte, any) error 371 | json.UnmarshalFromString(string, any) error 372 | ``` 373 | -------------------------------------------------------------------------------- /README.zh_CN.md: -------------------------------------------------------------------------------- 1 | # predator / 掠食者 2 | 基于 fasthttp 开发的高性能爬虫框架 3 | 4 | ## 使用 5 | 6 | 下面是一个示例,基本包含了当前已完成的所有功能,使用方法可以参考注释。 7 | 8 | ### 1 创建一个 Crawler 9 | 10 | ```go 11 | import "github.com/go-predator/predator" 12 | 13 | 14 | func main() { 15 | crawler := predator.NewCrawler( 16 | predator.WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"), 17 | predator.WithCookies(map[string]string{"JSESSIONID": cookie}), 18 | predator.WithProxy(ip), // 或者使用代理池 predator.WithProxyPool([]string) 19 | ) 20 | } 21 | ``` 22 | 23 | 创建`Crawler`时有一些可选项用来功能增强。所有可选项参考[predator/options.go](https://github.com/go-predator/predator/blob/main/options.go)。 24 | 25 | ### 2 发送 Get 请求 26 | 27 | ```go 28 | crawler.Get("http://www.baidu.com") 29 | ``` 30 | 31 | 对请求和响应的处理参考的是 colly,我觉得 colly 的处理方式非常舒服。 32 | 33 | ```go 34 | // BeforeRequest 可以在发送请求前,对请求进行一些修补 35 | crawler.BeforeRequest(func(r *predator.Request) { 36 | headers := map[string]string{ 37 | "Accept": "*/*", 38 | "Accept-Language": "zh-CN", 39 | "Accept-Encoding": "gzip, deflate", 40 | "X-Requested-With": "XMLHttpRequest", 41 | "Origin": "http://example.com", 42 | } 43 | 44 | r.SetHeaders(headers) 45 | 46 | // 请求和响应之间的上下文传递,上下文见下面的上下文示例 47 | r.Ctx.Put("id", 10) 48 | r.Ctx.Put("name", "tom") 49 | }) 50 | 51 | crawler.AfterResponse(func(r *predator.Response) { 52 | // 从请求发送的上下文中取值 53 | id := r.Ctx.GetAny("id").(int) 54 | name := r.Ctx.Get("name") 55 | 56 | // 对于 json 响应,建议使用 gjson 进行处理 57 | body := gjson.ParseBytes(r.Body) 58 | amount := body.Get("amount").Int() 59 | types := body.Get("types").Array() 60 | }) 61 | 62 | // 
请求语句要在 BeforeRequest 和 AfterResponse 后面调用 63 | crawler.Get("http://www.baidu.com") 64 | ``` 65 | 66 | ### 3 发送 Post 请求 67 | 68 | 与 Get 请求有一点不同,通常每个 Post 的请求的参数是不同的,而这些参数都在请求体中,在`BeforeRequest`中重新解析请求体获取关键参数虽然可以,但绝非最佳选择。所以在构造 Post 请求时,可以直接传入上下文,用以解决与响应的信息传递。 69 | 70 | #### 3.1 普通 POST 表单(application/x-www-form-urlencoded) 71 | 72 | ```go 73 | // BeforeRequest 可以在发送请求前,对请求进行一些修补 74 | crawler.BeforeRequest(func(r *predator.Request) { 75 | headers := map[string]string{ 76 | "Accept": "*/*", 77 | "Accept-Language": "zh-CN", 78 | "Accept-Encoding": "gzip, deflate", 79 | "X-Requested-With": "XMLHttpRequest", 80 | "Origin": "http://example.com", 81 | } 82 | 83 | r.SetHeaders(headers) 84 | }) 85 | 86 | crawler.AfterResponse(func(r *predator.Response) { 87 | // 从请求发送的上下文中取值 88 | id := r.Ctx.GetAny("id").(int) 89 | name := r.Ctx.Get("name") 90 | 91 | // 对于 json 响应,建议使用 gjson 进行处理 92 | body := gjson.ParseBytes(r.Body) 93 | amount := body.Get("amount").Int() 94 | types := body.Get("types").Array() 95 | }) 96 | 97 | 98 | body := map[string]string{"foo": "bar"} 99 | 100 | // 在 Post 请求中,应该将关键参数用这种方式放进上下文 101 | ctx, _ := context.AcquireCtx() 102 | ctx.Put("id", 10) 103 | ctx.Put("name", "tom") 104 | 105 | crawler.Post("http://www.baidu.com", body, ctx) 106 | ``` 107 | 108 | 如果不需要传入上下文,可以直接用`nil`代替: 109 | 110 | ```go 111 | crawler.Post("http://www.baidu.com", body, nil) 112 | ``` 113 | 114 | #### 3.2 复杂 POST 请求(multipart/form-data) 115 | 116 | `multipart/form-data`方法需要使用专门的`PostMultipart`方法,示例可能较长,这里不便书写。 117 | 118 | 使用方法请参考示例:https://github.com/go-predator/predator/blob/main/example/multipart/main.go 119 | 120 | #### 3.3 JSON 请求 121 | 122 | JSON 请求也有专门的方法`PostJSON`来完成,在使用`PostJSON`时会自动在请求头中添加`Content-Type: application/json`,无需重复添加。当然,你再重新添加一次也可以,最终将会使用你添加的`Content-Type`。 123 | 124 | 示例: 125 | 126 | ```go 127 | func main() { 128 | c := NewCrawler() 129 | 130 | c.AfterResponse(func(r *Response) { 131 | t.Log(r) 132 | }) 133 | 134 | type User struct { 135 | Name string `json:"name"` 
136 | Age int `json:"age"` 137 | } 138 | 139 | body := map[string]any{ 140 | "time": 156546535, 141 | "cid": "10_18772100220-1625540144276-302919", 142 | "args": []int{1, 2, 3, 4, 5}, 143 | "dict": map[string]string{ 144 | "mod": "1592215036_002", "extend1": "关注", "t": "1628346994", "eleTop": "778", 145 | }, 146 | "user": User{"Tom", 13}, 147 | } 148 | 149 | c.PostJSON("https://httpbin.org/post", body, nil) 150 | } 151 | ``` 152 | 153 | #### 3.4 其他 POST 请求 154 | 155 | 虽然以上三种方式已解决大部分的网站的请求,但仍然存在一小部分网站比较特殊,此时需要使用`PostRaw`方法: 156 | 157 | ```go 158 | func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error 159 | ``` 160 | 161 | 其中的请求体需要你自行构造,原始请求体可以是任何形式,构造完成后再序列化为`[]byte`作为请求体。 162 | 163 | ### 4 允许重定向 164 | 165 | 考虑到爬虫的效率问题,默认情况下是不允许重定向的。 166 | 167 | 但在正常的爬虫业务中难免遇到重定向问题,你可以根据每个请求的不同情况设置不同的最大重定向次数。 168 | 169 | ```go 170 | crawler.BeforeRequest(func(r *predator.Request) { 171 | // 用 GET 请求时可以根据 r.URL 判断,POST 请求时可以根据请求体判断,下面仅是示例 172 | if r.URL == 情况一 { 173 | // 允许重定向 1 次 174 | r.AllowRedirect(1) 175 | } else if r.URL == 情况二 { 176 | // 允许重定向 3 次 177 | r.AllowRedirect(3) 178 | } 179 | }) 180 | ``` 181 | 182 | 不允许设置全局重定向,只能针对每个请求进行修补。 183 | 184 | 当然,如果全局重定向呼声高的话,再考虑是否加入。 185 | 186 | ### 5 上下文 187 | 188 | 上下文是一个接口,我实现了两种上下文: 189 | 190 | - *ReadOp*:基于`sync.Map`实现,适用于读取上下文较多的场景 191 | - *WriteOp*:用`map`实现,适用于读写频率相差不大或写多于读的场景,这是默认采用的上下文 192 | 193 | 爬虫中如果遇到了读远多于写时就应该换`ReadOp`了,如下代码所示: 194 | 195 | ```go 196 | ctx, err := AcquireCtx(context.ReadOp) 197 | ``` 198 | 199 | ### 6 处理 HTML 200 | 201 | 爬虫的结果大体可分为两种,一是 HTML 响应,另一种是 JSON 格式的响应。 202 | 203 | 与 JSON 相比,HTML 需要更多的代码处理。 204 | 205 | 本框架对 HTML 处理进行了一些函数封装,能方便地通过 css selector 进行元素的查找,可以提取元素中的属性和文本等。 206 | 207 | 响应头必须是`text/html`或其扩展类型如`text/html; charset=utf-8`才能执行此方法。 208 | 209 | ```go 210 | crawl := NewCrawler() 211 | 212 | crawl.ParseHTML("body", func(he *html.HTMLElement) { 213 | // 元素内部 HTML 214 | h, err := he.InnerHTML() 215 | // 元素整体 HTML 216 | h, err := he.OuterHTML() 217 | // 元素内的文本(包括子元素的文本) 218 | 
he.Text() 219 | // 元素的属性 220 | he.Attr("class") 221 | // 第一个匹配的子元素 222 | he.FirstChild("p") 223 | // 最后一个匹配的子元素 224 | he.LastChild("p") 225 | // 第 2 个匹配的子元素 226 | he.Child("p", 2) 227 | // 第一个匹配的子元素的属性 228 | he.ChildAttr("p", "class") 229 | // 所有匹配到的子元素的属性切片 230 | he.ChildrenAttr("p", "class") 231 | } 232 | ``` 233 | 234 | ### 7 异步 / 多协程请求 235 | 236 | ```go 237 | c := NewCrawler( 238 | // 使用此 option 时自动使用指定数量的协程池发出请求,不使用此 option 则默认使用同步方式请求 239 | // 设置的数量不宜过少,也不宜过多,请自行测试设置不同数量时的效率 240 | WithConcurrency(30), 241 | ) 242 | 243 | c.AfterResponse(func(r *predator.Response) { 244 | // handle response 245 | }) 246 | 247 | for i := 0; i < 10; i++ { 248 | c.Post(ts.URL+"/post", map[string]string{ 249 | "id": fmt.Sprint(i + 1), 250 | }, nil) 251 | } 252 | 253 | c.Wait() 254 | ``` 255 | 256 | ### 8 使用缓存 257 | 258 | 默认情况下,缓存是不启用的,所有的请求都直接放行。 259 | 260 | 已经实现的缓存: 261 | 262 | - MySQL 263 | - PostgreSQL 264 | - Redis 265 | - SQLite3 266 | 267 | 缓存接口中有一个方法`Compressed(yes bool)`用来压缩响应的,毕竟有时,响应长度非常长,直接保存到数据库中会影响插入和查询时的性能。 268 | 269 | 这四个接口的使用方法示例: 270 | 271 | ```go 272 | // MySQL 273 | c := NewCrawler( 274 | WithCache(&cache.MySQLCache{ 275 | Host: "127.0.0.1", 276 | Port: "3306", 277 | Database: "predator", 278 | Username: "root", 279 | Password: "123456", 280 | }, false), // false 为关闭压缩,true 为开启压缩,下同 281 | ) 282 | 283 | // PostgreSQL 284 | c := NewCrawler( 285 | WithCache(&cache.PostgreSQLCache{ 286 | Host: "127.0.0.1", 287 | Port: "54322", 288 | Database: "predator", 289 | Username: "postgres", 290 | Password: "123456", 291 | }, false), 292 | ) 293 | 294 | // Redis 295 | c := NewCrawler( 296 | WithCache(&cache.RedisCache{ 297 | Addr: "localhost:6379", 298 | }, true), 299 | ) 300 | 301 | // SQLite3 302 | c := NewCrawler( 303 | WithCache(&cache.SQLiteCache{ 304 | URI: uri, // uri 为数据库存放的位置,尽量加上后缀名 .sqlite 305 | }, true), 306 | ) 307 | // 也可以使用默认值。WithCache 的第一个为 nil 时, 308 | // 默认使用 SQLite 作为缓存,且会将缓存保存在当前 309 | // 目录下的 predator-cache.sqlite 中 310 | c := NewCrawler(WithCache(nil, 
true)) 311 | ``` 312 | 313 | ### 9 代理 314 | 315 | 支持 HTTP 代理和 Socks5 代理。 316 | 317 | 使用代理时需要加上协议,如: 318 | 319 | ```go 320 | WithProxyPool([]string{"http://ip:port", "socks5://ip:port"}) 321 | ``` 322 | 323 | ### 10 日志 324 | 325 | 日志使用的是流行日志库[`zerolog`](https://github.com/rs/zerolog)。 326 | 327 | 默认情况下,日志是不开启的,需要手动开启。 328 | 329 | `WithLogger`选项需要填入一个参数`*predator.LogOp`,当填入`nil`时,默认会以`INFO`等级从终端美化输出。 330 | 331 | ```go 332 | crawler := predator.NewCrawler( 333 | predator.WithLogger(nil), 334 | ) 335 | ``` 336 | 337 | `predator.LogOp`对外公开四个方法: 338 | 339 | - *SetLevel*:设置日志等级。等级可选:`DEBUG`、`INFO`、`WARNING`、`ERROR`、`FATAL` 340 | 341 | ```go 342 | logOp := new(predator.LogOp) 343 | // 设置为 INFO 344 | logOp.SetLevel(log.INFO) 345 | ``` 346 | 347 | - *ToConsole*:美化输出到终端。 348 | 349 | - *ToFile*:JSON 格式输出到文件。 350 | 351 | - *ToConsoleAndFile*:既美化输出到终端,同时以 JSON 格式输出到文件。 352 | 353 | 日志的完整示例: 354 | 355 | ```go 356 | import "github.com/go-predator/predator/log" 357 | 358 | func main() { 359 | logOp := new(predator.LogOp) 360 | logOp.SetLevel(log.INFO) 361 | logOp.ToConsoleAndFile("test.log") 362 | 363 | crawler := predator.NewCrawler( 364 | predator.WithLogger(logOp), 365 | ) 366 | } 367 | ``` 368 | 369 | ### 11 关于 JSON 370 | 371 | 本来想着封装一个 JSON 包用来快速处理 JSON 响应,但是想了一两天也没想出个好办法来,因为我能想到的,[gjson](https://github.com/tidwall/gjson)都已经解决了。 372 | 373 | 对于 JSON 响应,能用`gjson`处理就不要老想着反序列化了。对于爬虫而言,反序列化是不明智的选择。 374 | 375 | 当然,如果你确实有反序列化的需求,也不要用标准库,使用封装的 JSON 包中的序列化和反序列化方法比标准库性能高。 376 | 377 | ```GO 378 | import "github.com/go-predator/predator/json" 379 | 380 | json.Marshal() 381 | json.Unmarshal() 382 | json.UnmarshalFromString() 383 | ``` 384 | 385 | 对付 JSON 响应,当前足够用了。 386 | 387 | ## 目标 388 | 389 | - [x] 完成对失败响应的重新请求,直到重试了传入的重试次数时才算最终请求失败 390 | - [x] 识别因代理失效而造成的请求失败。当使用代理池时,代理池中剔除此代理;代理池为空时,终止整个爬虫程序 391 | - 考虑到使用代理必然是因为不想将本地 ip 暴露给目标网站或服务器,所以在使用代理后,当所有代理都失效时,不再继续发出请求 392 | - [x] HTML 页面解析。方便定位查找元素 393 | - [x] json 扩展,用来处理、筛选 json 响应的数据,原生 json 库不适合用在爬虫上 394 | - 暂时没想到如何封装便捷好用的 json ,当前 json 
包中只能算是使用示例 395 | - [x] 协程池,实现在多协程时对每个 goroutine 的复用,避免重复创建 396 | - [x] 定义缓存接口,并完成一种或多种缓存。因为临时缓存在爬虫中并不实用,所以 predator 采用持久化缓存。 397 | - 默认使用 sqlite3 进行缓存,可以使用已实现的其他缓存数据库,也可以自己实现缓存接口 398 | - 可用缓存存储有 SQLite3、MySQL、PostgreSQL、Redis 399 | - 因为采用持久化缓存,所以不实现以内存作为缓存,如果需要请自行根据缓存接口实现 400 | - [x] 数据库管理接口,用来保存爬虫数据,并完成一种或多种数据库的管理 401 | - SQL 数据库接口已实现了,NoSQL 接口与 SQL 差别较大,就不实现了,如果有使用 NoSQL 的需求,请自己实现 402 | - 数据库接口没有封装在 Crawler 方法中,根据需要使用,一般场景下够用,复杂场景中仍然需要自己重写数据库管理 403 | - [x] 添加日志 404 | - 可能还不完善 405 | - [x] 为`Request`和`Response`的请求体`Body`添加池管理,减少 GC 次数 406 | - body 本身就是`[]byte`,作为引用类型,只要不删除引用关系,其内存就不会被回收 407 | - 将原求就不是`nil`的 body 截断为 `body[:0]` 即可,不需要使用池来管理 408 | - [x] 对于链式请求或多种请求,可对`POST`和`GET`设置不同的缓存字段 409 | - [x] 链式请求中可以对每个请求单独设置不同的缓存参数 410 | - [x] 声明一个代理api处理方法,参数为一个整型,可以请求代理池中代理的数量返回代理切片,形成代理池。后续可以每次请求一个代理,用于实时补全代理池。这个方法需用户自行实现。 411 | - [ ] 增加对 robots.txt 的判断,默认遵守 robots.txt 规则,但可以选择忽略 412 | -------------------------------------------------------------------------------- /async_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: async_test.go 5 | * @Created: 2021-07-31 13:14:09 6 | * @Modified: 2021-11-17 11:38:16 7 | */ 8 | 9 | package predator 10 | 11 | import ( 12 | "fmt" 13 | "math/rand" 14 | "strings" 15 | "testing" 16 | 17 | "github.com/go-predator/predator/context" 18 | . 
"github.com/smartystreets/goconvey/convey" 19 | ) 20 | 21 | func buildRequestBody(queryID string, page int) map[string]string { 22 | return map[string]string{ 23 | "id": queryID, 24 | "page": fmt.Sprint(page), 25 | "key1": "value1", 26 | "key2": "value2", 27 | "key3": "value3", 28 | "key4": "", 29 | } 30 | } 31 | 32 | func randomBoundary() string { 33 | var s strings.Builder 34 | count := 29 35 | for i := 0; i < count; i++ { 36 | if i == 0 { 37 | s.WriteString(fmt.Sprint(rand.Intn(9) + 1)) 38 | } else { 39 | s.WriteString(fmt.Sprint(rand.Intn(10))) 40 | } 41 | } 42 | return s.String() 43 | } 44 | 45 | func parsePerPage(c *Crawler, u, queryID string, page int) error { 46 | // 创造请求体 47 | body := buildRequestBody(queryID, page) 48 | form := NewMultipartForm( 49 | "-------------------", 50 | randomBoundary, 51 | ) 52 | for k, v := range body { 53 | form.AppendString(k, v) 54 | } 55 | 56 | // 将请求体中的关键参数传入上下文 57 | ctx, _ := context.NewContext() 58 | ctx.Put("qid", queryID) 59 | ctx.Put("page", page) 60 | 61 | return c.PostMultipart(u, form, ctx) 62 | } 63 | 64 | func testAsync(crawler *Crawler, t *testing.T) { 65 | crawler.BeforeRequest(func(r *Request) { 66 | headers := map[string]string{ 67 | "Accept": "*/*", 68 | "Accept-Language": "zh-CN", 69 | "Accept-Encoding": "gzip, deflate", 70 | } 71 | 72 | r.SetHeaders(headers) 73 | 74 | }) 75 | 76 | crawler.AfterResponse(func(r *Response) { 77 | qid := r.Ctx.Get("qid") 78 | page := r.Ctx.GetAny("page").(int) 79 | t.Logf("qid=%s page=%d", qid, page) 80 | }) 81 | 82 | // 请求多个分类的第一页内容 83 | for i := 0; i < 100; i++ { 84 | err := parsePerPage(crawler, "https://httpbin.org/post", fmt.Sprint(i+100), i+1) 85 | if err != nil { 86 | t.Error("爬取失败:", err) 87 | } 88 | } 89 | } 90 | 91 | func TestAsync(t *testing.T) { 92 | Convey("同步耗时", t, func() { 93 | defer timeCost()() 94 | crawler := NewCrawler( 95 | WithCache(nil, true, nil), 96 | WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"), 97 | ) 
98 | 99 | testAsync(crawler, t) 100 | crawler.ClearCache() 101 | }) 102 | 103 | Convey("异步耗时", t, func() { 104 | defer timeCost()() 105 | crawler := NewCrawler( 106 | WithCache(nil, true, nil), 107 | WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"), 108 | WithConcurrency(30, false), 109 | ) 110 | 111 | testAsync(crawler, t) 112 | 113 | crawler.Wait() 114 | crawler.ClearCache() 115 | }) 116 | } 117 | -------------------------------------------------------------------------------- /cache.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: cache.go 5 | * @Created: 2021-11-24 20:39:11 6 | * @Modified: 2022-10-30 20:01:00 7 | */ 8 | 9 | package predator 10 | 11 | import ( 12 | "fmt" 13 | "net/url" 14 | ) 15 | 16 | type Cache interface { 17 | // 是否开启压缩。压缩后能减小数据量,但压缩过程会耗时。 18 | // 如果原数据长度很长,压缩耗时要比查询耗时低得多,此时开启压缩功能是最佳选择。 19 | // 但如果原数据长度较短,压缩或不压缩,整体耗时区别不大。 20 | // 是否开启压缩,需要自行测试抉择。 21 | Compressed(yes bool) 22 | // 初始化,用来迁移数据库 / 表,和一些与数据库有关的前期准备工作 23 | Init() error 24 | // 当前请求是否已缓存过,如果缓存过,则返回缓存中的响应 25 | IsCached(key string) ([]byte, bool) 26 | // 将没有缓存过的请求保存到缓存中 27 | Cache(key string, val []byte) error 28 | // 清除全部缓存 29 | Clear() error 30 | } 31 | 32 | type CacheModel struct { 33 | Key string `gorm:"primaryKey"` 34 | Value []byte 35 | } 36 | 37 | func (CacheModel) TableName() string { 38 | return "predator-cache" 39 | } 40 | 41 | type cacheFieldType uint8 42 | 43 | const ( 44 | // A key or field from URL query parameters 45 | queryParam cacheFieldType = iota 46 | // A key or field from request body parameters 47 | requestBodyParam 48 | ) 49 | 50 | type CacheField struct { 51 | code cacheFieldType 52 | Field string 53 | } 54 | 55 | func (cf CacheField) String() string { 56 | return fmt.Sprintf("%d-%s", cf.code, cf.Field) 57 | } 58 | 59 | func addQueryParamCacheField(params url.Values, field CacheField) (string, string, 
error) { 60 | if val := params.Get(field.Field); val != "" { 61 | return field.String(), val, nil 62 | } else { 63 | // 如果设置了 cachedFields,但 url 查询参数中却没有某个 field,则报异常退出 64 | return "", "", fmt.Errorf("there is no such field [%s] in the query parameters: %v", field.Field, params.Encode()) 65 | } 66 | } 67 | 68 | func NewQueryParamField(field string) CacheField { 69 | return CacheField{queryParam, field} 70 | } 71 | 72 | func NewRequestBodyParamField(field string) CacheField { 73 | return CacheField{requestBodyParam, field} 74 | } 75 | -------------------------------------------------------------------------------- /context/api.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: api.go 5 | * @Created: 2021-07-24 08:55:30 6 | * @Modified: 2022-04-18 13:29:17 7 | */ 8 | 9 | package context 10 | 11 | import ( 12 | "fmt" 13 | "sync" 14 | ) 15 | 16 | type Context interface { 17 | // Get 通过 key 在上下文中获取一个字符串 18 | Get(key string) string 19 | // GetAny 通过 key 在上下文中获取一个任意类型 20 | GetAny(key string) any 21 | // Put 向上下文中传入一个 key: value 22 | Put(key string, val any) 23 | // GetAndDelete 获取并删除一个 key 24 | GetAndDelete(key string) any 25 | // Delete 在上下文中删除指定的 key 26 | Delete(key string) 27 | // ForEach 将上下文中的全部 key 和 value 用传 28 | // 入的函数处理后返回一个处理结果的切片 29 | ForEach(func(key string, val any) any) []any 30 | // Clear 清空一个上下文 31 | Clear() 32 | // Length 返回上下文的长度 33 | Length() int 34 | // String 将上下文转换为 json(非标准) 字符串 35 | String() string 36 | // Bytes 格式化为 json 用的字节切片 37 | Bytes() []byte 38 | } 39 | 40 | // 上下文类型 41 | type CtxOp int 42 | 43 | const ( 44 | // 以读为主的上下文, 45 | // 适用于读操作远多于写的场景 46 | ReadOp CtxOp = iota 47 | // 适用于读写各半或写多于读的场景 48 | WriteOp 49 | ) 50 | 51 | var ctxPool sync.Pool 52 | 53 | // AcquireCtx returns an empty Context instance from context pool. 54 | // 55 | // The returned Context instance may be passed to ReleaseCtx when it is 56 | // no longer needed. 
This allows Context recycling, reduces GC pressure 57 | // and usually improves performance. 58 | func AcquireCtx(ops ...CtxOp) (Context, error) { 59 | if len(ops) > 1 { 60 | return nil, fmt.Errorf("only 1 op can be passed in as most, but you passed %d ops", len(ops)) 61 | } 62 | v := ctxPool.Get() 63 | if v == nil { 64 | return NewContext(ops...) 65 | } 66 | return v.(Context), nil 67 | } 68 | 69 | // ReleaseCtx returns ctx acquired via AcquireCtx to Context pool. 70 | // 71 | // It is forbidden accessing ctx and/or its' members after returning 72 | // it to Context pool. 73 | func ReleaseCtx(ctx Context) { 74 | ctx.Clear() 75 | ctxPool.Put(ctx) 76 | } 77 | 78 | // NewContext returns a new Context instance 79 | func NewContext(ops ...CtxOp) (Context, error) { 80 | if len(ops) > 1 { 81 | return nil, fmt.Errorf("only 1 op can be passed in as most, but you passed %d ops", len(ops)) 82 | } 83 | 84 | var op CtxOp 85 | if len(ops) == 0 { 86 | op = WriteOp 87 | } else { 88 | op = ops[0] 89 | } 90 | 91 | switch op { 92 | case ReadOp: 93 | return &rcontext{}, nil 94 | case WriteOp: 95 | return &wcontext{ 96 | m: make(map[string]any), 97 | l: &sync.RWMutex{}, 98 | }, nil 99 | default: 100 | return nil, fmt.Errorf("unkown op: %d", op) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /context/context_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: context_test.go 5 | * @Created: 2021-07-24 12:18:30 6 | * @Modified: 2022-04-18 13:29:16 7 | */ 8 | 9 | package context 10 | 11 | import ( 12 | "bytes" 13 | "testing" 14 | 15 | . 
"github.com/smartystreets/goconvey/convey" 16 | ) 17 | 18 | func putSomeCtx(ctx Context) { 19 | ctx.Put("four", 4) 20 | ctx.Put("five", "5") 21 | ctx.Put("six", '6') 22 | ctx.Put("seven", []int{7}) 23 | ctx.Put("eight", [1]int{8}) 24 | } 25 | 26 | func ctxTest(ctx Context) { 27 | Convey("添加整数", func() { 28 | ctx.Put("one", 1) 29 | Convey("获取整数", func() { 30 | val := ctx.GetAny("one").(int) 31 | So(val, ShouldEqual, 1) 32 | Convey("删除添加的整数", func() { 33 | ctx.Delete("one") 34 | val := ctx.GetAny("one") 35 | So(val, ShouldBeNil) 36 | }) 37 | }) 38 | }) 39 | Convey("添加字符串", func() { 40 | ctx.Put("two", "2") 41 | Convey("获取字符串", func() { 42 | val := ctx.Get("two") 43 | So(val, ShouldEqual, "2") 44 | Convey("获取并删除添加的字符串", func() { 45 | deleted := ctx.GetAndDelete("two") 46 | val := ctx.Get("two") 47 | So(deleted.(string), ShouldEqual, "2") 48 | So(val, ShouldEqual, "") 49 | }) 50 | }) 51 | }) 52 | Convey("添加字节切片", func() { 53 | ctx.Put("three", []byte("3")) 54 | Convey("获取字节切片", func() { 55 | val := ctx.GetAny("three").([]byte) 56 | So(bytes.Equal(val, []byte("3")), ShouldBeTrue) 57 | }) 58 | Convey("遍历上下文", func() { 59 | ctx.Put("four", 4) 60 | ctx.Put("five", "5") 61 | ctx.Put("six", '6') 62 | ctx.Put("seven", []int{7}) 63 | ctx.Put("eight", [1]int{8}) 64 | val := ctx.ForEach(func(key string, val any) any { 65 | return val 66 | }) 67 | So(len(val), ShouldEqual, 6) 68 | }) 69 | }) 70 | Convey("上下文长度", func() { 71 | putSomeCtx(ctx) 72 | So(ctx.Length(), ShouldEqual, 5) 73 | }) 74 | Convey("清空上下文", func() { 75 | putSomeCtx(ctx) 76 | ctx.Clear() 77 | So(ctx.Length(), ShouldEqual, 0) 78 | }) 79 | } 80 | 81 | func TestContext(t *testing.T) { 82 | Convey("上下文测试", t, func() { 83 | Convey("读上下文测试", func() { 84 | ctx, _ := NewContext(ReadOp) 85 | ctxTest(ctx) 86 | }) 87 | Convey("写上下文测试", func() { 88 | ctx, _ := NewContext(WriteOp) 89 | ctxTest(ctx) 90 | }) 91 | }) 92 | } 93 | -------------------------------------------------------------------------------- /context/read.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: read.go 5 | * @Created: 2021-07-24 08:56:04 6 | * @Modified: 2022-04-18 13:29:10 7 | */ 8 | 9 | package context 10 | 11 | import ( 12 | "bytes" 13 | "fmt" 14 | "strings" 15 | "sync" 16 | ) 17 | 18 | type rcontext struct { 19 | sync.Map 20 | } 21 | 22 | func (r *rcontext) GetAny(key string) any { 23 | val, ok := r.Load(key) 24 | if ok { 25 | return val 26 | } 27 | return nil 28 | } 29 | 30 | func (r *rcontext) Get(key string) string { 31 | val := r.GetAny(key) 32 | if val == nil { 33 | return "" 34 | } 35 | return val.(string) 36 | } 37 | 38 | func (r *rcontext) Put(key string, val any) { 39 | r.Store(key, val) 40 | } 41 | 42 | func (r *rcontext) GetAndDelete(key string) any { 43 | val, ok := r.LoadAndDelete(key) 44 | if ok { 45 | return val 46 | } 47 | return nil 48 | } 49 | 50 | func (r *rcontext) Delete(key string) { 51 | r.Map.Delete(key) 52 | } 53 | 54 | func (r *rcontext) ForEach(f func(key string, val any) any) []any { 55 | result := make([]any, 0, r.Length()) 56 | r.Range(func(key, value any) bool { 57 | result = append(result, f(key.(string), value)) 58 | return true 59 | }) 60 | return result 61 | } 62 | 63 | func (r *rcontext) Clear() { 64 | r.Range(func(key, value any) bool { 65 | r.Map.Delete(key) 66 | return true 67 | }) 68 | } 69 | 70 | func (r *rcontext) Length() int { 71 | l := 0 72 | r.Range(func(key, value any) bool { 73 | l++ 74 | return true 75 | }) 76 | return l 77 | } 78 | 79 | func (r *rcontext) Bytes() []byte { 80 | var b bytes.Buffer 81 | b.WriteByte('{') 82 | i := 0 83 | r.Range(func(key, value any) bool { 84 | if i > 0 { 85 | b.WriteString(`, `) 86 | } 87 | b.WriteByte('"') 88 | b.WriteString(key.(string)) 89 | b.WriteString(`": "`) 90 | b.WriteString(fmt.Sprint(value)) 91 | b.WriteByte('"') 92 | i++ 93 | return true 94 | }) 95 | b.WriteByte('}') 96 | return b.Bytes() 97 | } 98 | 99 | 
func (r *rcontext) String() string { 100 | var s strings.Builder 101 | s.WriteByte('{') 102 | i := 0 103 | r.Range(func(key, value any) bool { 104 | if i > 0 { 105 | s.WriteString(`, `) 106 | } 107 | s.WriteByte('"') 108 | s.WriteString(key.(string)) 109 | s.WriteString(`": "`) 110 | s.WriteString(fmt.Sprint(value)) 111 | s.WriteByte('"') 112 | i++ 113 | return true 114 | }) 115 | s.WriteByte('}') 116 | return strings.ReplaceAll(s.String(), ", }", "}") 117 | } 118 | -------------------------------------------------------------------------------- /context/write.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: write.go 5 | * @Created: 2021-07-24 08:56:16 6 | * @Modified: 2022-04-18 13:28:52 7 | */ 8 | 9 | package context 10 | 11 | import ( 12 | "bytes" 13 | "fmt" 14 | "strings" 15 | "sync" 16 | ) 17 | 18 | type wcontext struct { 19 | m map[string]any 20 | l *sync.RWMutex 21 | } 22 | 23 | func (w *wcontext) GetAny(key string) any { 24 | w.l.RLock() 25 | defer w.l.RUnlock() 26 | 27 | if v, ok := w.m[key]; ok { 28 | return v 29 | } 30 | return nil 31 | } 32 | 33 | func (w *wcontext) Get(key string) string { 34 | val := w.GetAny(key) 35 | if val == nil { 36 | return "" 37 | } 38 | return val.(string) 39 | } 40 | 41 | func (w *wcontext) Put(key string, val any) { 42 | w.l.Lock() 43 | w.m[key] = val 44 | w.l.Unlock() 45 | } 46 | 47 | func (w *wcontext) GetAndDelete(key string) any { 48 | w.l.Lock() 49 | defer w.l.Unlock() 50 | 51 | v, ok := w.m[key] 52 | if !ok { 53 | return nil 54 | } 55 | 56 | delete(w.m, key) 57 | 58 | return v 59 | } 60 | 61 | func (w *wcontext) Delete(key string) { 62 | w.GetAndDelete(key) 63 | } 64 | 65 | // ForEach 将上下文中的全部 key 和 value 用传入的函数处理后返回一个处理结果的切片 66 | func (w *wcontext) ForEach(f func(key string, val any) any) []any { 67 | w.l.RLock() 68 | defer w.l.RUnlock() 69 | 70 | result := make([]any, 0, len(w.m)) 71 | for k, v := range w.m 
{ 72 | result = append(result, f(k, v)) 73 | } 74 | return result 75 | } 76 | 77 | func (w *wcontext) Clear() { 78 | w.l.Lock() 79 | // 不需要释放内存,而是应该复用内存,频繁地申请内存是不必要的 80 | for k := range w.m { 81 | delete(w.m, k) 82 | } 83 | w.l.Unlock() 84 | } 85 | 86 | func (w *wcontext) Length() int { 87 | w.l.RLock() 88 | defer w.l.RUnlock() 89 | 90 | return len(w.m) 91 | } 92 | 93 | func (w *wcontext) Bytes() []byte { 94 | w.l.RLock() 95 | defer w.l.RUnlock() 96 | 97 | var b bytes.Buffer 98 | b.WriteByte('{') 99 | i := 0 100 | for k, v := range w.m { 101 | if i > 0 { 102 | b.WriteString(`, `) 103 | } 104 | b.WriteByte('"') 105 | b.WriteString(k) 106 | b.WriteString(`": "`) 107 | b.WriteString(fmt.Sprint(v)) 108 | b.WriteByte('"') 109 | i++ 110 | } 111 | b.WriteByte('}') 112 | return b.Bytes() 113 | } 114 | 115 | func (w *wcontext) String() string { 116 | w.l.RLock() 117 | defer w.l.RUnlock() 118 | 119 | var s strings.Builder 120 | s.WriteByte('{') 121 | i := 0 122 | for k, v := range w.m { 123 | if i > 0 { 124 | s.WriteString(`, `) 125 | } 126 | s.WriteByte('"') 127 | s.WriteString(k) 128 | s.WriteString(`": "`) 129 | s.WriteString(fmt.Sprint(v)) 130 | s.WriteByte('"') 131 | i++ 132 | } 133 | s.WriteByte('}') 134 | return s.String() 135 | } 136 | -------------------------------------------------------------------------------- /craw.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: craw.go 5 | * @Created: 2021-07-23 08:52:17 6 | * @Modified: 2022-11-29 16:13:58 7 | */ 8 | 9 | package predator 10 | 11 | import ( 12 | "context" 13 | "errors" 14 | "fmt" 15 | "math/rand" 16 | "net" 17 | "net/url" 18 | "strings" 19 | "sync" 20 | "sync/atomic" 21 | "time" 22 | 23 | "github.com/PuerkitoBio/goquery" 24 | "github.com/go-predator/log" 25 | pctx "github.com/go-predator/predator/context" 26 | "github.com/go-predator/predator/html" 27 | "github.com/go-predator/predator/json" 28 | 
"github.com/go-predator/predator/proxy" 29 | "github.com/valyala/fasthttp" 30 | ) 31 | 32 | // HandleRequest is used to patch the request 33 | type HandleRequest func(r *Request) 34 | 35 | // HandleResponse is used to handle the response 36 | type HandleResponse func(r *Response) 37 | 38 | // HandleHTML is used to process html 39 | type HandleHTML func(he *html.HTMLElement, r *Response) 40 | 41 | type HandleJSON func(j json.JSONResult, r *Response) 42 | 43 | // HTMLParser is used to parse html 44 | type HTMLParser struct { 45 | Selector string 46 | Handle HandleHTML 47 | } 48 | 49 | // JSONParser is used to parse json 50 | type JSONParser struct { 51 | strict bool 52 | Handle HandleJSON 53 | } 54 | 55 | // CustomRandomBoundary generates a custom boundary 56 | type CustomRandomBoundary func() string 57 | 58 | type CacheCondition func(r *Response) bool 59 | 60 | type ProxyInvalidCondition func(r *Response) error 61 | 62 | type ComplementProxyPool func() []string 63 | 64 | // Crawler is the provider of crawlers 65 | type Crawler struct { 66 | lock *sync.RWMutex 67 | // UserAgent is the User-Agent string used by HTTP requests 68 | UserAgent string 69 | retryCount uint32 70 | // Retry condition, the crawler will retry only 71 | // if it returns true 72 | retryCondition RetryCondition 73 | client *fasthttp.Client 74 | cookies map[string]string 75 | goPool *Pool 76 | proxyURLPool []string 77 | proxyInvalidCondition ProxyInvalidCondition 78 | proxyInUse string 79 | complementProxyPool ComplementProxyPool 80 | requestCount uint32 81 | responseCount uint32 82 | // TODO: 在多协程中这个上下文管理可以用来退出或取消多个协程 83 | Context context.Context 84 | 85 | // Cache successful response 86 | cache Cache 87 | // List of fields to be cached in the request body, and 88 | // the combination of these fields can represent the unique 89 | // request body. 90 | // The fewer fields the better. 
91 | cacheFields []CacheField 92 | cacheCondition CacheCondition 93 | 94 | requestHandler []HandleRequest 95 | 96 | // Array of functions to handle the response 97 | responseHandler []HandleResponse 98 | // Array of functions to handle parsed html 99 | htmlHandler []*HTMLParser 100 | jsonHandler []*JSONParser 101 | 102 | wg *sync.WaitGroup 103 | 104 | log *log.Logger 105 | } 106 | 107 | // NewCrawler creates a new Crawler instance with some CrawlerOptions 108 | func NewCrawler(opts ...CrawlerOption) *Crawler { 109 | c := new(Crawler) 110 | 111 | c.UserAgent = "Predator" 112 | 113 | c.client = new(fasthttp.Client) 114 | 115 | for _, op := range opts { 116 | op(c) 117 | } 118 | 119 | // If there is `DEBUG` in the environment variable and `c.log` is nil, 120 | // create a logger with a level of `DEBUG` 121 | if c.log == nil && log.IsDebug() { 122 | c.log = log.NewLogger( 123 | log.DEBUG, 124 | log.ToConsole(), 125 | ) 126 | } 127 | 128 | c.lock = &sync.RWMutex{} 129 | 130 | c.Context = context.Background() 131 | 132 | capacityState := c.goPool != nil 133 | 134 | if c.log != nil { 135 | if capacityState { 136 | c.Info("concurrent", log.Arg{Key: "state", Value: capacityState}, log.Arg{Key: "capacity", Value: c.goPool.capacity}) 137 | } else { 138 | c.Info("concurrent", log.Arg{Key: "state", Value: capacityState}) 139 | } 140 | } 141 | 142 | if c.log != nil && c.goPool != nil { 143 | c.goPool.log = c.log 144 | } 145 | 146 | return c 147 | } 148 | 149 | // Clone creates an exact copy of a Crawler without callbacks. 
150 | func (c *Crawler) Clone() *Crawler { 151 | var ( 152 | pool *Pool 153 | err error 154 | ) 155 | if c.goPool == nil { 156 | pool = nil 157 | } else { 158 | pool, err = NewPool(c.goPool.capacity) 159 | if err != nil { 160 | c.FatalOrPanic(err) 161 | } 162 | } 163 | return &Crawler{ 164 | lock: c.lock, 165 | UserAgent: c.UserAgent, 166 | retryCount: c.retryCount, 167 | retryCondition: c.retryCondition, 168 | client: c.client, 169 | cookies: c.cookies, 170 | goPool: pool, 171 | proxyURLPool: c.proxyURLPool, 172 | Context: c.Context, 173 | cache: c.cache, 174 | cacheCondition: c.cacheCondition, 175 | cacheFields: c.cacheFields, 176 | requestHandler: make([]HandleRequest, 0, 5), 177 | responseHandler: make([]HandleResponse, 0, 5), 178 | htmlHandler: make([]*HTMLParser, 0, 5), 179 | jsonHandler: make([]*JSONParser, 0, 1), 180 | wg: &sync.WaitGroup{}, 181 | log: c.log, 182 | } 183 | } 184 | 185 | /************************* http 请求方法 ****************************/ 186 | 187 | func (c *Crawler) request(method, URL string, body []byte, cachedMap map[string]string, reqHeader *fasthttp.RequestHeader, ctx pctx.Context, isChained bool) error { 188 | defer func() { 189 | if c.goPool != nil { 190 | if err := recover(); err != nil { 191 | c.FatalOrPanic(fmt.Errorf("worker panic: %s", err)) 192 | } 193 | } 194 | }() 195 | 196 | var err error 197 | 198 | reqHeader.SetMethod(method) 199 | if reqHeader.UserAgent() == nil { 200 | reqHeader.SetUserAgent(c.UserAgent) 201 | } 202 | 203 | if c.cookies != nil { 204 | for k, v := range c.cookies { 205 | reqHeader.SetCookie(k, v) 206 | } 207 | if c.log != nil { 208 | c.Debug("cookies is set", log.Arg{Key: "cookies", Value: reqHeader.Peek("Cookie")}) 209 | } 210 | } 211 | 212 | if ctx == nil { 213 | ctx, err = pctx.AcquireCtx() 214 | if err != nil { 215 | if c.log != nil { 216 | c.log.Error(err) 217 | } 218 | return err 219 | } 220 | } 221 | 222 | u, err := url.Parse(URL) 223 | if err != nil { 224 | return err 225 | } 226 | // Convert 
non-ascii characters in query parameters to ascii characters 227 | u.RawQuery = u.Query().Encode() 228 | 229 | uri := fasthttp.AcquireURI() 230 | uri.Parse([]byte(u.Host), []byte(u.String())) 231 | 232 | request := AcquireRequest() 233 | request.Headers = reqHeader 234 | request.Ctx = ctx 235 | request.Body = body 236 | request.cachedMap = cachedMap 237 | request.ID = atomic.AddUint32(&c.requestCount, 1) 238 | request.crawler = c 239 | request.uri = uri 240 | 241 | if c.goPool != nil { 242 | c.wg.Add(1) 243 | task := &Task{ 244 | crawler: c, 245 | req: request, 246 | isChained: isChained, 247 | } 248 | err = c.goPool.Put(task) 249 | if err != nil { 250 | if c.log != nil { 251 | c.log.Error(err) 252 | } 253 | return err 254 | } 255 | return nil 256 | } 257 | 258 | err = c.prepare(request, isChained) 259 | if err != nil { 260 | return err 261 | } 262 | 263 | return nil 264 | } 265 | 266 | func (c *Crawler) prepare(request *Request, isChained bool) (err error) { 267 | if c.goPool != nil { 268 | defer c.wg.Done() 269 | } 270 | 271 | c.processRequestHandler(request) 272 | 273 | if request.abort { 274 | if c.log != nil { 275 | c.Debug("the request is aborted", log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)}) 276 | } 277 | return 278 | } 279 | 280 | if c.log != nil { 281 | c.Info( 282 | "requesting", 283 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)}, 284 | log.Arg{Key: "method", Value: request.Method()}, 285 | log.Arg{Key: "url", Value: request.URL()}, 286 | log.Arg{Key: "timeout", Value: request.timeout.String()}, 287 | ) 288 | } 289 | 290 | if request.Ctx.Length() > 0 { 291 | if c.log != nil { 292 | c.Debug("using context", log.Arg{Key: "context", Value: request.Ctx.String()}) 293 | } 294 | } 295 | 296 | var response *Response 297 | 298 | var key string 299 | 300 | if c.cache != nil { 301 | key, err = request.Hash() 302 | if err != nil { 303 | if c.log != nil { 304 | c.log.Error(err) 305 | } 306 | return 307 | } 308 | 309 | if 
c.log != nil { 310 | c.Debug( 311 | "generate cache key", 312 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)}, 313 | log.Arg{Key: "cache_key", Value: key}, 314 | ) 315 | } 316 | 317 | response, err = c.checkCache(key) 318 | if err != nil { 319 | return 320 | } 321 | 322 | if response != nil && c.log != nil { 323 | c.log.Debug("response is in the cache", 324 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)}, 325 | log.Arg{Key: "cache_key", Value: key}, 326 | ) 327 | } 328 | } 329 | 330 | var rawResp *fasthttp.Response 331 | // A new request is issued when there 332 | // is no response from the cache 333 | if response == nil { 334 | response, rawResp, err = c.do(request) 335 | if err != nil { 336 | return 337 | } 338 | 339 | // Cache the response from the request if the statuscode is 20X 340 | if c.cache != nil && c.cacheCondition(response) && key != "" { 341 | cacheVal, err := response.Marshal() 342 | if err != nil { 343 | if c.log != nil { 344 | c.log.Error(err) 345 | } 346 | return err 347 | } 348 | 349 | if cacheVal != nil { 350 | c.lock.Lock() 351 | err = c.cache.Cache(key, cacheVal) 352 | if err != nil { 353 | if c.log != nil { 354 | c.log.Error(err) 355 | } 356 | return err 357 | } 358 | c.lock.Unlock() 359 | } 360 | } 361 | } else { 362 | response.Request = request 363 | response.Ctx = request.Ctx 364 | } 365 | 366 | if response.StatusCode == fasthttp.StatusFound { 367 | location := response.Headers.Peek("location") 368 | 369 | if c.log != nil { 370 | c.log.Info("response", 371 | log.Arg{Key: "method", Value: request.Method()}, 372 | log.Arg{Key: "status_code", Value: response.StatusCode}, 373 | log.Arg{Key: "location", Value: string(location)}, 374 | log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)}, 375 | ) 376 | } 377 | } else { 378 | if c.log != nil { 379 | l := c.log.L.Info(). 380 | Str("method", request.Method()). 
381 | Int("status_code", response.StatusCode) 382 | 383 | if !response.FromCache { 384 | if c.ProxyPoolAmount() > 0 { 385 | l = l.Str("proxy", response.ClientIP()) 386 | } else { 387 | l = l.Str("server_addr", response.ClientIP()) 388 | } 389 | } 390 | 391 | l.Bool("from_cache", response.FromCache). 392 | Uint32("request_id", atomic.LoadUint32(&request.ID)). 393 | Msg("response") 394 | } 395 | } 396 | 397 | c.processResponseHandler(response) 398 | 399 | if !response.invalid { 400 | err = c.processHTMLHandler(response) 401 | if err != nil { 402 | return 403 | } 404 | 405 | c.processJSONHandler(response) 406 | } 407 | 408 | ReleaseResponse(response, !isChained) 409 | if rawResp != nil { 410 | // 原始响应应该在自定义响应之后释放,不然一些字段的值会出错 411 | fasthttp.ReleaseResponse(rawResp) 412 | } 413 | 414 | return 415 | } 416 | 417 | func (c *Crawler) FatalOrPanic(err error) { 418 | if c.log != nil { 419 | c.Fatal(err) 420 | } else { 421 | panic(err) 422 | } 423 | } 424 | 425 | func (c *Crawler) checkCache(key string) (*Response, error) { 426 | var err error 427 | cachedBody, ok := c.cache.IsCached(key) 428 | if !ok { 429 | return nil, nil 430 | } 431 | 432 | resp := new(Response) 433 | err = resp.Unmarshal(cachedBody) 434 | if err != nil { 435 | if c.log != nil { 436 | c.log.Error(err) 437 | } 438 | return nil, err 439 | } 440 | resp.FromCache = true 441 | return resp, nil 442 | } 443 | 444 | func newFasthttpRequest(request *Request) *fasthttp.Request { 445 | req := fasthttp.AcquireRequest() 446 | 447 | request.Headers.CopyTo(&req.Header) 448 | req.SetURI(request.uri) 449 | 450 | if request.Method() == MethodPost { 451 | req.SetBody(request.Body) 452 | } 453 | 454 | if request.Method() == MethodPost && req.Header.ContentType() == nil { 455 | req.Header.SetContentType("application/x-www-form-urlencoded") 456 | } 457 | 458 | if req.Header.Peek("Accept") == nil { 459 | req.Header.Set("Accept", "*/*") 460 | } 461 | 462 | uri := req.URI() 463 | if len(req.Header.Host()) == 0 { 464 | host := 
uri.Host() 465 | req.Header.SetHostBytes(host) 466 | } 467 | req.Header.SetRequestURIBytes(uri.RequestURI()) 468 | 469 | return req 470 | } 471 | 472 | func (c *Crawler) do(request *Request) (*Response, *fasthttp.Response, error) { 473 | req := newFasthttpRequest(request) 474 | 475 | if len(c.proxyURLPool) > 0 { 476 | rand.Seed(time.Now().UnixMicro()) 477 | 478 | c.lock.Lock() 479 | c.client.Dial = func(addr string) (net.Conn, error) { 480 | return c.ProxyDialerWithTimeout(c.proxyURLPool[rand.Intn(len(c.proxyURLPool))], request.timeout)(addr) 481 | } 482 | c.lock.Unlock() 483 | c.Debug("request infomation", log.Arg{Key: "header", Value: req.Header.String()}, log.Arg{Key: "proxy", Value: c.ProxyInUse()}) 484 | } else { 485 | c.Debug("request infomation", log.Arg{Key: "header", Value: req.Header.String()}) 486 | } 487 | 488 | var err error 489 | 490 | resp := fasthttp.AcquireResponse() 491 | 492 | if request.maxRedirectsCount == 0 { 493 | if c.ProxyPoolAmount() > 0 { 494 | req.SetConnectionClose() 495 | } 496 | 497 | if request.timeout > 0 { 498 | err = c.client.DoTimeout(req, resp, request.timeout) 499 | } else { 500 | err = c.client.Do(req, resp) 501 | } 502 | } else { 503 | err = c.client.DoRedirects(req, resp, int(request.maxRedirectsCount)) 504 | } 505 | req.Header.CopyTo(request.Headers) 506 | 507 | response := AcquireResponse() 508 | response.StatusCode = resp.StatusCode() 509 | response.Body = append(response.Body, resp.Body()...) 
510 | response.Ctx = request.Ctx 511 | response.Request = request 512 | resp.Header.CopyTo(&response.Headers) 513 | response.clientIP = resp.RemoteAddr() 514 | response.localIP = resp.LocalAddr() 515 | 516 | if response.StatusCode == fasthttp.StatusOK && len(response.Body) == 0 { 517 | // fasthttp.Response 会将空响应的状态码设置为 200,这不合理 518 | response.StatusCode = 0 519 | } 520 | 521 | if x, ok := err.(interface{ Timeout() bool }); ok && x.Timeout() { 522 | response.timeout = true 523 | err = ErrTimeout 524 | } 525 | 526 | if err == nil || err == ErrTimeout || err == fasthttp.ErrDialTimeout { 527 | if c.ProxyPoolAmount() > 0 && c.proxyInvalidCondition != nil { 528 | e := c.proxyInvalidCondition(response) 529 | if e != nil { 530 | err = e 531 | } 532 | } 533 | } 534 | 535 | if err != nil { 536 | if p, ok := proxy.IsProxyError(err); ok { 537 | c.Warning("proxy is invalid", 538 | log.Arg{Key: "proxy", Value: p}, 539 | log.Arg{Key: "proxy_pool", Value: c.proxyURLPool}, 540 | log.Arg{Key: "msg", Value: err}, 541 | ) 542 | 543 | err = c.removeInvalidProxy(p) 544 | if err != nil { 545 | c.FatalOrPanic(err) 546 | } 547 | 548 | c.Info("removed invalid proxy", 549 | log.Arg{Key: "invalid_proxy", Value: p}, 550 | log.Arg{Key: "new_proxy_pool", Value: c.proxyURLPool}, 551 | ) 552 | 553 | fasthttp.ReleaseRequest(req) 554 | fasthttp.ReleaseResponse(resp) 555 | 556 | return c.do(request) 557 | } else { 558 | if err == ErrTimeout || err == fasthttp.ErrDialTimeout { 559 | // re-request if the request timed out. 560 | // re-request 3 times by default when the request times out. 
561 | 562 | // if you are using a proxy, the timeout error is probably 563 | // because the proxy is invalid, and it is recommended 564 | // to try a new proxy 565 | if c.retryCount == 0 { 566 | c.retryCount = 3 567 | } 568 | 569 | c.Error(err, log.Arg{Key: "timeout", Value: request.timeout.String()}, log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)}) 570 | 571 | if atomic.LoadUint32(&request.retryCounter) < c.retryCount { 572 | c.retryPrepare(request, req, resp) 573 | return c.do(request) 574 | } 575 | fasthttp.ReleaseRequest(req) 576 | fasthttp.ReleaseResponse(resp) 577 | ReleaseResponse(response, true) 578 | 579 | return nil, nil, ErrTimeout 580 | } else { 581 | if err == fasthttp.ErrConnectionClosed { 582 | // Feature error of fasthttp, there is no solution yet, only try again if c.retryCount > 0 or panic 583 | c.Error(err, log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)}) 584 | 585 | if c.retryCount == 0 { 586 | c.retryCount = 1 587 | } 588 | 589 | if atomic.LoadUint32(&request.retryCounter) < c.retryCount { 590 | c.retryPrepare(request, req, resp) 591 | return c.do(request) 592 | } 593 | } 594 | c.FatalOrPanic(err) 595 | return nil, nil, err 596 | } 597 | } 598 | } 599 | 600 | c.Debug("response header", log.Arg{Key: "header", Value: resp.Header.String()}) 601 | 602 | // Only count successful responses 603 | atomic.AddUint32(&c.responseCount, 1) 604 | // release req 605 | fasthttp.ReleaseRequest(req) 606 | 607 | if c.retryCount > 0 && atomic.LoadUint32(&request.retryCounter) < c.retryCount { 608 | if c.retryCondition != nil && c.retryCondition(response) { 609 | c.Warning("the response meets the retry condition and will be retried soon") 610 | c.retryPrepare(request, req, resp) 611 | return c.do(request) 612 | } 613 | } 614 | 615 | return response, resp, nil 616 | } 617 | 618 | func (c *Crawler) retryPrepare(request *Request, req *fasthttp.Request, resp *fasthttp.Response) { 619 | atomic.AddUint32(&request.retryCounter, 1) 620 | 
	c.Info(
		"retrying",
		log.Arg{Key: "retry_count", Value: atomic.LoadUint32(&request.retryCounter)},
		log.Arg{Key: "method", Value: request.Method()},
		log.Arg{Key: "url", Value: request.URL()},
		log.Arg{Key: "request_id", Value: atomic.LoadUint32(&request.ID)},
	)
	fasthttp.ReleaseRequest(req)
	fasthttp.ReleaseResponse(resp)
}

// createBody url-encodes requestData into a form body
// (`application/x-www-form-urlencoded`). Returns nil for a nil map.
func createBody(requestData map[string]string) []byte {
	if requestData == nil {
		return nil
	}
	form := url.Values{}
	for k, v := range requestData {
		form.Add(k, v)
	}
	return []byte(form.Encode())
}

// NewRequestHeaders builds a freshly allocated fasthttp request header
// from a plain map. Unlike setRequestHeaders it does not use the
// header pool, so the result need not be released.
func NewRequestHeaders(headers map[string]string) *fasthttp.RequestHeader {
	reqHeaders := new(fasthttp.RequestHeader)

	for k, v := range headers {
		reqHeaders.Set(k, v)
	}

	return reqHeaders
}

// setRequestHeaders copies headers into a header object taken from the
// pool (presumably recycled later by the request pipeline — see
// AcquireRequestHeader's counterpart release).
func setRequestHeaders(headers map[string]string) *fasthttp.RequestHeader {
	header := AcquireRequestHeader()
	for k, v := range headers {
		header.Set(k, v)
	}

	return header
}

// get performs a GET request. cacheFields (query-parameter fields only)
// select which parts of the URL identify the request in the cache.
func (c *Crawler) get(URL string, headers map[string]string, ctx pctx.Context, isChained bool, cacheFields ...CacheField) error {
	// Parse the query parameters and create a `cachedMap` based on `cacheFields`
	u, err := url.Parse(URL)
	if err != nil {
		c.Error(err)
		return err
	}

	params := u.Query()
	var cachedMap map[string]string
	if len(cacheFields) > 0 {
		cachedMap = make(map[string]string)
		for _, field := range cacheFields {
			// GET requests have no body, so only query-parameter cache
			// fields are permitted.
			if field.code != queryParam {
				c.FatalOrPanic(ErrNotAllowedCacheFieldType)
			}

			key, value, err := addQueryParamCacheField(params, field)
			if err != nil {
				c.FatalOrPanic(err)
			}

			cachedMap[key] = value
		}

		c.Debug("use some specified cache fields", log.Arg{Key: "cached_map", Value: cachedMap})
	}

	reqHeader := setRequestHeaders(headers)

	return c.request(MethodGet, URL, nil, cachedMap, reqHeader, ctx, isChained)
}

// Get is used to send GET requests
func (c *Crawler) Get(URL string) error {
	return c.GetWithCtx(URL, nil)
}

// GetWithCtx is used to send GET requests with a context
func (c *Crawler) GetWithCtx(URL string, ctx pctx.Context) error {
	return c.get(URL, nil, ctx, false, c.cacheFields...)
}

// post performs a form-encoded POST request. cacheFields may reference
// either query parameters or request-body fields.
func (c *Crawler) post(URL string, requestData, headers map[string]string, ctx pctx.Context, isChained bool, cacheFields ...CacheField) error {
	var cachedMap map[string]string
	if len(cacheFields) > 0 {
		cachedMap = make(map[string]string)

		// The URL is parsed lazily, only if some field actually needs the
		// query parameters.
		var queryParams url.Values
		for _, field := range cacheFields {
			var (
				err        error
				key, value string
			)

			switch field.code {
			case queryParam:
				if queryParams == nil {
					u, err := url.Parse(URL)
					if err != nil {
						c.FatalOrPanic(err)
					}

					queryParams = u.Query()
				}

				key, value, err = addQueryParamCacheField(queryParams, field)
			case requestBodyParam:
				if val, ok := requestData[field.Field]; ok {
					key, value = field.String(), val
				} else {
					// Report the available keys so the misconfigured cache
					// field is easy to spot.
					keys := make([]string, 0, len(requestData))
					for k := range requestData {
						keys = append(keys, k)
					}

					err = fmt.Errorf("there is no such field [%s] in the request body: %v", field.Field, keys)
				}
			default:
				err = ErrInvalidCacheTypeCode
			}

			if err != nil {
				c.FatalOrPanic(err)
			}

			cachedMap[key] = value
		}

		c.Debug("use some specified cache fields", log.Arg{Key: "cached_map", Value: cachedMap})
	}

	if len(headers) == 0 {
		headers = make(map[string]string)
	}
	if _, ok := headers["Content-Type"]; !ok {
		// use default `Content-Type`
758 | headers["Content-Type"] = "application/x-www-form-urlencoded" 759 | } 760 | 761 | reqHeader := setRequestHeaders(headers) 762 | 763 | return c.request(MethodPost, URL, createBody(requestData), cachedMap, reqHeader, ctx, isChained) 764 | } 765 | 766 | // Post is used to send POST requests 767 | func (c *Crawler) Post(URL string, requestData map[string]string, ctx pctx.Context) error { 768 | return c.post(URL, requestData, nil, ctx, false, c.cacheFields...) 769 | } 770 | 771 | func (c *Crawler) createJSONBody(requestData map[string]any) []byte { 772 | if requestData == nil { 773 | return nil 774 | } 775 | body, err := json.Marshal(requestData) 776 | if err != nil { 777 | c.FatalOrPanic(err) 778 | } 779 | return body 780 | } 781 | 782 | func (c *Crawler) postJSON(URL string, requestData map[string]any, headers map[string]string, ctx pctx.Context, isChained bool, cacheFields ...CacheField) error { 783 | body := c.createJSONBody(requestData) 784 | 785 | var cachedMap map[string]string 786 | if len(cacheFields) > 0 { 787 | cachedMap = make(map[string]string) 788 | bodyJson := json.ParseBytesToJSON(body) 789 | 790 | var queryParams url.Values 791 | 792 | for _, field := range cacheFields { 793 | var ( 794 | err error 795 | key, value string 796 | ) 797 | 798 | switch field.code { 799 | case queryParam: 800 | if queryParams == nil { 801 | u, err := url.Parse(URL) 802 | if err != nil { 803 | c.FatalOrPanic(err) 804 | } 805 | 806 | queryParams = u.Query() 807 | } 808 | 809 | key, value, err = addQueryParamCacheField(queryParams, field) 810 | case requestBodyParam: 811 | if !bodyJson.Get(field.Field).Exists() { 812 | m := bodyJson.Map() 813 | var keys = make([]string, 0, len(m)) 814 | for k := range m { 815 | keys = append(keys, k) 816 | } 817 | err = fmt.Errorf("there is no such field [%s] in the request body: %v", field, keys) 818 | } else { 819 | key, value = field.String(), bodyJson.Get(field.Field).String() 820 | } 821 | default: 822 | err = ErrInvalidCacheTypeCode 
823 | } 824 | 825 | if err != nil { 826 | c.FatalOrPanic(err) 827 | } 828 | 829 | cachedMap[key] = value 830 | } 831 | 832 | c.Debug("use some specified cache fields", log.Arg{Key: "cached_map", Value: cachedMap}) 833 | } 834 | 835 | if len(headers) == 0 { 836 | headers = make(map[string]string) 837 | } 838 | headers["Content-Type"] = "application/json" 839 | 840 | reqHeader := setRequestHeaders(headers) 841 | 842 | return c.request(MethodPost, URL, body, cachedMap, reqHeader, ctx, isChained) 843 | } 844 | 845 | // PostJSON is used to send POST requests whose content-type is json 846 | func (c *Crawler) PostJSON(URL string, requestData map[string]any, ctx pctx.Context) error { 847 | return c.postJSON(URL, requestData, nil, ctx, false, c.cacheFields...) 848 | } 849 | 850 | func (c *Crawler) postMultipart(URL string, form *MultipartForm, headers map[string]string, ctx pctx.Context, isChained bool, cacheFields ...CacheField) error { 851 | var cachedMap map[string]string 852 | if len(cacheFields) > 0 { 853 | cachedMap = make(map[string]string) 854 | 855 | var queryParams url.Values 856 | 857 | for _, field := range cacheFields { 858 | var ( 859 | err error 860 | key, value string 861 | ) 862 | 863 | switch field.code { 864 | case queryParam: 865 | if queryParams == nil { 866 | u, err := url.Parse(URL) 867 | if err != nil { 868 | c.FatalOrPanic(err) 869 | } 870 | 871 | queryParams = u.Query() 872 | } 873 | 874 | key, value, err = addQueryParamCacheField(queryParams, field) 875 | case requestBodyParam: 876 | if val, ok := form.bodyMap[field.Field]; ok { 877 | key, value = field.String(), val 878 | } else { 879 | var keys = make([]string, 0, len(form.bodyMap)) 880 | for k := range form.bodyMap { 881 | keys = append(keys, k) 882 | } 883 | err = fmt.Errorf("there is no such field [%s] in the request body: %v", field, keys) 884 | } 885 | default: 886 | err = ErrInvalidCacheTypeCode 887 | } 888 | 889 | if err != nil { 890 | c.FatalOrPanic(err) 891 | } 892 | 893 | 
cachedMap[key] = value 894 | } 895 | 896 | c.Debug("use some specified cache fields", log.Arg{Key: "cached_map", Value: cachedMap}) 897 | } 898 | 899 | if len(headers) == 0 { 900 | headers = make(map[string]string) 901 | } 902 | headers["Content-Type"] = form.FormDataContentType() 903 | 904 | reqHeader := setRequestHeaders(headers) 905 | 906 | return c.request(MethodPost, URL, form.Bytes(), cachedMap, reqHeader, ctx, isChained) 907 | } 908 | 909 | // PostMultipart is used to send POST requests whose content-type is `multipart/form-data` 910 | func (c *Crawler) PostMultipart(URL string, form *MultipartForm, ctx pctx.Context) error { 911 | return c.postMultipart(URL, form, nil, ctx, false, c.cacheFields...) 912 | } 913 | 914 | // PostRaw is used to send POST requests whose content-type is not in [json, `application/x-www-form-urlencoded`, `multipart/form-data`] 915 | func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error { 916 | cachedMap := map[string]string{ 917 | "cache": string(body), 918 | } 919 | return c.request(MethodPost, URL, body, cachedMap, nil, ctx, false) 920 | } 921 | 922 | /************************* Public methods ****************************/ 923 | 924 | // ClearCache will clear all cache 925 | func (c *Crawler) ClearCache() error { 926 | if c.cache == nil { 927 | c.Error(ErrNoCache) 928 | return ErrNoCache 929 | } 930 | if c.log != nil { 931 | c.Warning("clear all cache") 932 | } 933 | return c.cache.Clear() 934 | } 935 | 936 | func (c Crawler) ProxyInUse() string { 937 | c.lock.RLock() 938 | defer c.lock.RUnlock() 939 | 940 | if strings.Contains(c.proxyInUse, "//") { 941 | return strings.Split(c.proxyInUse, "//")[1] 942 | } 943 | return c.proxyInUse 944 | } 945 | 946 | func (c *Crawler) ConcurrencyState() bool { 947 | return c.goPool != nil 948 | } 949 | 950 | /************************* 公共注册方法 ****************************/ 951 | 952 | // BeforeRequest used to process requests, such as 953 | // setting headers, passing context, 
etc. 954 | func (c *Crawler) BeforeRequest(f HandleRequest) { 955 | c.lock.Lock() 956 | if c.requestHandler == nil { 957 | // 一个 ccrawler 不应该有太多处理请求的方法,这里设置为 5 个, 958 | // 当不够时自动扩容 959 | c.requestHandler = make([]HandleRequest, 0, 5) 960 | } 961 | c.requestHandler = append(c.requestHandler, f) 962 | c.lock.Unlock() 963 | } 964 | 965 | // ParseHTML can parse html to find the data you need, 966 | // and process the data 967 | func (c *Crawler) ParseHTML(selector string, f HandleHTML) { 968 | c.lock.Lock() 969 | if c.htmlHandler == nil { 970 | // 一个 ccrawler 不应该有太多处理 html 的方法,这里设置为 5 个, 971 | // 当不够时自动扩容 972 | c.htmlHandler = make([]*HTMLParser, 0, 5) 973 | } 974 | c.htmlHandler = append(c.htmlHandler, &HTMLParser{selector, f}) 975 | c.lock.Unlock() 976 | } 977 | 978 | // ParseJSON can parse json to find the data you need, 979 | // and process the data. 980 | // 981 | // If you set `strict` to true, responses that do not contain 982 | // `application/json` in the content-type of the response header will 983 | // not be processed. 984 | // 985 | // It is recommended to do full processing of the json response in one 986 | // call to `ParseJSON` instead of multiple calls to `ParseJSON`. 
987 | func (c *Crawler) ParseJSON(strict bool, f HandleJSON) { 988 | c.lock.Lock() 989 | if c.jsonHandler == nil { 990 | c.jsonHandler = make([]*JSONParser, 0, 1) 991 | } 992 | c.jsonHandler = append(c.jsonHandler, &JSONParser{strict, f}) 993 | c.lock.Unlock() 994 | } 995 | 996 | // AfterResponse is used to process the response, this 997 | // method should be used for the response body in non-html format 998 | func (c *Crawler) AfterResponse(f HandleResponse) { 999 | c.lock.Lock() 1000 | if c.responseHandler == nil { 1001 | // 一个 ccrawler 不应该有太多处理响应的方法,这里设置为 5 个, 1002 | // 当不够时自动扩容 1003 | c.responseHandler = make([]HandleResponse, 0, 5) 1004 | } 1005 | c.responseHandler = append(c.responseHandler, f) 1006 | c.lock.Unlock() 1007 | } 1008 | 1009 | // ProxyPoolAmount returns the number of proxies in 1010 | // the proxy pool 1011 | func (c Crawler) ProxyPoolAmount() int { 1012 | return len(c.proxyURLPool) 1013 | } 1014 | 1015 | // Wait waits for the end of all concurrent tasks 1016 | func (c *Crawler) Wait() { 1017 | c.wg.Wait() 1018 | c.goPool.Close() 1019 | } 1020 | 1021 | func (c *Crawler) SetProxyInvalidCondition(condition ProxyInvalidCondition) { 1022 | c.proxyInvalidCondition = condition 1023 | } 1024 | 1025 | func (c *Crawler) AddProxy(newProxy string) { 1026 | c.lock.Lock() 1027 | 1028 | c.proxyURLPool = append(c.proxyURLPool, newProxy) 1029 | 1030 | c.lock.Unlock() 1031 | } 1032 | 1033 | func (c *Crawler) AddCookie(key, val string) { 1034 | c.lock.Lock() 1035 | 1036 | c.cookies[key] = val 1037 | 1038 | c.lock.Unlock() 1039 | } 1040 | 1041 | // SetConcurrency 使用并发,参数为要创建的协程池数量 1042 | func (c *Crawler) SetConcurrency(count uint64, blockPanic bool) { 1043 | if c.goPool == nil { 1044 | p, err := NewPool(count) 1045 | if err != nil { 1046 | panic(err) 1047 | } 1048 | p.blockPanic = blockPanic 1049 | p.log = c.log 1050 | 1051 | c.goPool = p 1052 | c.wg = new(sync.WaitGroup) 1053 | } else { 1054 | c.FatalOrPanic(errors.New("`c.goPool` is not nil")) 1055 | } 1056 | } 

// SetRetry sets how many times a failed request may be retried, and the
// condition that marks a response as failed.
func (c *Crawler) SetRetry(count uint32, cond RetryCondition) {
	c.retryCount = count
	c.retryCondition = cond
}

// SetCache installs and initializes a cache backend. If cacheCondition
// is nil, only 2xx responses are cached.
//
// NOTE(review): the parameter name "cacheFileds" is a typo of
// "cacheFields"; renaming it would not affect callers.
func (c *Crawler) SetCache(cc Cache, compressed bool, cacheCondition CacheCondition, cacheFileds ...CacheField) {
	cc.Compressed(compressed)
	err := cc.Init()
	if err != nil {
		panic(err)
	}
	c.cache = cc
	if cacheCondition == nil {
		// Default: cache every successful (2xx) response.
		cacheCondition = func(r *Response) bool {
			return r.StatusCode/100 == 2
		}
	}
	c.cacheCondition = cacheCondition
	if len(cacheFileds) > 0 {
		c.cacheFields = cacheFileds
	} else {
		c.cacheFields = nil
	}
}

// Sometimes outgoing requests must not be cached; use this method to
// disable the cache of a specific Crawler instance.
//
// It is usually used to disable the cache of a `Clone()` instance.
func (c *Crawler) UnsetCache() {
	if c.cache != nil {
		c.cache = nil

		if c.cacheCondition != nil {
			c.cacheCondition = nil
		}

		if c.cacheFields != nil {
			c.cacheFields = nil
		}
	}
}

// Lock acquires the crawler's write lock.
// NOTE(review): value receiver copies the Crawler — this only works if
// c.lock is a pointer type; confirm against the struct definition.
func (c Crawler) Lock() {
	c.lock.Lock()
}

// Unlock releases the crawler's write lock.
func (c Crawler) Unlock() {
	c.lock.Unlock()
}

// RLock acquires the crawler's read lock.
func (c Crawler) RLock() {
	c.lock.RLock()
}

// RUnlock releases the crawler's read lock.
func (c Crawler) RUnlock() {
	c.lock.RUnlock()
}

/************************* Private registration methods ****************************/

// processRequestHandler runs every registered BeforeRequest handler.
func (c *Crawler) processRequestHandler(r *Request) {
	for _, f := range c.requestHandler {
		f(r)
	}
}

// processResponseHandler runs every registered AfterResponse handler,
// stopping early once the response has been marked invalid.
func (c *Crawler) processResponseHandler(r *Response) {
	for _, f := range c.responseHandler {
		if r.invalid {
			break
		}
		f(r)
	}
}

// processJSONHandler parses the response body once and feeds the result
// to every registered JSON handler; strict handlers are skipped when
// the response is not declared as json.
func (c *Crawler) processJSONHandler(r *Response) {
	if c.jsonHandler == nil {
		return
	}

	if len(c.jsonHandler) > 1 {
		if c.log != nil {
			c.Warning("it is recommended to do full processing of the json response in one call to `ParseJSON` instead of multiple calls to `ParseJSON`")
		}
	}

	// Parse once, share the result between handlers.
	result := json.ParseBytesToJSON(r.Body)
	for _, parser := range c.jsonHandler {
		if parser.strict {
			if !strings.Contains(strings.ToLower(r.ContentType()), "application/json") {
				if c.log != nil {
					c.Debug(
						`the "Content-Type" of the response header is not of the "json" type`,
						log.Arg{Key: "Content-Type", Value: r.ContentType()},
					)
				}
				continue
			}
		}
		parser.Handle(result, r)
	}
}

// processHTMLHandler parses the response body as a document and feeds
// every selector match to the matching registered HTML handler. Non-html
// responses are skipped silently (nil error).
func (c *Crawler) processHTMLHandler(r *Response) error {
	if len(c.htmlHandler) == 0 {
		return nil
	}

	if !strings.Contains(strings.ToLower(r.ContentType()), "html") {
		if c.log != nil {
			c.Debug(
				`the "Content-Type" of the response header is not of the "html" type`,
				log.Arg{Key: "Content-Type", Value: r.ContentType()},
			)
		}
		return nil
	}

	doc, err := html.ParseHTML(r.Body)
	if err != nil {
		if c.log != nil {
			c.log.Error(err)
		}
		return err
	}

	for _, parser := range c.htmlHandler {
		if r.invalid {
			break
		}

		// i numbers the matched nodes across the whole selection.
		i := 0
		doc.Find(parser.Selector).Each(func(_ int, s *goquery.Selection) {
			for _, n := range s.Nodes {
				parser.Handle(html.NewHTMLElementFromSelectionNode(s, n, i), r)
				i++
			}
		})
	}
	return nil
}

// removeInvalidProxy: an invalid proxy can only truly be removed when a
// proxy pool is in use and the current request's proxy came from it.
func (c *Crawler) removeInvalidProxy(proxyAddr string) error {
	c.lock.Lock()
	defer c.lock.Unlock()

	if c.ProxyPoolAmount() == 0 {
		return proxy.ProxyErr{
			Code: proxy.ErrEmptyProxyPoolCode,
			Msg:  "the current proxy pool is empty",
		}
	}

	if c.ProxyPoolAmount() == 1 &&
c.complementProxyPool != nil { 1213 | newProxyPool := c.complementProxyPool() 1214 | c.proxyURLPool = append(c.proxyURLPool, newProxyPool...) 1215 | c.log.Info( 1216 | "a new proxy pool has replaced to the old proxy pool", 1217 | log.Arg{Key: "new_proxy_pool", Value: newProxyPool}, 1218 | ) 1219 | } 1220 | 1221 | targetIndex := -1 1222 | for i, p := range c.proxyURLPool { 1223 | addr := strings.Split(p, "//")[1] 1224 | if addr == proxyAddr { 1225 | targetIndex = i 1226 | break 1227 | } 1228 | } 1229 | 1230 | if targetIndex >= 0 { 1231 | c.proxyURLPool = append( 1232 | c.proxyURLPool[:targetIndex], 1233 | c.proxyURLPool[targetIndex+1:]..., 1234 | ) 1235 | 1236 | if c.log != nil { 1237 | c.Debug( 1238 | "invalid proxy have been deleted from the proxy pool", 1239 | log.Arg{Key: "proxy", Value: proxyAddr}, 1240 | ) 1241 | } 1242 | 1243 | if len(c.proxyURLPool) == 0 { 1244 | return proxy.ProxyErr{ 1245 | Code: proxy.ErrEmptyProxyPoolCode, 1246 | Msg: "the current proxy pool is empty after removing a invalid proxy", 1247 | } 1248 | } 1249 | } else { 1250 | // 并发时可能也会存在找不到失效的代理的情况,这时不能返回 error 1251 | if c.goPool != nil { 1252 | return nil 1253 | } 1254 | 1255 | // 没有在代理池中找到失效代理,这个代理来路不明,一样报错 1256 | return &proxy.ProxyErr{ 1257 | Code: proxy.ErrUnkownProxyIPCode, 1258 | Msg: "proxy address is unkown", 1259 | Args: map[string]string{ 1260 | "unkown_proxy_addr": proxyAddr, 1261 | }, 1262 | } 1263 | } 1264 | 1265 | return nil 1266 | } 1267 | 1268 | func (c *Crawler) Debug(msg string, args ...log.Arg) { 1269 | if c.log != nil { 1270 | c.log.Debug(msg, args...) 1271 | } 1272 | } 1273 | 1274 | func (c *Crawler) Info(msg string, args ...log.Arg) { 1275 | if c.log != nil { 1276 | c.log.Info(msg, args...) 1277 | } 1278 | } 1279 | 1280 | func (c *Crawler) Warning(msg string, args ...log.Arg) { 1281 | if c.log != nil { 1282 | c.log.Warning(msg, args...) 
	}
}

// Error logs err at error level if a logger is configured.
func (c *Crawler) Error(err error, args ...log.Arg) {
	if c.log != nil {
		c.log.Error(err, args...)
	}
}

// Fatal logs err at fatal level if a logger is configured.
func (c *Crawler) Fatal(err error, args ...log.Arg) {
	if c.log != nil {
		c.log.Fatal(err, args...)
	}
}
--------------------------------------------------------------------------------
/craw_test.go:
--------------------------------------------------------------------------------
/*
 * @Author: thepoy
 * @Email: thepoy@163.com
 * @File Name: craw_test.go
 * @Created: 2021-07-23 09:22:36
 * @Modified: 2022-05-24 09:23:24
 */

package predator

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"net/http/httptest"
	"reflect"
	"strings"
	"testing"
	"time"

	"github.com/go-predator/log"
	"github.com/go-predator/predator/html"
	"github.com/go-predator/predator/proxy"

	. "github.com/smartystreets/goconvey/convey"
	"github.com/tidwall/gjson"
	"github.com/valyala/fasthttp"
)

// TestNewCrawler verifies that each functional option is actually
// applied to the crawler it configures.
func TestNewCrawler(t *testing.T) {
	Convey("测试设置 UA", t, func() {
		for _, ua := range []string{"foo", "bar"} {
			c := NewCrawler(WithUserAgent(ua))
			So(c.UserAgent, ShouldEqual, ua)
		}
	})
	Convey("测试设置 cookies", t, func() {
		cookie := map[string]string{"foo": "bar"}
		c := NewCrawler(WithCookies(cookie))
		So(c.cookies, ShouldEqual, cookie)
	})
	Convey("测试设置指定并发数量", t, func() {
		count := 10
		c := NewCrawler(WithConcurrency(uint64(count), false))
		So(c.goPool.GetCap(), ShouldEqual, count)
	})
	Convey("测试设置重试数量", t, func() {
		count := 5
		c := NewCrawler(WithRetry(uint32(count), func(r *Response) bool { return true }))
		So(c.retryCount, ShouldEqual, count)
	})

	Convey("测试设置代理池", t, func() {
		pp := make([]string, 0, 5)
		for i := 1; i <= 5; i++ {
			pp = append(pp,
fmt.Sprintf("http://localhost:%d000", i)) 59 | } 60 | c := NewCrawler(WithProxyPool(pp)) 61 | So(reflect.DeepEqual(c.proxyURLPool, pp), ShouldBeTrue) 62 | }) 63 | } 64 | 65 | var serverIndexResponse = []byte("hello world\n") 66 | 67 | func server() *httptest.Server { 68 | mux := http.NewServeMux() 69 | 70 | mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { 71 | w.WriteHeader(200) 72 | w.Write(serverIndexResponse) 73 | }) 74 | 75 | mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { 76 | if r.Method == "POST" { 77 | w.Header().Set("Content-Type", "text/html") 78 | w.Write([]byte(r.FormValue("name"))) 79 | } 80 | }) 81 | 82 | mux.HandleFunc("/set_cookie", func(w http.ResponseWriter, r *http.Request) { 83 | c := &http.Cookie{Name: "test", Value: "testv", HttpOnly: false} 84 | http.SetCookie(w, c) 85 | w.WriteHeader(200) 86 | w.Write([]byte("ok")) 87 | }) 88 | 89 | mux.HandleFunc("/check_cookie", func(w http.ResponseWriter, r *http.Request) { 90 | cs := r.Cookies() 91 | if len(cs) != 1 || r.Cookies()[0].Value != "testv" { 92 | w.WriteHeader(500) 93 | w.Write([]byte("nok")) 94 | return 95 | } 96 | w.WriteHeader(200) 97 | w.Write([]byte("ok")) 98 | }) 99 | 100 | mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) { 101 | w.Header().Set("Content-Type", "text/html") 102 | w.Write([]byte(` 103 | 104 |
105 |This is a 1
110 |This is a 2
111 |This is a 3
112 | 113 | 114 | `)) 115 | }) 116 | 117 | mux.HandleFunc("/redirect", func(w http.ResponseWriter, r *http.Request) { 118 | c := &http.Cookie{Name: "test", Value: "testv", HttpOnly: false} 119 | http.SetCookie(w, c) 120 | http.Redirect(w, r, "/html", http.StatusMovedPermanently) 121 | }) 122 | 123 | mux.HandleFunc("/json", func(w http.ResponseWriter, r *http.Request) { 124 | w.Header().Set("Content-Type", "application/json; charset=UTF-8") 125 | if r.Method != "POST" { 126 | w.WriteHeader(403) 127 | w.Write([]byte(`{"msg": "only allow access with post method"}`)) 128 | return 129 | } 130 | 131 | ct := r.Header.Get("Content-Type") 132 | if ct != "application/json" { 133 | w.WriteHeader(400) 134 | w.Write([]byte(`{"msg": "unkown content type"}`)) 135 | return 136 | } 137 | 138 | w.WriteHeader(200) 139 | w.Write([]byte(`{"msg": "ok"}`)) 140 | }) 141 | 142 | mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) { 143 | w.Header().Set("Content-Type", "application/octet-stream") 144 | ww := bufio.NewWriter(w) 145 | defer ww.Flush() 146 | for { 147 | // have to check error to detect client aborting download 148 | if _, err := ww.Write([]byte{0x41}); err != nil { 149 | return 150 | } 151 | } 152 | }) 153 | 154 | mux.HandleFunc("/post", func(w http.ResponseWriter, r *http.Request) { 155 | if r.Method == "POST" { 156 | w.Header().Set("Content-Type", "text/html") 157 | w.Write([]byte(r.FormValue("id"))) 158 | 159 | // 随机休眠几秒用于测试并发 160 | // rand.Seed(time.Now().UnixNano()) 161 | // time.Sleep(time.Duration(rand.Intn(5)) * time.Second) 162 | return 163 | } 164 | }) 165 | 166 | return httptest.NewServer(mux) 167 | } 168 | 169 | func TestRequest(t *testing.T) { 170 | ts := server() 171 | defer ts.Close() 172 | 173 | Convey("测试请求、响应之间的上下文传递和响应结果", t, func() { 174 | c := NewCrawler() 175 | 176 | c.BeforeRequest(func(r *Request) { 177 | r.Ctx.Put("k", "v") 178 | }) 179 | 180 | c.AfterResponse(func(r *Response) { 181 | v := r.Ctx.Get("k") 182 | So(v, 
			So(v, ShouldEqual, "v")
			So(bytes.Equal(serverIndexResponse, r.Body), ShouldBeTrue)
		})

		c.Get(ts.URL)
	})

	Convey("测试 POST", t, func() {
		requestData := map[string]string{
			"name":     "tom",
			"password": "123456",
		}

		c := NewCrawler()

		c.BeforeRequest(func(r *Request) {
			r.Ctx.Put("k", 2)
		})

		c.AfterResponse(func(r *Response) {
			v := r.Ctx.GetAny("k").(int)
			So(v, ShouldEqual, 2)
			So(string(r.Body), ShouldEqual, requestData["name"])
			So(string(r.Headers.Peek("Content-Type")), ShouldEqual, "text/html")

		})

		c.Post(ts.URL+"/login", requestData, nil)
	})

	// To run this example, update the cookie and auth_token yourself.
	Convey("测试 PostMultipart", t, func() {
		c := NewCrawler(
			WithCookies(map[string]string{
				"PHPSESSID": "7ijqglcno1cljiqs76t2vo5oh2",
			}))
		form := NewMultipartForm(
			"-------------------",
			randomBoundary,
		)

		var err error

		form.AppendString("type", "file")
		form.AppendString("action", "upload")
		form.AppendString("timestamp", "1627871450610")
		form.AppendString("auth_token", "f43cdc8a537eff5169dfddb946c2365d1f897b0c")
		form.AppendString("nsfw", "0")
		err = form.AppendFile("source", "/Users/thepoy/Pictures/Nginx.png")
		So(err, ShouldBeNil)

		c.AfterResponse(func(r *Response) {
			status := gjson.ParseBytes(r.Body).Get("status_code").Int()
			So(status, ShouldEqual, fasthttp.StatusOK)
		})

		err = c.PostMultipart("https://imgtu.com/json", form, nil)
		So(err, ShouldBeNil)
	})

}

// TestHTTPProxy exercises http proxies: a single valid proxy, an
// all-invalid pool (expected panic), invalid-proxy removal, and random
// selection from a pool of live proxies.
func TestHTTPProxy(t *testing.T) {
	ts := server()
	defer ts.Close()

	u := "https://api.bilibili.com/x/web-interface/zone?jsonp=jsonp"
	validIP := "http://123.73.209.237:46603"
	Convey("测试有效代理", t, func() {
		c := NewCrawler(
			WithUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"),
			WithProxy(validIP),
			WithLogger(nil),
		)

		c.AfterResponse(func(r *Response) {
			ip := gjson.ParseBytes(r.Body).Get("data.addr").String()

			So(ip, ShouldEqual, strings.Split(strings.Split(validIP, "//")[1], ":")[0])
		})

		c.Get(u)
	})

	Convey("测试代理池为空时 panic", t, func() {
		defer func() {
			if err := recover(); err != nil {
				So(err.(proxy.ProxyErr).Code, ShouldEqual, proxy.ErrEmptyProxyPoolCode)
			}
		}()
		ips := []string{
			"http://14.134.203.22:45104",
			"http://14.134.204.22:45105",
			"http://14.134.205.22:45106",
			"http://14.134.206.22:45107",
			"http://14.134.207.22:45108",
			"http://14.134.208.22:45109",
		}
		c := NewCrawler(WithProxyPool(ips), WithLogger(nil))

		c.Get(u)
	})

	Convey("测试删除代理池中某个或某些无效代理", t, func() {
		ips := []string{
			"http://14.134.204.22:45105",
			validIP,
			"http://14.134.205.22:45106",
			"http://14.134.206.22:45107",
			"http://27.29.155.141:45118",
			"http://14.134.208.22:45109",
		}
		c := NewCrawler(WithProxyPool(ips), WithLogger(nil))

		c.AfterResponse(func(r *Response) {
			ip := gjson.ParseBytes(r.Body).Get("data.addr").String()
			So(c.ProxyPoolAmount(), ShouldBeLessThanOrEqualTo, len(ips))
			So(ip, ShouldEqual, strings.Split(strings.Split(validIP, "//")[1], ":")[0])
		})

		err := c.Get(u)
		So(err, ShouldBeNil)
	})

	Convey("测试多个有效代理的随机选择", t, func() {
		count := 5
		u := "http://t.ipjldl.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&groupid=0&qty=%d&time=1&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson=&usertype=2"
		client := &fasthttp.Client{}
		body := make([]byte, 0)
		_, body, err := client.Get(body, fmt.Sprintf(u, count))
		if err != nil {
			panic(err)
		}

		ips := strings.Split(string(body), "\r\n")
		for i := 0; i < len(ips); i++ {
			ips[i] = "http://" + ips[i]
		}

		c := NewCrawler(WithProxyPool(ips), WithDefaultLogger())

		c.BeforeRequest(func(r *Request) {
			r.SetHeaders(map[string]string{
				// avoid keep-alive responses preventing the proxy from changing
				"Connection": "close",
			})
		})

		c.AfterResponse(func(r *Response) {
			ip := gjson.ParseBytes(r.Body).Get("data.addr").String()
			t.Log(ip)
		})

		ipu := "https://api.bilibili.com/x/web-interface/zone?jsonp=jsonp"
		for i := 0; i < count*2; i++ {
			err := c.Get(ipu)
			So(err, ShouldBeNil)
		}
	})
}

// TestSocks5Proxy checks that a socks5 proxy is honored end-to-end.
func TestSocks5Proxy(t *testing.T) {
	proxyIP := "socks5://222.37.211.49:46601"
	u := "https://api.bilibili.com/x/web-interface/zone?jsonp=jsonp"

	Convey("测试有效代理", t, func() {
		c := NewCrawler(
			WithProxy(proxyIP),
		)

		c.AfterResponse(func(r *Response) {
			t.Log(r)

			ip := gjson.ParseBytes(r.Body).Get("data.addr").String()

			So(ip, ShouldEqual, strings.Split(strings.Split(proxyIP, "//")[1], ":")[0])
		})

		err := c.Get(u)
		So(err, ShouldBeNil)
	})
}

// TestRetry checks that a response matching the retry condition is
// retried the configured number of times.
func TestRetry(t *testing.T) {
	ts := server()
	defer ts.Close()

	Convey("测试对失败响应发起重试", t, func() {
		cookie := map[string]string{"test": "ha"}
		c := NewCrawler(
			WithCookies(cookie),
			WithRetry(5, func(r *Response) bool {
				return r.StatusCode != 200
			}),
		)

		c.AfterResponse(func(r *Response) {
			So(r.Request.NumberOfRetries(), ShouldEqual, 5)
			So(r.StatusCode, ShouldNotEqual, 200)
		})

		c.Get(ts.URL + "/check_cookie")
	})
}

// TestCookies checks Set-Cookie passthrough and sending cookies with a
// request (both the matching and non-matching cases).
func TestCookies(t *testing.T) {
	ts := server()
	defer ts.Close()

	Convey("测试响应 set-cookie", t, func() {
		c := NewCrawler()

		c.AfterResponse(func(r *Response) {
			So(r.StatusCode, ShouldEqual, 200)
			So(string(r.Headers.Peek("Set-Cookie")), ShouldEqual, "test=testv")
		})

		c.Get(ts.URL + "/set_cookie")
	})

	Convey("测试使用 cookie 请求", t, func() {
		Convey("成功", func() {
			cookie := map[string]string{"test": "testv"}
			c := NewCrawler(WithCookies(cookie))

			c.AfterResponse(func(r *Response) {
				So(r.StatusCode, ShouldEqual, 200)
				So(r.String(), ShouldEqual, "ok")
			})

			c.Get(ts.URL + "/check_cookie")
		})
		Convey("失败", func() {
			cookie := map[string]string{"test": "ha"}
			c := NewCrawler(WithCookies(cookie))

			c.AfterResponse(func(r *Response) {
				So(r.StatusCode, ShouldEqual, 500)
				So(r.String(), ShouldEqual, "nok")
			})

			c.Get(ts.URL + "/check_cookie")
		})
	})
}

// TestJSON covers the /json endpoint: wrong method, wrong content-type,
// explicit content-type via BeforeRequest, and full JSON round trips
// (with and without cache fields).
func TestJSON(t *testing.T) {
	ts := server()
	defer ts.Close()

	type TestResponse struct {
		Msg string `json:"msg"`
	}

	Convey("测试请求方法是否正确", t, func() {
		Convey("错误", func() {
			c := NewCrawler()

			c.AfterResponse(func(r *Response) {
				So(r.StatusCode, ShouldEqual, 403)
				So(r.ContentType(), ShouldEqual, "application/json; charset=UTF-8")

				var j TestResponse
				json.Unmarshal(r.Body, &j)
				So(j.Msg, ShouldEqual, "only allow access with post method")
			})

			c.Get(ts.URL + "/json")
		})
		Convey("正确", func() {
			c := NewCrawler()

			c.AfterResponse(func(r *Response) {
				So(r.StatusCode, ShouldEqual, 400)
				So(r.ContentType(), ShouldEqual, "application/json; charset=UTF-8")

				var j TestResponse
				json.Unmarshal(r.Body, &j)
				So(j.Msg, ShouldEqual, "unkown content type")
			})

			c.Post(ts.URL+"/json", nil, nil)
		})
	})

	Convey("测试请求头 Content-Type", t, func() {
		c := NewCrawler()

		c.BeforeRequest(func(r *Request) {
			r.SetContentType("application/json")
		})

		c.AfterResponse(func(r *Response) {
			So(r.StatusCode, ShouldEqual, 200)
			So(r.ContentType(), ShouldEqual, "application/json; charset=UTF-8")

			var j TestResponse
			json.Unmarshal(r.Body, &j)
			So(j.Msg, ShouldEqual, "ok")
		})

		c.Post(ts.URL+"/json", nil, nil)
	})

	Convey("测试完整 JSON 请求和响应", t, func() {
		c := NewCrawler()

		c.AfterResponse(func(r *Response) {
			t.Log(r)
		})

		type User struct {
			Name string `json:"name"`
			Age  int    `json:"age"`
		}

		body := map[string]any{
			"time": 156546535,
			"cid":  "10_18772100220-1625540144276-302919",
			"args": []int{1, 2, 3, 4, 5},
			"dict": map[string]string{
				"mod": "1592215036_002", "extend1": "关注", "t": "1628346994", "eleTop": "778",
			},
			"user": User{"Tom", 13},
		}

		c.PostJSON("https://httpbin.org/post", body, nil)
	})

	Convey("测试带缓存的完整 JSON 请求和响应", t, func() {
		c := NewCrawler(
			WithCache(nil, false, nil, CacheField{requestBodyParam, "cid"}, CacheField{requestBodyParam, "user.name"}, CacheField{requestBodyParam, "user.age"}),
		)

		c.AfterResponse(func(r *Response) {
			t.Log(r.FromCache)
		})

		type User struct {
			Name string `json:"name"`
			Age  int    `json:"age"`
		}

		body := map[string]any{
			"time": 156546535,
			"cid":  "10_18772100220-1625540144276-302919",
			"args": []int{1, 2, 3, 4, 5},
			"dict": map[string]string{
				"mod": "1592215036_002", "extend1": "关注", "t": "1628346994", "eleTop": "778",
			},
			"user": User{"Tom", 13},
		}

		c.PostJSON("https://httpbin.org/post", body, nil)
	})
}

func TestJSONWithInvalidCacheField(t *testing.T) {
| c := NewCrawler( 541 | WithCache(nil, false, nil, CacheField{requestBodyParam, "id"}, CacheField{requestBodyParam, "user.name"}, CacheField{requestBodyParam, "user.age"}), 542 | WithLogger(nil), 543 | ) 544 | 545 | c.AfterResponse(func(r *Response) { 546 | t.Log(r.FromCache) 547 | }) 548 | 549 | type User struct { 550 | Name string `json:"name"` 551 | Age int `json:"age"` 552 | } 553 | 554 | body := map[string]any{ 555 | "time": 156546535, 556 | "cid": "10_18772100220-1625540144276-302919", 557 | "args": []int{1, 2, 3, 4, 5}, 558 | "dict": map[string]string{ 559 | "mod": "1592215036_002", "extend1": "关注", "t": "1628346994", "eleTop": "778", 560 | }, 561 | "user": User{"Tom", 13}, 562 | } 563 | 564 | c.PostJSON("https://httpbin.org/post", body, nil) 565 | } 566 | 567 | func TestParseHTML(t *testing.T) { 568 | ts := server() 569 | defer ts.Close() 570 | 571 | Convey("测试 HTML 解析", t, func() { 572 | crawl := NewCrawler() 573 | 574 | Convey("测试解析整体 HTML", func() { 575 | crawl.ParseHTML("body", func(he *html.HTMLElement, r *Response) { 576 | h, err := he.OuterHTML() 577 | So(err, ShouldBeNil) 578 | So(h, ShouldEqual, ` 579 |This is a 1
581 |This is a 2
582 |This is a 3
583 | 584 | 585 | `) 586 | }) 587 | }) 588 | 589 | Convey("测试解析内部 HTML", func() { 590 | crawl.ParseHTML("body", func(he *html.HTMLElement, r *Response) { 591 | h, err := he.InnerHTML() 592 | So(err, ShouldBeNil) 593 | So(h, ShouldEqual, ` 594 |This is a 1
596 |This is a 2
597 |This is a 3
598 | 599 | 600 | `) 601 | }) 602 | }) 603 | 604 | Convey("测试解析内部文本", func() { 605 | crawl.ParseHTML("title", func(he *html.HTMLElement, r *Response) { 606 | So(he.Text(), ShouldEqual, "Test Page") 607 | }) 608 | }) 609 | 610 | Convey("测试获取属性", func() { 611 | crawl.ParseHTML("p", func(he *html.HTMLElement, r *Response) { 612 | attr := he.Attr("class") 613 | So(attr, ShouldEqual, "description") 614 | }) 615 | }) 616 | 617 | Convey("测试查找子元素", func() { 618 | crawl.ParseHTML("body", func(he *html.HTMLElement, r *Response) { 619 | So(he.FirstChild("p").Attr("class"), ShouldEqual, "description") 620 | So(he.Child("p", 2).Text(), ShouldEqual, "This is a 2") 621 | So(he.ChildAttr("p", "class"), ShouldEqual, "description") 622 | So(len(he.ChildrenAttr("p", "class")), ShouldEqual, 3) 623 | }) 624 | }) 625 | 626 | crawl.Get(ts.URL + "/html") 627 | }) 628 | } 629 | 630 | func timeCost() func() { 631 | start := time.Now() 632 | return func() { 633 | tc := time.Since(start) 634 | fmt.Printf("time cost = %v\n", tc) 635 | } 636 | } 637 | 638 | func TestConcurrency(t *testing.T) { 639 | ts := server() 640 | defer ts.Close() 641 | 642 | Convey("测试并发和同步耗时", t, func() { 643 | Convey("并发", func() { 644 | start := time.Now() 645 | c := NewCrawler( 646 | WithConcurrency(30, false), 647 | ) 648 | 649 | for i := 0; i < 10; i++ { 650 | err := c.Post(ts.URL+"/post", map[string]string{ 651 | "id": fmt.Sprint(i + 1), 652 | }, nil) 653 | So(err, ShouldBeNil) 654 | } 655 | 656 | delta := time.Since(start) 657 | t.Log(delta) 658 | }) 659 | 660 | Convey("同步", func() { 661 | start := time.Now() 662 | c := NewCrawler() 663 | 664 | for i := 0; i < 10; i++ { 665 | err := c.Post(ts.URL+"/post", map[string]string{ 666 | "id": fmt.Sprint(i + 1), 667 | }, nil) 668 | So(err, ShouldBeNil) 669 | } 670 | 671 | delta := time.Since(start) 672 | t.Log(delta) 673 | }) 674 | }) 675 | } 676 | 677 | func TestLog(t *testing.T) { 678 | ts := server() 679 | defer ts.Close() 680 | 681 | Convey("默认在终端美化输出 INFO 等级\n", t, 
func() { 682 | c := NewCrawler( 683 | WithLogger(nil), 684 | ) 685 | 686 | c.Get(ts.URL) 687 | }) 688 | 689 | Convey("在终端美化输出 DEBUG 等级\n", t, func() { 690 | c := NewCrawler( 691 | WithLogger(log.NewLogger(log.DEBUG, log.ToConsole())), 692 | ) 693 | 694 | c.BeforeRequest(func(r *Request) { 695 | r.Ctx.Put("key", "value") 696 | }) 697 | 698 | c.Get(ts.URL) 699 | }) 700 | 701 | Convey("保存到文件\n", t, func() { 702 | c := NewCrawler( 703 | WithLogger(log.NewLogger(log.DEBUG, log.MustToFile("test.log", -1))), 704 | ) 705 | 706 | c.Get(ts.URL) 707 | }) 708 | 709 | Convey("既保存到文件,也输出到终端\n", t, func() { 710 | c := NewCrawler( 711 | WithLogger(log.NewLogger(log.DEBUG, log.MustToConsoleAndFile("test2.log", -1))), 712 | ) 713 | 714 | c.BeforeRequest(func(r *Request) { 715 | r.Ctx.Put("key", "value") 716 | }) 717 | 718 | c.Get(ts.URL) 719 | }) 720 | } 721 | 722 | func TestRedirect(t *testing.T) { 723 | ts := server() 724 | defer ts.Close() 725 | 726 | Convey("测试默认情况", t, func() { 727 | c := NewCrawler() 728 | 729 | c.AfterResponse(func(r *Response) { 730 | So(r.StatusCode, ShouldEqual, 301) 731 | }) 732 | 733 | c.Get(ts.URL + "/redirect") 734 | }) 735 | 736 | Convey("测试设置重定向次数的情况", t, func() { 737 | c := NewCrawler() 738 | 739 | c.BeforeRequest(func(r *Request) { 740 | r.AllowRedirect(1) 741 | }) 742 | 743 | c.AfterResponse(func(r *Response) { 744 | So(r.StatusCode, ShouldEqual, 200) 745 | }) 746 | 747 | c.Get(ts.URL + "/redirect") 748 | }) 749 | } 750 | 751 | func getRawCookie(c *Crawler, ts *httptest.Server) string { 752 | var rawCookie string 753 | 754 | c.AfterResponse(func(r *Response) { 755 | if r.StatusCode == 301 { 756 | rawCookie = string(r.Headers.Peek("Set-Cookie")) 757 | } 758 | }) 759 | 760 | c.Post(ts.URL+"/redirect", map[string]string{"username": "test", "password": "test"}, nil) 761 | return rawCookie 762 | } 763 | 764 | func TestClone(t *testing.T) { 765 | ts := server() 766 | defer ts.Close() 767 | 768 | Convey("测试克隆", t, func() { 769 | c := NewCrawler() 770 | 
771 | rawCookie := getRawCookie(c, ts) 772 | 773 | WithRawCookie(rawCookie)(c) 774 | WithConcurrency(10, false)(c) 775 | 776 | c.AfterResponse(func(r *Response) { 777 | fmt.Println(r.StatusCode) 778 | fmt.Println(r) 779 | So(r.StatusCode, ShouldEqual, 200) 780 | So(r.String(), ShouldEqual, "ok") 781 | }) 782 | 783 | c.Get(ts.URL + "/check_cookie") 784 | c.Wait() 785 | }) 786 | } 787 | -------------------------------------------------------------------------------- /errors.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: errors.go 5 | * @Created: 2022-02-17 15:30:54 6 | * @Modified: 2022-11-29 16:14:14 7 | */ 8 | 9 | package predator 10 | 11 | import "errors" 12 | 13 | var ( 14 | ErrRequestFailed = errors.New("request failed") 15 | ErrTimeout = errors.New("timeout, and it is recommended to try a new proxy if you are using a proxy pool") 16 | ErrInvalidCacheTypeCode = errors.New("invalid cache type code") 17 | ErrNotAllowedCacheFieldType = errors.New("only query parameters are allowed as cached fields in `GET` requests") 18 | ErrNoCache = errors.New("no cache configured") 19 | ErrInvalidResponseStatus = errors.New("if the http status code is `302`, there must be a valid `Location` field in the response header") 20 | ) 21 | -------------------------------------------------------------------------------- /example/multipart/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: main.go 5 | * @Created: 2021-07-31 11:50:11 6 | * @Modified: 2022-05-24 09:23:31 7 | */ 8 | 9 | package main 10 | 11 | import ( 12 | "fmt" 13 | "math/rand" 14 | "strings" 15 | 16 | "github.com/go-predator/log" 17 | "github.com/go-predator/predator" 18 | "github.com/go-predator/predator/context" 19 | "github.com/tidwall/gjson" 20 | ) 21 | 22 | // 自定义生成 boundary 的方法 23 | func 
randomBoundary() string { 24 | var s strings.Builder 25 | count := 29 26 | for i := 0; i < count; i++ { 27 | if i == 0 { 28 | s.WriteString(fmt.Sprint(rand.Intn(9) + 1)) 29 | } else { 30 | s.WriteString(fmt.Sprint(rand.Intn(10))) 31 | } 32 | } 33 | return s.String() 34 | } 35 | 36 | func main() { 37 | c := predator.NewCrawler( 38 | // 使用 cookie 39 | predator.WithCookies(map[string]string{ 40 | "PHPSESSID": "7ijqglcno1cljiqs76t2vo5oh2", 41 | }), 42 | // 使用日志 43 | predator.WithLogger(log.NewLogger(log.DEBUG, log.ToConsole())), 44 | predator.WithCache(nil, false, nil), 45 | ) 46 | 47 | // 创建 multipart/form-data 48 | form := predator.NewMultipartForm( 49 | // boundary 前的横线 50 | "-------------------", 51 | // 传入自定义生成 boundary 的方法 52 | randomBoundary, 53 | ) 54 | 55 | var err error 56 | 57 | // 向 form 中添加表单信息 58 | form.AppendString("type", "file") 59 | form.AppendString("action", "upload") 60 | form.AppendString("timestamp", "1627871450610") 61 | form.AppendString("auth_token", "f43cdc8a537eff5169dfddb946c2365d1f897b0c") 62 | form.AppendString("nsfw", "0") 63 | err = form.AppendFile("source", "/Users/thepoy/Pictures/Nginx.png") 64 | if err != nil { 65 | panic(err) 66 | } 67 | 68 | c.AfterResponse(func(r *predator.Response) { 69 | // 读取上下文 70 | val := r.Ctx.Get("foo") 71 | fmt.Println("value from context:", val) 72 | 73 | status := gjson.ParseBytes(r.Body).Get("status_code").Int() 74 | fmt.Println("status_code:", status) 75 | }) 76 | 77 | // 创建上下文,并传入一些键值对 78 | ctx, err := context.NewContext() 79 | if err != nil { 80 | panic(err) 81 | } 82 | ctx.Put("foo", "bar") 83 | 84 | // 发送 multipart/form-data POST 请求 85 | err = c.PostMultipart("https://imgtu.com/json", form, ctx) 86 | if err != nil { 87 | panic(err) 88 | } 89 | 90 | // 清除缓存 91 | c.ClearCache() 92 | } 93 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/go-predator/predator 
2 | 3 | go 1.19 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.8.0 7 | github.com/go-predator/log v0.0.0-20220523074050-01ad78a75b3f 8 | github.com/go-predator/tools v0.0.0-20220524022058-ce749e9bf77b 9 | github.com/json-iterator/go v1.1.12 10 | github.com/smartystreets/goconvey v1.7.2 11 | github.com/tidwall/gjson v1.14.3 12 | github.com/valyala/bytebufferpool v1.0.0 13 | github.com/valyala/fasthttp v1.40.0 14 | golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458 15 | ) 16 | 17 | require ( 18 | github.com/andybalholm/brotli v1.0.4 // indirect 19 | github.com/andybalholm/cascadia v1.3.1 // indirect 20 | github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 // indirect 21 | github.com/jtolds/gls v4.20.0+incompatible // indirect 22 | github.com/klauspost/compress v1.15.4 // indirect 23 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 // indirect 24 | github.com/modern-go/reflect2 v1.0.2 // indirect 25 | github.com/rs/zerolog v1.26.1 // indirect 26 | github.com/smartystreets/assertions v1.2.0 // indirect 27 | github.com/tidwall/match v1.1.1 // indirect 28 | github.com/tidwall/pretty v1.2.0 // indirect 29 | golang.org/x/text v0.3.7 // indirect 30 | ) 31 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= 2 | github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= 3 | github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY= 4 | github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= 5 | github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= 6 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= 7 | github.com/coreos/go-systemd/v22 v22.3.2/go.mod 
h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= 8 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 9 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 10 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 11 | github.com/go-predator/log v0.0.0-20220523074050-01ad78a75b3f h1:0Nt/e0By/ClBnnJq+jbGsUWNVIVSbfqYNCE9Z6p7TY0= 12 | github.com/go-predator/log v0.0.0-20220523074050-01ad78a75b3f/go.mod h1:TdZqX+mXzn9Xb+7QnjpCFZLYU3poUG64+Ct2+DnlRDU= 13 | github.com/go-predator/tools v0.0.0-20220524022058-ce749e9bf77b h1:MMUBfyosVuLCa4k0iDpq+3mOHIQAsBSZt/fJ/nN++js= 14 | github.com/go-predator/tools v0.0.0-20220524022058-ce749e9bf77b/go.mod h1:xG4JX2Eyw5NgKkSXVmKu6u90nPklN0M4fv7jLbzP3TY= 15 | github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= 16 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 17 | github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= 18 | github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= 19 | github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= 20 | github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= 21 | github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= 22 | github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= 23 | github.com/klauspost/compress v1.15.0/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= 24 | github.com/klauspost/compress v1.15.4 h1:1kn4/7MepF/CHmYub99/nNX8az0IJjfSOU/jbnTVfqQ= 25 | github.com/klauspost/compress v1.15.4/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= 26 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 
h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc= 27 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 28 | github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= 29 | github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 30 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 31 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 32 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 33 | github.com/rs/xid v1.3.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= 34 | github.com/rs/zerolog v1.26.1 h1:/ihwxqH+4z8UxyI70wM1z9yCvkWcfz/a3mj48k/Zngc= 35 | github.com/rs/zerolog v1.26.1/go.mod h1:/wSSJWX7lVrsOwlbyTRSOJvqRlc+WjWlfes+CiJ+tmc= 36 | github.com/smartystreets/assertions v1.2.0 h1:42S6lae5dvLc7BrLu/0ugRtcFVjoJNMC/N3yZFZkDFs= 37 | github.com/smartystreets/assertions v1.2.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= 38 | github.com/smartystreets/goconvey v1.7.2 h1:9RBaZCeXEQ3UselpuwUQHltGVXvdwm6cv1hgR6gDIPg= 39 | github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3Pg9vgXWeJpQFMM= 40 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 41 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 42 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 43 | github.com/tidwall/gjson v1.14.3 h1:9jvXn7olKEHU1S9vwoMGliaT8jq1vJ7IH/n9zD9Dnlw= 44 | github.com/tidwall/gjson v1.14.3/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= 45 | github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= 46 | github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= 47 | github.com/tidwall/pretty v1.2.0 
h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= 48 | github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= 49 | github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= 50 | github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= 51 | github.com/valyala/fasthttp v1.40.0 h1:CRq/00MfruPGFLTQKY8b+8SfdK60TxNztjRMnH0t1Yc= 52 | github.com/valyala/fasthttp v1.40.0/go.mod h1:t/G+3rLek+CyY9bnIE+YlMRddxVAAGjhxndDB4i4C0I= 53 | github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc= 54 | github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= 55 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 56 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 57 | golang.org/x/crypto v0.0.0-20211215165025-cf75a172585e/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= 58 | golang.org/x/crypto v0.0.0-20220214200702-86341886e292/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= 59 | golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 60 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 61 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 62 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 63 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 64 | golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 65 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 66 | golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod 
h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 67 | golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= 68 | golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458 h1:MgJ6t2zo8v0tbmLCueaCbF1RM+TtB0rs3Lv8DGtOIpY= 69 | golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= 70 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 71 | golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 72 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 73 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 74 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 75 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 76 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 77 | golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 78 | golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 79 | golang.org/x/sys v0.0.0-20220227234510-4e6760a101f9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 80 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 81 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 82 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 83 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 84 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 85 | 
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= 86 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 87 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 88 | golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= 89 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 90 | golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo= 91 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 92 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 93 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 94 | -------------------------------------------------------------------------------- /html/element.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: element.go 5 | * @Created: 2021-07-27 20:35:31 6 | * @Modified: 2022-05-26 20:23:43 7 | */ 8 | 9 | package html 10 | 11 | import ( 12 | "errors" 13 | "strings" 14 | 15 | "github.com/PuerkitoBio/goquery" 16 | "github.com/go-predator/tools" 17 | "golang.org/x/net/html" 18 | ) 19 | 20 | var ( 21 | ErrNilElement = errors.New("the current element is nil") 22 | ) 23 | 24 | // HTMLElement is the representation of a HTML tag. 25 | type HTMLElement struct { 26 | // Name is the name of the tag 27 | Name string 28 | 29 | // DOM is the goquery parsed DOM object of the page. 
DOM is relative 30 | // to the current HTMLElement 31 | DOM *goquery.Selection 32 | 33 | // Index stores the position of the current element within 34 | // all the elements matched by an OnHTML callback 35 | Index int 36 | 37 | Node *html.Node 38 | } 39 | 40 | func (he HTMLElement) String() string { 41 | var s strings.Builder 42 | 43 | s.WriteByte('<') 44 | s.WriteString(he.Name) 45 | 46 | for _, attr := range he.Node.Attr { 47 | s.WriteByte(' ') 48 | s.WriteString(attr.Key) 49 | 50 | if len(attr.Val) > 0 { 51 | s.WriteByte('=') 52 | s.WriteByte('"') 53 | s.WriteString(attr.Val) 54 | s.WriteByte('"') 55 | } 56 | } 57 | 58 | s.WriteByte('>') 59 | 60 | if fc := he.Node.FirstChild; fc != nil { 61 | if fc.Type == html.TextNode { 62 | text := strings.TrimSpace(fc.Data) 63 | runes := []rune(text) 64 | if len(runes) == 0 { 65 | s.WriteString("...") 66 | } else if len(runes) > 10 { 67 | s.WriteString(string(runes[:10])) 68 | s.WriteString("...") 69 | } else { 70 | s.WriteString(text) 71 | } 72 | } else { 73 | s.WriteString("...") 74 | } 75 | } 76 | 77 | s.WriteString("") 78 | s.WriteString(he.Name) 79 | s.WriteByte('>') 80 | 81 | return s.String() 82 | } 83 | 84 | // NewHTMLElementFromSelectionNode creates a HTMLElement from a goquery.Selection Node. 85 | func NewHTMLElementFromSelectionNode(s *goquery.Selection, n *html.Node, index int) *HTMLElement { 86 | return &HTMLElement{ 87 | Name: n.Data, 88 | DOM: s, 89 | Index: index, 90 | Node: n, 91 | } 92 | } 93 | 94 | // Attr returns the selected attribute of a HTMLElement or empty string 95 | // if no attribute found 96 | func (he *HTMLElement) Attr(key string) string { 97 | for _, attr := range he.Node.Attr { 98 | if attr.Key == key { 99 | return attr.Val 100 | } 101 | } 102 | return "" 103 | } 104 | 105 | // OuterHtml returns the outer HTML rendering of the first item in 106 | // the selection - that is, the HTML including the first element's 107 | // tag and attributes. 
108 | func (he *HTMLElement) OuterHTML() (string, error) { 109 | return goquery.OuterHtml(he.DOM) 110 | } 111 | 112 | // InnerHTML gets the HTML contents of the first element in the set of matched 113 | // elements. It includes text and comment nodes. 114 | func (he *HTMLElement) InnerHTML() (string, error) { 115 | return he.DOM.Html() 116 | } 117 | 118 | // Text gets the combined text contents of each element in the set of matched 119 | // elements, including their descendants. 120 | func (he *HTMLElement) Text() string { 121 | return he.DOM.Text() 122 | } 123 | 124 | // Texts Gets all child text elements in the current element and returns a []string 125 | func (he *HTMLElement) Texts() []string { 126 | if he == nil { 127 | return nil 128 | } 129 | 130 | var texts []string 131 | 132 | // Slightly optimized vs calling Each: no single selection object created 133 | var f func(*html.Node) 134 | f = func(n *html.Node) { 135 | if n.Type == html.TextNode { 136 | text := tools.Strip(n.Data) 137 | if text != "" { 138 | // 当使用 Selection.ReplaceWithHtml 将原节点替换成了一个 TextNode 时 139 | // 很可能会出现多个文本节点连接,这在现实 DOM 结构是不可能存在的,但 ReplaceWithHtml 140 | // 方法的不完备却可能出现此情况,故只能在此判断前面的节点是否为文本节点,如果是则将两个文本 141 | // 节点的文本合并。 142 | if n.PrevSibling != nil && n.PrevSibling.Type == html.TextNode { 143 | if len(texts) > 0 { 144 | texts[len(texts)-1] += text 145 | } else { 146 | texts = append(texts, text) 147 | } 148 | } else { 149 | texts = append(texts, text) 150 | } 151 | } 152 | } 153 | if n.FirstChild != nil { 154 | for c := n.FirstChild; c != nil; c = c.NextSibling { 155 | f(c) 156 | } 157 | } 158 | } 159 | for _, n := range he.DOM.Nodes { 160 | f(n) 161 | } 162 | 163 | return texts 164 | } 165 | 166 | // ChildText returns the concatenated and stripped text content of the matching 167 | // elements. 
168 | func (he *HTMLElement) ChildText(selector string) string { 169 | return strings.TrimSpace(he.DOM.Find(selector).Text()) 170 | } 171 | 172 | // ChildrenText returns the stripped text content of all the matching 173 | // elements. 174 | func (he *HTMLElement) ChildrenText(selector string) []string { 175 | var res []string 176 | he.Each(selector, func(_ int, h *HTMLElement) bool { 177 | text := h.Text() 178 | if text == "" { 179 | return false 180 | } 181 | 182 | res = append(res, strings.TrimSpace(text)) 183 | return false 184 | }) 185 | return res 186 | } 187 | 188 | // ChildAttr returns the stripped text content of the first matching 189 | // element's attribute. 190 | func (he *HTMLElement) ChildAttr(selector, attrName string) string { 191 | if attr, ok := he.DOM.Find(selector).Attr(attrName); ok { 192 | return strings.TrimSpace(attr) 193 | } 194 | return "" 195 | } 196 | 197 | // ChildrenAttr returns the stripped text content of all the matching 198 | // element's attributes. 199 | func (he *HTMLElement) ChildrenAttr(selector, attrName string) []string { 200 | var res []string 201 | he.Each(selector, func(_ int, h *HTMLElement) bool { 202 | if attr := h.Attr(attrName); attr != "" { 203 | res = append(res, strings.TrimSpace(attr)) 204 | } 205 | return false 206 | }) 207 | return res 208 | } 209 | 210 | // Each iterates over the elements matched by the first argument 211 | // and calls the callback function on every HTMLElement match. 212 | // 213 | // The for loop will break when the `callback` returns `true`. 
214 | func (he *HTMLElement) Each(selector string, callback func(int, *HTMLElement) bool) { 215 | i := 0 216 | if he == nil { 217 | panic(ErrNilElement) 218 | } 219 | 220 | found := he.DOM.Find(selector) 221 | if found == nil { 222 | return 223 | } 224 | 225 | found.Each(func(_ int, s *goquery.Selection) { 226 | for _, n := range s.Nodes { 227 | if callback(i, NewHTMLElementFromSelectionNode(s, n, i)) { 228 | break 229 | } 230 | i++ 231 | } 232 | }) 233 | } 234 | 235 | // Child returns the numth matched child element. 236 | // num starts at 1, not at 0. 237 | func (he *HTMLElement) Child(selector string, num int) *HTMLElement { 238 | if he == nil { 239 | panic(ErrNilElement) 240 | } 241 | 242 | s := he.DOM.Find(selector) 243 | nodes := s.Nodes 244 | if len(nodes) == 0 { 245 | return nil 246 | } 247 | 248 | if num == -1 { 249 | num = s.Length() 250 | } 251 | 252 | return NewHTMLElementFromSelectionNode( 253 | goquery.NewDocumentFromNode(nodes[num-1]).Selection, 254 | nodes[num-1], 255 | num-1, 256 | ) 257 | } 258 | 259 | // FirstChild returns the first child element that matches the selector. 260 | func (he *HTMLElement) FirstChild(selector string) *HTMLElement { 261 | return he.Child(selector, 1) 262 | } 263 | 264 | // LastChild returns the last child element that matches the selector. 265 | func (he *HTMLElement) LastChild(selector string) *HTMLElement { 266 | return he.Child(selector, -1) 267 | } 268 | 269 | // Parent returns the direct parent element. 270 | func (he *HTMLElement) Parent() *HTMLElement { 271 | // If the current element is tag, return nil 272 | if he.Name == "html" { 273 | return nil 274 | } 275 | 276 | s := he.DOM.Parent() 277 | return NewHTMLElementFromSelectionNode(s, s.Nodes[0], 0) 278 | } 279 | 280 | // Parents returns all parent elements. 
281 | func (he *HTMLElement) Parents() []*HTMLElement { 282 | parents := make([]*HTMLElement, 0) 283 | 284 | for { 285 | var parent = he.Parent() 286 | if parent == nil { 287 | break 288 | } 289 | parents = append(parents, parent) 290 | he = parent 291 | } 292 | 293 | return parents 294 | } 295 | 296 | // FindChildByText returns the first child element matching the target text. 297 | func (he *HTMLElement) FindChildByText(selector, text string) *HTMLElement { 298 | var target *HTMLElement 299 | he.Each(selector, func(i int, h *HTMLElement) bool { 300 | if h.Node.FirstChild != nil && h.Node.FirstChild.Type == html.TextNode && h.Node.FirstChild.Data == text { 301 | target = h 302 | return true 303 | } 304 | return false 305 | }) 306 | return target 307 | } 308 | 309 | // FindChildByStripedText returns the first child element matching the stripped text. 310 | func (he *HTMLElement) FindChildByStripedText(selector, text string) *HTMLElement { 311 | var target *HTMLElement 312 | he.Each(selector, func(i int, h *HTMLElement) bool { 313 | if h.Node.FirstChild != nil && h.Node.FirstChild.Type == html.TextNode && tools.Strip(h.Node.FirstChild.Data) == text { 314 | target = h 315 | return true 316 | } 317 | return false 318 | }) 319 | return target 320 | } 321 | 322 | // Children returns all child elements matching the selector 323 | func (he *HTMLElement) Children(selector string) []*HTMLElement { 324 | children := make([]*HTMLElement, 0, 3) 325 | he.Each(selector, func(i int, h *HTMLElement) bool { 326 | children = append(children, h) 327 | return false 328 | }) 329 | return children 330 | } 331 | 332 | // FindChildrenByText returns all the child elements matching the target text. 
333 | func (he *HTMLElement) FindChildrenByText(selector, text string) []*HTMLElement { 334 | targets := make([]*HTMLElement, 0, 3) 335 | he.Each(selector, func(i int, h *HTMLElement) bool { 336 | if h.Node.FirstChild != nil && h.Node.FirstChild.Type == html.TextNode && h.Node.FirstChild.Data == text { 337 | targets = append(targets, h) 338 | } 339 | return false 340 | }) 341 | return targets 342 | } 343 | 344 | // FindChildrenByStripedText returns all the child elements matching the stripped text. 345 | func (he *HTMLElement) FindChildrenByStripedText(selector, text string) []*HTMLElement { 346 | targets := make([]*HTMLElement, 0, 3) 347 | he.Each(selector, func(i int, h *HTMLElement) bool { 348 | if h.Node.FirstChild != nil && h.Node.FirstChild.Type == html.TextNode && tools.Strip(h.Node.FirstChild.Data) == text { 349 | targets = append(targets, h) 350 | } 351 | return false 352 | }) 353 | return targets 354 | } 355 | -------------------------------------------------------------------------------- /html/html_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: thepoy 3 | * @Email: thepoy@163.com 4 | * @File Name: html_test.go 5 | * @Created: 2021-10-10 14:59:49 6 | * @Modified: 2021-10-12 09:44:28 7 | */ 8 | 9 | package html 10 | 11 | import ( 12 | "testing" 13 | 14 | . "github.com/smartystreets/goconvey/convey" 15 | ) 16 | 17 | var body = []byte(`