├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── assets
└── proxy-pool.png
├── config
└── config.go
├── configs
└── default.yml
├── controller
├── asset.go
├── common.go
└── proxy.go
├── crawler
├── crawler.go
├── crawler_test.go
├── ip66.go
├── ip66_test.go
├── kuai.go
├── kuai_test.go
├── proxylist.go
├── proxylist_test.go
├── xici.go
└── xici_test.go
├── doc
├── config.md
└── crawler.md
├── go.mod
├── go.sum
├── log
└── log.go
├── main.go
├── router
└── router.go
├── runner.conf
├── script
└── entrypoint.sh
├── service
└── proxy.go
└── web
├── .gitignore
├── package.json
├── public
├── favicon.ico
├── index.html
├── manifest.json
└── robots.txt
├── src
├── App.js
├── App.sass
├── App.test.js
├── index.js
├── index.sass
├── request-interceptors.js
├── serviceWorker.js
└── setupTests.js
└── yarn.lock
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 |
17 | tmp
18 | proxypool
19 |
20 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
21 |
22 | # dependencies
23 | node_modules
24 | .pnp
25 | .pnp.js
26 |
27 | # testing
28 | coverage
29 |
30 | # production
31 | build
32 |
33 | # misc
34 | .DS_Store
35 | .env.local
36 | .env.development.local
37 | .env.test.local
38 | .env.production.local
39 |
40 | npm-debug.log*
41 | yarn-debug.log*
42 | yarn-error.log*
43 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | sudo: required
3 |
4 | go:
5 | - "1.14"
6 |
7 | services:
8 | - docker
9 |
10 | script:
11 | - make test
12 |
13 | after_success:
14 | - docker build -t proxypool .
15 | - docker tag proxypool $DOCKER_USERNAME/proxypool:latest
16 | - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
17 | - docker push $DOCKER_USERNAME/proxypool:latest
18 | - docker images
19 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Stage 1: build the web front end.
FROM node:12-alpine as webbuilder
ADD . /proxy-pool
# Remove the installed dependencies after the build so they are not copied
# into the next stage (the directory created by yarn is `node_modules`).
RUN cd /proxy-pool/web \
  && yarn \
  && yarn build \
  && rm -rf node_modules
7 |
8 | FROM golang:1.14-alpine as builder
9 |
10 | COPY --from=webbuilder /proxy-pool /proxy-pool
11 |
12 | RUN apk update \
13 | && apk add git make \
14 | && go get -u github.com/gobuffalo/packr/v2/packr2 \
15 | && cd /proxy-pool \
16 | && make build
17 |
18 | FROM alpine
19 |
20 | EXPOSE 4000
21 |
22 | RUN addgroup -g 1000 go \
23 | && adduser -u 1000 -G go -s /bin/sh -D go \
24 | && apk add --no-cache ca-certificates
25 |
26 | COPY --from=builder /proxy-pool/proxypool /usr/local/bin/proxypool
27 | COPY --from=builder /proxy-pool/script/entrypoint.sh /entrypoint.sh
28 |
29 | USER go
30 |
31 | WORKDIR /home/go
32 |
33 | HEALTHCHECK --timeout=10s CMD [ "wget", "http://127.0.0.1:4000/ping", "-q", "-O", "-"]
34 |
35 | ENTRYPOINT ["/entrypoint.sh"]
36 | CMD ["proxypool"]
37 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
export GO111MODULE = on

# `build` is listed as phony so that a file or directory named `build`
# can never make the target look up to date.
.PHONY: default test test-cover dev build

# for dev
dev:
	fresh

# for test
test:
	go test -race -cover ./...

test-cover:
	go test -race -coverprofile=test.out ./... && go tool cover --html=test.out

# embed the boxed assets with packr2, then compile the binary
build:
	packr2
	go build -o proxypool
19 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # proxy-pool
2 |
3 | 虽然网上各类网站提供了一堆的免费代理地址,但是其可用性比较差,更新不及时,过多不可用的地址,以及延时较大等问题都干扰实际使用的效果。对于代理地址,期望是越多越好,但是对于代理质量有着更高的要求,宁缺勿滥,因此`proxy-pool`不再将抓取到的代理地址保存至数据库,而调整为定期从免费代理网站下抓取代理地址,使用该地址去测试其可用性(默认配置为访问baidu),测试可用则添加至可用代理地址列表中,如此循环一直抓取新的地址,一直校验。对于已校验可用的代理地址,也定期重新校验是否可用,默认校验间隔为30分钟。
4 |
5 | 注意:网页部分有增加百度统计,部署时可先删除。
6 |
7 |
8 |
9 |
10 |
11 | ## 常用配置
12 |
13 | 对于有特别需求,可以调整默认的配置,主要的配置如下:
14 |
15 | 抓取代理网站列表配置(暂时只实现了三个网站的抓取):
16 |
17 | ```yml
18 | crawler:
19 | - xici
20 | - ip66
21 | - kuai
22 | ```
23 |
24 | 由于各网站对访问IP频率限制的不同,可根据实际使用中调整各网站的抓取间隔,如设置`xici`的抓取延时为10分钟(如果不配置则为默认值2分钟):
25 |
26 | ```yml
27 | xici:
28 | interval: 10m
29 | ```
30 |
31 | 默认的检测方式是通过代理地址去访问`baidu`,可根据应用场景调整相应的配置:
32 |
33 | ```yml
34 | detect:
35 | # 检测时间(定时对现可用的代理地址重新检测)
36 | interval: 30m
37 | # 检测地址
38 | url: https://www.baidu.com/
39 | # 检测超时
40 | timeout: 3s
41 | # 最大次数
42 | maxTimes: 3
43 | ```
44 |
45 | ## 程序设计
46 |
47 | - [config](./doc/config.md)
48 | - [crawler](./doc/crawler.md)
49 |
--------------------------------------------------------------------------------
/assets/proxy-pool.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vicanso/proxy-pool/fa848accebac33a2fc102f173bdd7a9bdbd9a395/assets/proxy-pool.png
--------------------------------------------------------------------------------
/config/config.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 tree xie
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package config
16 |
17 | import (
18 | "bytes"
19 | "os"
20 | "time"
21 |
22 | "github.com/gobuffalo/packr/v2"
23 | "github.com/spf13/viper"
24 | )
25 |
var (
	// box is the packr box named "config" rooted at "../configs"; the
	// configuration files are looked up in it by init.
	box = packr.New("config", "../configs")
	// env is the value of the GO_ENV environment variable, read once when the
	// package is initialised.
	env = os.Getenv("GO_ENV")
)
30 |
// Names of the runtime environments.
const (
	// Dev is the development environment.
	Dev = "dev"
	// Test is the test environment.
	Test = "test"
	// Production is the production environment.
	Production = "production"
)
39 |
type (
	// Crawler is the configuration of a single crawler, as built by GetCrawlers.
	Crawler struct {
		// Name is the crawler name listed under the "crawler" key.
		Name string
		// Interval is read from "<name>.interval".
		Interval time.Duration
		// MaxPage is read from "<name>.maxPage".
		MaxPage int
	}
	// Detect is the proxy detection configuration, as built by GetDetect.
	Detect struct {
		// URL is read from "detect.url".
		URL string
		// Interval is read from "detect.interval".
		Interval time.Duration
		// Timeout is read from "detect.timeout".
		Timeout time.Duration
		// MaxTimes is read from "detect.maxTimes".
		MaxTimes int
	}
)
54 |
55 | func init() {
56 | configType := "yml"
57 | configExt := "." + configType
58 | data, err := box.Find("default" + configExt)
59 | if err != nil {
60 | panic(err)
61 | }
62 | viper.SetConfigType(configType)
63 | v := viper.New()
64 | v.SetConfigType(configType)
65 | // 读取默认配置中的所有配置
66 | err = v.ReadConfig(bytes.NewReader(data))
67 | if err != nil {
68 | panic(err)
69 | }
70 | configs := v.AllSettings()
71 | // 将default中的配置全部以默认配置写入
72 | for k, v := range configs {
73 | viper.SetDefault(k, v)
74 | }
75 |
76 | // 根据当前运行环境配置读取
77 | // 可根据不同的环境仅调整与default不一致的相关配置
78 | if env != "" {
79 | envConfigFile := env + configExt
80 | data, err = box.Find(envConfigFile)
81 | if err != nil {
82 | panic(err)
83 | }
84 | // 读取当前运行环境对应的配置
85 | err = viper.ReadConfig(bytes.NewReader(data))
86 | if err != nil {
87 | panic(err)
88 | }
89 | }
90 | }
91 |
92 | // GetCrawlers get crawlers config
93 | func GetCrawlers() []*Crawler {
94 | crawlers := make([]*Crawler, 0)
95 | data := viper.GetStringSlice("crawler")
96 | for _, name := range data {
97 | interval := viper.GetDuration(name + ".interval")
98 | maxPage := viper.GetInt(name + ".maxPage")
99 | // 如果未配置抓取间隔时间,则设置为2分钟
100 | if interval == 0 {
101 | interval = 2 * time.Minute
102 | }
103 | crawlers = append(crawlers, &Crawler{
104 | Name: name,
105 | Interval: interval,
106 | MaxPage: maxPage,
107 | })
108 | }
109 | return crawlers
110 | }
111 |
112 | // GetDetect get detect config
113 | func GetDetect() *Detect {
114 | prefix := "detect."
115 | conf := &Detect{
116 | Timeout: viper.GetDuration(prefix + "timeout"),
117 | URL: viper.GetString(prefix + "url"),
118 | Interval: viper.GetDuration(prefix + "interval"),
119 | MaxTimes: viper.GetInt(prefix + "maxTimes"),
120 | }
121 | if conf.Timeout == 0 {
122 | conf.Timeout = 3 * time.Second
123 | }
124 | if conf.Interval == 0 {
125 | conf.Interval = 30 * time.Minute
126 | }
127 | if conf.URL == "" {
128 | conf.URL = "https://www.baidu.com/"
129 | }
130 | if conf.MaxTimes <= 0 {
131 | conf.MaxTimes = 3
132 | }
133 | return conf
134 | }
135 |
136 | // GetListenAddr get listen address
137 | func GetListenAddr() string {
138 | addr := viper.GetString("listen")
139 | if addr == "" {
140 | return ":4000"
141 | }
142 | return addr
143 | }
144 |
--------------------------------------------------------------------------------
/configs/default.yml:
--------------------------------------------------------------------------------
1 | # 监听地址
2 | listen: :4000
3 | # 抓取的代理网站列表
4 | crawler:
5 | - xici
6 | - ip66
7 | - kuai
8 | # 按照需要可配置不同的代理网站的抓取频率
9 | xici:
10 | interval: 10m
11 | maxPage: 100
12 | ip66:
13 | maxPage: 200
14 | kuai:
15 | maxPage: 200
16 | # 检测代理是否可用的配置
17 | detect:
18 | # 检测时间(定时对现可用的代理地址重新检测)
19 | interval: 30m
20 | # 检测地址
21 | url: https://www.baidu.com/
22 | # 检测超时
23 | timeout: 3s
24 | # 最大次数
25 | maxTimes: 3
--------------------------------------------------------------------------------
/controller/asset.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 tree xie
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package controller
16 |
17 | import (
18 | "bytes"
19 | "io"
20 | "os"
21 |
22 | "github.com/gobuffalo/packr/v2"
23 | "github.com/vicanso/elton"
24 | "github.com/vicanso/elton/middleware"
25 | "github.com/vicanso/proxy-pool/router"
26 | )
27 |
type (
	// assetCtrl groups the handlers that serve the boxed web assets.
	assetCtrl struct {
	}
	// staticFile exposes the files of a packr box through the Exists, Get,
	// Stat and NewReader methods; it is passed to middleware.NewStaticServe.
	staticFile struct {
		box *packr.Box
	}
)
36 |
var (
	// box is the packr box named "asset" rooted at "../web/build".
	box = packr.New("asset", "../web/build")
)
40 |
41 | func (sf *staticFile) Exists(file string) bool {
42 | return sf.box.Has(file)
43 | }
44 | func (sf *staticFile) Get(file string) ([]byte, error) {
45 | return sf.box.Find(file)
46 | }
47 | func (sf *staticFile) Stat(file string) os.FileInfo {
48 | return nil
49 | }
50 | func (sf *staticFile) NewReader(file string) (io.Reader, error) {
51 | buf, err := sf.Get(file)
52 | if err != nil {
53 | return nil, err
54 | }
55 | return bytes.NewReader(buf), nil
56 | }
57 |
58 | func init() {
59 | g := router.NewGroup("")
60 | ctrl := assetCtrl{}
61 | g.GET("/", ctrl.index)
62 | g.GET("/favicon.ico", ctrl.favIcon)
63 |
64 | sf := &staticFile{
65 | box: box,
66 | }
67 | g.GET("/static/*", middleware.NewStaticServe(sf, middleware.StaticServeConfig{
68 | Path: "/static",
69 | // 客户端缓存一年
70 | MaxAge: 365 * 24 * 3600,
71 | // 缓存服务器缓存一个小时
72 | SMaxAge: 60 * 60,
73 | DenyQueryString: true,
74 | DisableLastModified: true,
75 | }))
76 | }
77 |
78 | func sendFile(c *elton.Context, file string) (err error) {
79 | buf, err := box.Find(file)
80 | if err != nil {
81 | return
82 | }
83 | // 根据文件后续设置类型
84 | c.SetContentTypeByExt(file)
85 | c.BodyBuffer = bytes.NewBuffer(buf)
86 | return
87 | }
88 |
89 | func (ctrl assetCtrl) index(c *elton.Context) (err error) {
90 | c.CacheMaxAge("10s")
91 | return sendFile(c, "index.html")
92 | }
93 |
94 | func (ctrl assetCtrl) favIcon(c *elton.Context) (err error) {
95 | c.SetHeader(elton.HeaderAcceptEncoding, "public, max-age=3600, s-maxage=600")
96 | return sendFile(c, "favicon.ico")
97 | }
98 |
--------------------------------------------------------------------------------
/controller/common.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 tree xie
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package controller
16 |
17 | import (
18 | "bytes"
19 |
20 | "github.com/vicanso/elton"
21 | "github.com/vicanso/proxy-pool/router"
22 | )
23 |
type (
	// commonCtrl groups the common handlers (currently only ping).
	commonCtrl struct{}
)
27 |
28 | func init() {
29 | ctrl := commonCtrl{}
30 | g := router.NewGroup("")
31 |
32 | g.GET("/ping", ctrl.ping)
33 |
34 | }
35 |
36 | func (commonCtrl) ping(c *elton.Context) (err error) {
37 | c.BodyBuffer = bytes.NewBufferString("pong")
38 | return
39 | }
40 |
--------------------------------------------------------------------------------
/controller/proxy.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 tree xie
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package controller
16 |
17 | import (
18 | "strconv"
19 |
20 | "github.com/vicanso/elton"
21 | "github.com/vicanso/proxy-pool/router"
22 | "github.com/vicanso/proxy-pool/service"
23 | )
24 |
type (
	// proxyCtrl groups the handlers registered under "/proxies".
	proxyCtrl struct{}
)
28 |
29 | func init() {
30 | ctrl := proxyCtrl{}
31 | g := router.NewGroup("/proxies")
32 |
33 | g.GET("", ctrl.list)
34 | g.GET("/one", ctrl.findOne)
35 | }
36 |
37 | // list get all available proxy
38 | func (proxyCtrl) list(c *elton.Context) (err error) {
39 | c.CacheMaxAge("1m")
40 | // 直接返回所有可用的proxy,暂不考虑分页等处理
41 | c.Body = map[string]interface{}{
42 | "proxies": service.GetAvailableProxyList(),
43 | }
44 | return
45 | }
46 |
47 | // findOne get one available proxy
48 | func (proxyCtrl) findOne(c *elton.Context) (err error) {
49 | category := c.QueryParam("category")
50 | speed := -1
51 | sp := c.QueryParam("speed")
52 | if sp != "" {
53 | v, e := strconv.Atoi(sp)
54 | if e == nil {
55 | speed = v
56 | }
57 | }
58 | p := service.GetAvailableProxy(category, speed)
59 | if p == nil {
60 | c.NoContent()
61 | return
62 | }
63 | c.Body = p
64 | return
65 | }
66 |
--------------------------------------------------------------------------------
/crawler/crawler.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 tree xie
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package crawler
16 |
17 | import (
18 | "bytes"
19 | "fmt"
20 | "net"
21 | "net/http"
22 | "net/url"
23 | "sync"
24 | "sync/atomic"
25 | "time"
26 |
27 | "github.com/PuerkitoBio/goquery"
28 | "github.com/vicanso/go-axios"
29 | "github.com/vicanso/proxy-pool/config"
30 | "github.com/vicanso/proxy-pool/log"
31 | "go.uber.org/zap"
32 | )
33 |
// Status values of a proxy crawler; Stop stores StatusStop in the crawler's
// status field.
const (
	// StatusRunning is the running status (value 0).
	StatusRunning = iota
	// StatusStop is the stopped status (value 1).
	StatusStop
)

// Detect status values. They start at 1, so the zero value of a status field
// equals neither of them.
const (
	detectRunning = iota + 1
	detectStop
)

const (
	// defaultUserAgent is a desktop Chrome user-agent string.
	defaultUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
	// NOTE(review): the identifier is misspelled (double "t"); it is not
	// renamed here because other files of the package may reference it.
	defaulttProxyTimeout = 10 * time.Second
)
48 |
var (
	// speedDevides holds the response-time thresholds used by analyze to
	// classify a proxy: the speed is the index of the first threshold the
	// response time is below, or len(speedDevides) when it is below none.
	speedDevides = []time.Duration{750 * time.Millisecond, 1500 * time.Millisecond}

	// logger is the default logger of the log package.
	logger = log.Default()
	// detectConfig is the detect configuration, read once at package
	// initialisation.
	detectConfig = config.GetDetect()
)
55 |
type (
	// Crawler keeps the newly fetched proxies and the available proxies, and
	// the detect status of each of the two lists.
	Crawler struct {
		sync.Mutex
		HTTPDetectURL  string
		HTTPSDetectURL string
		// newProxyList receives the proxies passed to addNewProxy.
		newProxyList       ProxyList
		avaliableProxyList ProxyList
		// The detect status fields are accessed with sync/atomic.
		newProxyDetectStatus       int32
		availableProxyDetectStatus int32
	}
	// baseProxyCrawler is the base proxy crawler; it carries the state and
	// helpers (OnFetch, Stop, fetchPage, LimitMaxPage) defined on it below.
	// nolint
	baseProxyCrawler struct {
		// interval is the pause between two fetches. Sites limit the request
		// rate differently, so it should not be set too short.
		interval time.Duration
		// ins is the axios HTTP instance used to fetch pages.
		ins *axios.Instance
		// fetchListener is the callback for fetched proxies, set by OnFetch.
		fetchListener FetchListener
		// currentPage is the current page number.
		currentPage int
		// maxPage is the last page number; 0 is treated by fetchPage as
		// "not set".
		maxPage int
		// status is the crawler status (running or stopped), accessed with
		// sync/atomic.
		status int32
		// limitMaxPage is the configured upper limit of pages, set by
		// LimitMaxPage.
		limitMaxPage int
	}
	// FetchListener is the callback type for a fetched proxy.
	FetchListener func(*Proxy)
	// ProxyCrawler is the interface of a proxy crawler.
	ProxyCrawler interface {
		// OnFetch sets the fetch listener.
		OnFetch(FetchListener)
		// Start starts the crawler.
		Start()
		// Stop stops the crawler.
		Stop()
	}
)
97 |
// OnFetch sets the fetch listener, replacing any listener set before.
func (bp *baseProxyCrawler) OnFetch(fn FetchListener) {
	bp.fetchListener = fn
}
102 |
// Stop marks the crawler as stopped by atomically storing StatusStop in its
// status field.
func (bp *baseProxyCrawler) Stop() {
	atomic.StoreInt32(&bp.status, StatusStop)
}
107 |
108 | // fetchPage fetch html content of the current page
109 | func (bp *baseProxyCrawler) fetchPage(name, urlTemplate string) (doc *goquery.Document, err error) {
110 | ins := bp.ins
111 | // 至最后一页则重置页码
112 | if bp.maxPage != 0 && bp.currentPage == bp.maxPage {
113 | bp.currentPage = 0
114 | bp.maxPage = 0
115 | }
116 | bp.currentPage++
117 | resp, err := ins.Get(fmt.Sprintf(urlTemplate, bp.currentPage))
118 | // 对于抓取失败,则直接退出
119 | if err != nil ||
120 | resp.Status != http.StatusOK ||
121 | len(resp.Data) == 0 {
122 | logger.Error("get proxy list fail",
123 | zap.String("name", name),
124 | zap.Int("page", bp.currentPage),
125 | zap.Error(err),
126 | )
127 | return
128 | }
129 | logger.Info("get proxy list success",
130 | zap.String("name", name),
131 | zap.Int("page", bp.currentPage),
132 | )
133 | return goquery.NewDocumentFromReader(bytes.NewReader(resp.Data))
134 | }
135 |
// LimitMaxPage sets the upper limit of pages (the limitMaxPage field).
func (bp *baseProxyCrawler) LimitMaxPage(value int) {
	bp.limitMaxPage = value
}
140 |
141 | // NewProxyClient create a new http client with proxy
142 | func NewProxyClient(p *Proxy) *http.Client {
143 | proxyURL, _ := url.Parse(fmt.Sprintf("http://%s:%s", p.IP, p.Port))
144 | if proxyURL == nil {
145 | return nil
146 | }
147 | return &http.Client{
148 | Transport: &http.Transport{
149 | Proxy: func(_ *http.Request) (*url.URL, error) {
150 | return proxyURL, nil
151 | },
152 | DialContext: (&net.Dialer{
153 | Timeout: 30 * time.Second,
154 | KeepAlive: 30 * time.Second,
155 | DualStack: true,
156 | }).DialContext,
157 | ForceAttemptHTTP2: true,
158 | MaxIdleConns: 100,
159 | IdleConnTimeout: 10 * time.Second,
160 | TLSHandshakeTimeout: 10 * time.Second,
161 | ExpectContinueTimeout: 1 * time.Second,
162 | },
163 | }
164 | }
165 |
166 | // analyze check the proxy is available and speed
167 | func (c *Crawler) analyze(p *Proxy) (available bool) {
168 | httpClient := NewProxyClient(p)
169 | if httpClient == nil {
170 | return false
171 | }
172 | // 多次检测,只要一次成功则认为成功
173 | for i := 0; i < detectConfig.MaxTimes; i++ {
174 | ins := axios.NewInstance(&axios.InstanceConfig{
175 | Timeout: detectConfig.Timeout,
176 | Client: httpClient,
177 | })
178 | startedAt := time.Now()
179 | resp, err := ins.Get(detectConfig.URL)
180 | if err != nil {
181 | continue
182 | }
183 | if resp.Status >= http.StatusOK && resp.Status < http.StatusBadRequest {
184 | d := time.Since(startedAt)
185 | atomic.StoreInt32(&p.Speed, int32(len(speedDevides)))
186 | // 将当前proxy划分对应的分段
187 | for index, item := range speedDevides {
188 | if d < item {
189 | atomic.StoreInt32(&p.Speed, int32(index))
190 | break
191 | }
192 | }
193 | available = true
194 | break
195 | }
196 | }
197 | return
198 | }
199 |
200 | // addNewProxy add proxy to new proxy list
201 | func (c *Crawler) addNewProxy(p *Proxy) {
202 | c.newProxyList.Add(p)
203 | }
204 |
205 | // detectProxyList detect proxy list
206 | func (c *Crawler) detectProxyList(list []*Proxy) (availableList []*Proxy, unavailableList []*Proxy) {
207 | availableList = make([]*Proxy, 0)
208 | unavailableList = make([]*Proxy, 0)
209 | w := sync.WaitGroup{}
210 | // 控制最多检测proxy的数量
211 | chans := make(chan bool, 5)
212 | for _, item := range list {
213 | w.Add(1)
214 | go func(p *Proxy) {
215 | chans <- true
216 | avaliable := c.analyze(p)
217 | atomic.StoreInt64(&p.DetectedAt, time.Now().Unix())
218 | if avaliable {
219 | availableList = append(availableList, p)
220 | } else {
221 | unavailableList = append(unavailableList, p)
222 | }
223 | <-chans
224 | w.Done()
225 | }(item)
226 | }
227 | w.Wait()
228 | return
229 | }
230 |
231 | // detectNewProxy detect the new proxy is avaliable
232 | func (c *Crawler) detectNewProxy() {
233 | old := atomic.SwapInt32(&c.newProxyDetectStatus, detectRunning)
234 | // 如果已经在运行中,则直接退出
235 | if old == detectRunning {
236 | return
237 | }
238 | proxyList := c.newProxyList.Reset()
239 | availableList, _ := c.detectProxyList(proxyList)
240 | c.avaliableProxyList.Add(availableList...)
241 |
242 | atomic.StoreInt32(&c.newProxyDetectStatus, detectStop)
243 | // 等待1分钟后,重新运行detect new proxy
244 | time.Sleep(time.Minute)
245 | c.detectNewProxy()
246 | }
247 |
248 | // RedetectAvailableProxy redetect available proxy
249 | func (c *Crawler) RedetectAvailableProxy() {
250 | old := atomic.SwapInt32(&c.availableProxyDetectStatus, detectRunning)
251 | // 如果已经在运行中,则直接退出
252 | if old == detectRunning {
253 | return
254 | }
255 | proxyList := c.avaliableProxyList.List()
256 | availableList, unavailableList := c.detectProxyList(proxyList)
257 |
258 | // 如果成功,则重置失败次数
259 | for _, p := range availableList {
260 | atomic.StoreInt32(&p.Fails, 0)
261 | }
262 | // 如果失败,则失败次数+1
263 | failProxyList := make([]*Proxy, 0)
264 | for _, p := range unavailableList {
265 | count := atomic.AddInt32(&p.Fails, 1)
266 | if count >= 3 {
267 | failProxyList = append(failProxyList, p)
268 | }
269 | }
270 | // 对于三次检测失败的代理则删除
271 | c.avaliableProxyList.Remove(failProxyList...)
272 |
273 | atomic.StoreInt32(&c.availableProxyDetectStatus, detectStop)
274 | }
275 |
276 | // Start start fetch proxy
277 | func (c *Crawler) Start(crawlers ...ProxyCrawler) {
278 | for _, item := range crawlers {
279 | item.OnFetch(c.addNewProxy)
280 | go item.Start()
281 | }
282 | // 首次延时10秒后则执行detect new proxy
283 | go func() {
284 | time.Sleep(10 * time.Second)
285 | c.detectNewProxy()
286 | }()
287 | }
288 |
289 | // GetAvailableProxyList get available proxy list
290 | func (c *Crawler) GetAvailableProxyList() []*Proxy {
291 | return c.avaliableProxyList.List()
292 | }
293 |
294 | // GetAvailableProxy get available proxy
295 | func (c *Crawler) GetAvailableProxy(category string, speed int32) *Proxy {
296 | return c.avaliableProxyList.FindOne(category, speed)
297 | }
298 |
--------------------------------------------------------------------------------
/crawler/crawler_test.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 tree xie
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package crawler
16 |
17 | import (
18 | "testing"
19 |
20 | "github.com/vicanso/go-axios"
21 |
22 | "github.com/stretchr/testify/assert"
23 | )
24 |
25 | func TestBaseProxyCrawler(t *testing.T) {
26 | assert := assert.New(t)
27 | bp := new(baseProxyCrawler)
28 |
29 | assert.Nil(bp.fetchListener)
30 | bp.OnFetch(func(_ *Proxy) {})
31 | assert.NotNil(bp.fetchListener)
32 |
33 | ins := axios.NewInstance(nil)
34 | bp.currentPage = 10
35 | bp.maxPage = 10
36 | bp.ins = ins
37 | // empty data
38 | done := ins.Mock(&axios.Response{
39 | Status: 200,
40 | Data: []byte(""),
41 | })
42 | doc, err := bp.fetchPage("", "%d")
43 | assert.Nil(err)
44 | assert.Nil(doc)
45 | assert.Equal(0, bp.maxPage)
46 | assert.Equal(1, bp.currentPage)
47 | done()
48 | }
49 |
50 |
--------------------------------------------------------------------------------
/crawler/ip66.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 tree xie
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package crawler
16 |
import (
	"net/http"
	"strconv"
	"sync/atomic"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/vicanso/go-axios"
)
25 |
26 | type (
27 | // ip66Proxy ip66 proxy
28 | ip66Proxy struct {
29 | baseProxyCrawler
30 | }
31 | )
32 |
// ProxyIP66 is the string "ip66".
// NOTE(review): presumably the name under which the ip66 crawler is selected —
// confirm against the callers.
const ProxyIP66 = "ip66"
36 |
37 | // NewIP66Proxy create a new ip66 proxy crawler
38 | func NewIP66Proxy(interval time.Duration) *ip66Proxy {
39 | header := make(http.Header)
40 | header.Set("User-Agent", defaultUserAgent)
41 | ins := axios.NewInstance(&axios.InstanceConfig{
42 | BaseURL: "http://www.66ip.cn",
43 | Headers: header,
44 | Timeout: defaulttProxyTimeout,
45 | })
46 | ip66 := new(ip66Proxy)
47 | ip66.interval = interval
48 | ip66.ins = ins
49 | return ip66
50 | }
51 |
52 | // Start start the crawler
53 | func (ip66 *ip66Proxy) Start() {
54 | ip66.status = StatusRunning
55 | for {
56 | if ip66.status != StatusRunning {
57 | return
58 | }
59 | _ = ip66.fetch()
60 | time.Sleep(ip66.interval)
61 | }
62 | }
63 |
64 | func (ip66 *ip66Proxy) fetch() (err error) {
65 | doc, err := ip66.fetchPage("ip66", "/%d")
66 | if err != nil || doc == nil {
67 | return
68 | }
69 | // 仅在首次获取
70 | if ip66.maxPage == 0 {
71 | pages := doc.Find("#PageList a")
72 | value := pages.Eq(pages.Length() - 2).Text()
73 | max, _ := strconv.Atoi(value)
74 | if max == 0 {
75 | max = 1
76 | }
77 | if ip66.limitMaxPage != 0 && max > ip66.limitMaxPage {
78 | max = ip66.limitMaxPage
79 | }
80 | ip66.maxPage = max
81 | }
82 | doc.Find("#main table tr").Each(func(i int, s *goquery.Selection) {
83 | // 表头忽略
84 | if i == 0 {
85 | return
86 | }
87 | tdList := s.Find("td")
88 | ip := tdList.Eq(0).Text()
89 | port := tdList.Eq(1).Text()
90 | if ip == "" || port == "" || ip66.fetchListener == nil {
91 | return
92 | }
93 | fn := ip66.fetchListener
94 | fn(&Proxy{
95 | IP: ip,
96 | Port: port,
97 | Anonymous: true,
98 | Category: "http",
99 | })
100 | })
101 | return
102 | }
103 |
--------------------------------------------------------------------------------
/crawler/ip66_test.go:
--------------------------------------------------------------------------------
1 | // Copyright 2019 tree xie
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package crawler
16 |
17 | import (
18 | "testing"
19 | "time"
20 |
21 | "github.com/stretchr/testify/assert"
22 | "github.com/vicanso/go-axios"
23 | )
24 |
25 | func TestIP66Proxy(t *testing.T) {
26 | assert := assert.New(t)
27 | ip66 := NewIP66Proxy(time.Minute)
28 | html := `
29 |
30 |