├── README.md
├── pom.xml
└── src
├── main
├── java
│ └── com
│ │ └── chenerzhu
│ │ └── crawler
│ │ └── proxy
│ │ └── pool
│ │ ├── ProxyPoolApplication.java
│ │ ├── common
│ │ ├── HttpMethod.java
│ │ └── RedisKey.java
│ │ ├── config
│ │ ├── RedisConfig.java
│ │ ├── SpringConfig.java
│ │ └── WebConfig.java
│ │ ├── context
│ │ └── SpringContextHolder.java
│ │ ├── controller
│ │ ├── BaseController.java
│ │ └── ProxyIpController.java
│ │ ├── entity
│ │ ├── ProxyIp.java
│ │ ├── Result.java
│ │ └── WebPage.java
│ │ ├── exception
│ │ ├── ProxyPoolException.java
│ │ └── ProxyPoolExceptionHandler.java
│ │ ├── job
│ │ ├── crawler
│ │ │ ├── AbstractCrawler.java
│ │ │ ├── CrawlerJob.java
│ │ │ ├── Data5uCrawlerJob.java
│ │ │ ├── FreeProxyListCrawlerJob.java
│ │ │ ├── GatherproxyCrawlerJob.java
│ │ │ ├── ICrawler.java
│ │ │ ├── MyProxyCrawlerJob.java
│ │ │ ├── Proxy4FreeCrawlerJob.java
│ │ │ ├── ProxynovaCrawlerJob.java
│ │ │ ├── SpysOneCrawlerJob.java
│ │ │ └── XicidailiCrawlerJob.java
│ │ ├── execute
│ │ │ ├── ISchedulerJobExecutor.java
│ │ │ └── impl
│ │ │ │ └── SchedulerJobExecutor.java
│ │ └── scheduler
│ │ │ ├── AbstractSchedulerJob.java
│ │ │ ├── SchedulerJob.java
│ │ │ ├── SyncDbSchedulerJob.java
│ │ │ ├── SyncRedisSchedulerJob.java
│ │ │ └── ValidateRedisSchedulerJob.java
│ │ ├── listener
│ │ ├── JobContextListener.java
│ │ └── SpringContextListener.java
│ │ ├── repository
│ │ └── IProxyIpRepository.java
│ │ ├── service
│ │ ├── IProxyIpRedisService.java
│ │ ├── IProxyIpService.java
│ │ └── impl
│ │ │ ├── ProxyIpRedisServiceImpl.java
│ │ │ └── ProxyIpServiceImpl.java
│ │ ├── thread
│ │ └── ThreadFactory.java
│ │ └── util
│ │ ├── HttpClientUtils.java
│ │ ├── HttpsUtils.java
│ │ └── ProxyUtils.java
└── resources
│ ├── application.properties
│ ├── static
│ ├── css
│ │ ├── bootstrap-table.css
│ │ └── bootstrap.min.css
│ ├── img
│ │ ├── crawler.PNG
│ │ ├── glyphicons-halflings-white.png
│ │ ├── glyphicons-halflings.png
│ │ └── home.PNG
│ └── js
│ │ ├── bootstrap-table.js
│ │ ├── bootstrap.min.js
│ │ └── jquery-3.1.1.min.js
│ └── templates
│ ├── error
│ └── 500.html
│ ├── index.html
│ └── test.html
└── test
└── java
└── com
└── chenerzhu
└── crawler
└── proxy
└── pool
└── ProxyPoolApplicationTest.java
/README.md:
--------------------------------------------------------------------------------
1 | # proxy-pool 代理IP
2 | ### 背景
3 | 前段时间,写java爬虫来爬网易云音乐的评论。不料,爬了一段时间后ip被封禁了。由此,想到了使用ip代理,但是找了很多的ip代理网站,很少有可以用的代理ip。于是,抱着边学习的心态,自己开发了一个代理ip池。
4 |
5 | ### 相关技术及环境
6 | **技术:** SpringBoot,SpringMVC, Hibernate, MySQL, Redis , Maven, Lombok, BootStrap-table,多线程并发
7 | **环境:** JDK1.8 , IDEA
8 |
9 | ### 实现功能
10 | 通过ip代理池,提供高可用的代理ip,可用率达到95%以上。
11 | - 通过接口获取代理ip
12 | 通过访问接口,如:http://127.0.0.1:8080/proxyIp 返回代理ip的json格式
13 | ```json
14 | {
15 | "code":200,
16 | "data":[
17 | {
18 | "available":true,
19 | "ip":"1.10.186.214",
20 | "lastValidateTime":"2018-09-25 20:31:52",
21 | "location":"THThailand",
22 | "port":57677,
23 | "requestTime":0,
24 | "responseTime":0,
25 | "type":"https",
26 | "useTime":3671
27 | }
28 | ],
29 | "message":"success"
30 | }
31 | ```
32 |
33 | - 通过页面获取代理ip
34 | 通过访问url,如:http://127.0.0.1:8080 返回代理ip列表页面。
35 |
36 |
37 | - 提供代理ip测试接口及页面
38 | 通过访问url, 如:http://127.0.0.1:8080/test (get)测试代理ip的可用性;通过接口 http://127.0.0.1:8080/test (post data: {"ip": "127.0.0.1","port":8080}) 测试代理ip的可用性。
39 |
40 | ### 设计思路
41 | #### 模块划分
42 | - 爬虫模块:爬取代理ip网站的代理IP信息,先通过队列再保存进数据库。
43 | - 数据库同步模块:设置一定时间间隔同步数据库IP到redis缓存中。
44 | - 缓存redis同步模块:设置一定时间间隔同步redis缓存到另一块redis缓存中。
45 | - 缓存redis代理ip校验模块:设置一定时间间隔redis缓存代理ip池校验。
46 | - 前端显示及接口控制模块:显示可用ip页面,及提供ip获取api接口。
47 |
48 | #### 架构图
49 |
50 |
51 | ### IP来源
52 | 代理ip均来自爬虫爬取,有些国内爬取的ip大多都不能用,代理池的ip可用ip大多是国外的ip。爬取的网站有:http://www.xicidaili.com/nn ,http://www.data5u.com/free/index.shtml ,https://free-proxy-list.net ,https://www.my-proxy.com/free-proxy-list.html ,http://spys.one/en/free-proxy-list/ , https://www.proxynova.com/proxy-server-list/ ,https://www.proxy4free.com/list/webproxy1.html ,http://www.gatherproxy.com/ 。
53 | ### 如何使用
54 | **前提:** 已经安装JDK1.8环境,MySQL数据库,Redis。
55 | 先使用maven编译成jar,proxy-pool-1.0.jar。
56 | 使用SpringBoot启动方式,启动即可。
57 | ```java
58 | java -jar proxy-pool-1.0.jar
59 | ```
60 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | 4.0.0
5 |
6 | com.chenerzhu.crawler
7 | proxy-pool
8 | 1.0-SNAPSHOT
9 | jar
10 |
11 | proxy-pool
12 | proxy-pool
13 |
14 |
15 | org.springframework.boot
16 | spring-boot-starter-parent
17 | 2.0.4.RELEASE
18 |
19 |
20 |
21 |
22 | UTF-8
23 | UTF-8
24 | 1.8
25 |
26 |
27 |
28 |
29 | org.springframework.boot
30 | spring-boot-starter-data-jpa
31 |
32 |
33 | org.springframework.boot
34 | spring-boot-starter-data-redis
35 |
36 |
37 | org.apache.commons
38 | commons-pool2
39 |
40 |
41 | org.springframework.boot
42 | spring-boot-starter-web
43 |
44 |
45 | org.springframework.boot
46 | spring-boot-starter-thymeleaf
47 |
48 |
49 | io.lettuce
50 | lettuce-core
51 | 5.1.0.M1
52 |
53 |
54 | mysql
55 | mysql-connector-java
56 | 8.0.11
57 |
58 |
59 | org.projectlombok
60 | lombok
61 | true
62 |
63 |
64 | org.springframework.boot
65 | spring-boot-starter-test
66 | test
67 |
68 |
69 | com.alibaba
70 | fastjson
71 | 1.2.47
72 |
73 |
74 | commons-lang
75 | commons-lang
76 | 2.5
77 |
78 |
79 | org.apache.httpcomponents
80 | httpclient
81 | 4.5.2
82 |
83 |
84 | org.jsoup
85 | jsoup
86 | 1.11.2
87 |
88 |
89 |
90 | junit
91 | junit
92 | 4.12
93 | test
94 |
95 |
96 |
97 |
98 | proxy-pool-1.0
99 |
100 |
101 | org.springframework.boot
102 | spring-boot-maven-plugin
103 |
104 |
105 |
106 |
107 |
108 |
109 |
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/ProxyPoolApplication.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool;
2 |
3 | import org.springframework.boot.SpringApplication;
4 | import org.springframework.boot.autoconfigure.SpringBootApplication;
5 | import org.springframework.boot.web.servlet.ServletComponentScan;
6 |
7 | @SpringBootApplication
8 | @ServletComponentScan("com.chenerzhu.crawler.proxy.pool.listener")
9 | public class ProxyPoolApplication {
10 |
11 | public static void main(String[] args) {
12 | SpringApplication.run(ProxyPoolApplication.class, args);
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/common/HttpMethod.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.common;
2 |
/**
 * HTTP request methods the crawler's HTTP client can issue.
 * Crawler jobs default to GET; some sources require POST (see AbstractCrawler).
 *
 * @author chenerzhu
 * @create 2018-09-08 17:54
 **/
public enum HttpMethod {
    GET,
    POST,
    PUT,
    PATCH,
    DELETE;
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/common/RedisKey.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.common;
2 |
/**
 * Redis key-name constants shared by the sync and validation jobs.
 *
 * @author chenerzhu
 * @create 2018-08-31 20:08
 **/
public final class RedisKey {
    /** Key of the raw proxy-ip pool synced from the database. */
    public static final String PROXY_IP_KEY = "PROXY_IP_KEY";
    /** Key of the validated ("real-time") proxy-ip pool served to clients. */
    public static final String PROXY_IP_RT_KEY = "PROXY_IP_RT_KEY";

    // Constant holder: prevent instantiation (utility classes need a private ctor).
    private RedisKey() {
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/config/RedisConfig.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.config;
2 |
3 | import org.springframework.boot.autoconfigure.AutoConfigureAfter;
4 | import org.springframework.boot.autoconfigure.data.redis.RedisAutoConfiguration;
5 | import org.springframework.context.annotation.Bean;
6 | import org.springframework.context.annotation.Configuration;
7 | import org.springframework.data.redis.connection.lettuce.LettuceConnectionFactory;
8 | import org.springframework.data.redis.core.RedisTemplate;
9 | import org.springframework.data.redis.serializer.GenericJackson2JsonRedisSerializer;
10 | import org.springframework.data.redis.serializer.StringRedisSerializer;
11 |
12 | import java.io.Serializable;
13 |
/**
 * Redis configuration: registers a RedisTemplate whose keys are serialized
 * as plain strings and whose values are serialized as JSON via Jackson.
 *
 * @author chenerzhu
 * @create 2018-08-31 16:05
 **/
@Configuration
@AutoConfigureAfter(RedisAutoConfiguration.class)
public class RedisConfig {
    // NOTE(review): generic type parameters appear stripped from this copy of the
    // file (raw RedisTemplate); the java.io.Serializable import suggests it was
    // originally RedisTemplate<String, Serializable> — confirm against VCS.
    @Bean
    public RedisTemplate redisRedisTemplate(LettuceConnectionFactory redisConnectionFactory) {
        RedisTemplate template = new RedisTemplate<>();
        // String keys keep redis entries human-readable; JSON values stay portable.
        template.setKeySerializer(new StringRedisSerializer());
        template.setValueSerializer(new GenericJackson2JsonRedisSerializer());
        template.setConnectionFactory(redisConnectionFactory);
        return template;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/config/SpringConfig.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.config;
2 |
3 | import org.springframework.context.annotation.Configuration;
4 |
/**
 * Placeholder for non-web Spring bean definitions; currently declares none.
 *
 * @author chenerzhu
 * @create 2018-08-30 12:38
 **/
@Configuration
public class SpringConfig {

}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/config/WebConfig.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.config;
2 |
3 | import com.alibaba.fastjson.support.spring.FastJsonHttpMessageConverter;
4 | import org.springframework.context.annotation.Bean;
5 | import org.springframework.context.annotation.Configuration;
6 | import org.springframework.http.MediaType;
7 | import org.springframework.http.converter.HttpMessageConverter;
8 | import org.springframework.http.converter.StringHttpMessageConverter;
9 | import org.springframework.http.converter.json.Jackson2ObjectMapperBuilder;
10 | import org.springframework.http.converter.json.MappingJackson2HttpMessageConverter;
11 | import org.springframework.web.servlet.config.annotation.DefaultServletHandlerConfigurer;
12 | import org.springframework.web.servlet.config.annotation.EnableWebMvc;
13 | import org.springframework.web.servlet.config.annotation.ResourceHandlerRegistry;
14 | import org.springframework.web.servlet.config.annotation.WebMvcConfigurer;
15 |
16 | import java.nio.charset.Charset;
17 | import java.util.ArrayList;
18 | import java.util.List;
19 |
/**
 * Spring MVC configuration: static-resource mappings, default servlet
 * handling, and the message converters (UTF-8 strings + fastjson JSON).
 *
 * NOTE(review): generic parameters look stripped from this copy (raw List,
 * "List>") — likely List&lt;MediaType&gt; / List&lt;HttpMessageConverter&lt;?&gt;&gt; originally;
 * confirm against version control.
 *
 * @author chenerzhu
 * @create 2018-05-27 14:10
 **/
@Configuration
@EnableWebMvc // enables MVC Java config support (equivalent of <mvc:annotation-driven/>)
public class WebConfig implements WebMvcConfigurer {

    // Media types advertised by both converters below.
    private static List buildDefaultMediaTypes() {
        List list = new ArrayList<>();
        list.add(MediaType.TEXT_HTML); // must be first so browsers receive HTML by default
        list.add(MediaType.APPLICATION_JSON_UTF8);
        return list;
    }

    // Map /static, /js and /css URL prefixes to the classpath static folders.
    @Override
    public void addResourceHandlers(ResourceHandlerRegistry registry) {
        registry.addResourceHandler("/static/**").addResourceLocations("classpath:/static/");
        registry.addResourceHandler("/js/**").addResourceLocations("classpath:/static/js/");
        registry.addResourceHandler("/css/**").addResourceLocations("classpath:/static/css/");
    }

    // Fall back to the container's default servlet for unmatched static resources.
    @Override
    public void configureDefaultServletHandling(DefaultServletHandlerConfigurer configurer) {
        configurer.enable();
    }

    // Register the message converters (string first, then fastjson for JSON).
    @Override
    public void configureMessageConverters(List> converters) {
        converters.add(stringHttpMessageConverter());
        converters.add(httpMessageConverter());
    }

    @Bean
    public StringHttpMessageConverter stringHttpMessageConverter() {
        // Force UTF-8 so non-ASCII response bodies are not garbled.
        Charset default_charset = Charset.forName("UTF-8");
        StringHttpMessageConverter converter = new StringHttpMessageConverter(default_charset);
        List list = buildDefaultMediaTypes();
        converter.setSupportedMediaTypes(list);
        return converter;
    }

    // fastjson converter used for the JSON API responses.
    @Bean
    public FastJsonHttpMessageConverter httpMessageConverter() {
        FastJsonHttpMessageConverter converter=new FastJsonHttpMessageConverter();
        List list = buildDefaultMediaTypes();
        converter.setSupportedMediaTypes(list);
        return converter;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/context/SpringContextHolder.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.context;
2 |
3 | import lombok.extern.slf4j.Slf4j;
4 | import org.springframework.beans.BeansException;
5 | import org.springframework.beans.factory.DisposableBean;
6 | import org.springframework.context.ApplicationContext;
7 | import org.springframework.context.ApplicationContextAware;
8 | import org.springframework.stereotype.Component;
9 |
10 | /**
11 | * @author chenerzhu
12 | * @create 2018-08-30 21:09
13 | **/
14 | @Slf4j
15 | public class SpringContextHolder implements ApplicationContextAware, DisposableBean {
16 | private static ApplicationContext applicationContext;
17 |
18 | private SpringContextHolder() {
19 | }
20 |
21 | public static void initApplicationContext(ApplicationContext applicationContext) {
22 | if(SpringContextHolder.applicationContext==null){
23 | SpringContextHolder.applicationContext = applicationContext;
24 | }
25 | }
26 |
27 | public static ApplicationContext getApplicationContext() {
28 | return applicationContext;
29 | }
30 |
31 | @Override
32 | public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
33 | if(this.applicationContext==null){
34 | SpringContextHolder.applicationContext = applicationContext;
35 | }
36 | }
37 |
38 | @SuppressWarnings("unchecked")
39 | public static T getBean(String name) {
40 | return (T) getApplicationContext().getBean(name);
41 | }
42 |
43 |
44 | @SuppressWarnings("unchecked")
45 | public static T getBean(Class clazz) {
46 | return (T) getApplicationContext().getBeansOfType(clazz);
47 | }
48 |
49 | @Override
50 | public void destroy() throws Exception {
51 | SpringContextHolder.clear();
52 | }
53 |
54 | public static void clear() {
55 | log.debug("Clear ApplicationContext of SpringContextHolder:" + applicationContext);
56 | applicationContext = null;
57 | }
58 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/controller/BaseController.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.controller;
2 |
3 | import org.springframework.web.bind.annotation.RestController;
4 |
/**
 * Common parent for MVC controllers; a hook for shared controller behavior
 * (currently empty — ProxyIpController extends it).
 *
 * @author chenerzhu
 * @create 2018-08-29 19:52
 **/
public class BaseController {
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/controller/ProxyIpController.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.controller;
2 |
3 | import com.alibaba.fastjson.JSON;
4 | import com.alibaba.fastjson.JSONObject;
5 | import com.alibaba.fastjson.serializer.SerializeConfig;
6 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
7 | import com.chenerzhu.crawler.proxy.pool.entity.Result;
8 | import com.chenerzhu.crawler.proxy.pool.service.IProxyIpRedisService;
9 | import com.chenerzhu.crawler.proxy.pool.service.IProxyIpService;
10 | import lombok.extern.slf4j.Slf4j;
11 | import org.springframework.beans.factory.annotation.Autowired;
12 | import org.springframework.http.ResponseEntity;
13 | import org.springframework.stereotype.Controller;
14 | import org.springframework.ui.ModelMap;
15 | import org.springframework.web.bind.annotation.GetMapping;
16 | import org.springframework.web.bind.annotation.PostMapping;
17 | import org.springframework.web.bind.annotation.RequestMapping;
18 | import org.springframework.web.bind.annotation.ResponseBody;
19 |
20 | import javax.annotation.Resource;
21 | import javax.servlet.http.HttpServletRequest;
22 | import javax.servlet.http.HttpServletResponse;
23 | import java.util.ArrayList;
24 | import java.util.Arrays;
25 | import java.util.List;
26 |
27 | /**
28 | * @author chenerzhu
29 | * @create 2018-08-29 19:51
30 | **/
31 | @Slf4j
32 | @Controller
33 | public class ProxyIpController extends BaseController {
34 | @Autowired
35 | private IProxyIpRedisService proxyIpRedisService;
36 |
37 | @Resource
38 | private IProxyIpService proxyIpService;
39 |
40 | @GetMapping("/")
41 | public String index(ModelMap modelMap){
42 | List proxyIpList=proxyIpRedisService.findAllByPageRt(0,20);
43 | modelMap.put("proxyIpList", JSON.toJSON(proxyIpList));
44 | return "index";
45 | }
46 |
47 | @GetMapping("/proxyIpLow")
48 | @ResponseBody
49 | public Object getProxyIpLow(HttpServletRequest request, HttpServletResponse response, ModelMap modelMap) throws Exception {
50 | ProxyIp proxyIp = proxyIpRedisService.getOne();
51 | boolean available = proxyIpService.testIp(proxyIp.getIp(), proxyIp.getPort(),proxyIp.getType());
52 | while (!available){
53 | proxyIp = proxyIpRedisService.getOne();
54 | available = proxyIpService.testIp(proxyIp.getIp(), proxyIp.getPort(),proxyIp.getType());
55 | }
56 | Result result=new Result();
57 | result.setCode(200);
58 | result.setMessage("success");
59 | result.setData(Arrays.asList(proxyIp));
60 | return result;
61 | }
62 |
63 | @GetMapping("/proxyIp")
64 | @ResponseBody
65 | public Object getProxyIp(HttpServletRequest request, HttpServletResponse response, ModelMap modelMap) throws Exception {
66 | ProxyIp proxyIp = proxyIpRedisService.getOneRt();
67 | Result result=new Result();
68 | result.setCode(200);
69 | result.setMessage("success");
70 | result.setData(Arrays.asList(proxyIp));
71 | return result;
72 | }
73 |
74 | @PostMapping("/test")
75 | @ResponseBody
76 | public Object testIp(HttpServletRequest request, HttpServletResponse response, ModelMap modelMap) throws Exception {
77 | String ip = request.getParameter("ip").trim();
78 | String port = request.getParameter("port").trim();
79 | boolean available = proxyIpService.testIp(ip, Integer.parseInt(port));
80 | Result result=new Result();
81 | result.setCode(200);
82 | result.setData(new ArrayList());
83 | result.setMessage(available==true?"available":"unavailable");
84 | return result;
85 | }
86 |
87 | @GetMapping("/test")
88 | public String test() {
89 | return "test";
90 | }
91 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/entity/ProxyIp.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.entity;
2 |
3 | import com.alibaba.fastjson.annotation.JSONField;
4 | import com.fasterxml.jackson.annotation.JsonIgnore;
5 | import lombok.Data;
6 | import lombok.ToString;
7 | import org.hibernate.annotations.CreationTimestamp;
8 | import org.hibernate.annotations.UpdateTimestamp;
9 |
10 | import javax.persistence.*;
11 | import java.io.Serializable;
12 | import java.util.Date;
13 |
/**
 * JPA entity describing one crawled proxy server and its validation
 * statistics. The Jackson/fastjson annotations control which fields are
 * included in API responses.
 *
 * @author chenerzhu
 * @create 2018-08-29 21:00
 **/
@Data
@ToString
@Entity
@Table(name = "ProxyIp")
public class ProxyIp implements Serializable {
    private static final long serialVersionUID = 1L;
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    @JSONField(serialize = false)
    private long id;
    private String ip;
    private int port;
    private String country; // country
    private String location; // location / region
    private String type; // scheme: https or http
    private String anonymity; // anonymity level
    @Column(name="available" ,nullable=false)
    private boolean available;
    /*@Temporal(TemporalType.TIMESTAMP)
    @CreationTimestamp*/
    @JsonIgnore
    @JSONField(serialize = false)
    private Date createTime;
    /*@UpdateTimestamp
    @Temporal(TemporalType.TIMESTAMP)*/
    @JSONField(format="yyyy-MM-dd HH:mm:ss")
    private Date lastValidateTime;
    @Column(name="validateCount" ,nullable=false,columnDefinition="INT default 0")
    @JsonIgnore
    @JSONField(serialize = false)
    private int validateCount; // number of validation attempts
    @JsonIgnore
    @JSONField(serialize = false)
    private int availableCount; // validations that found the proxy usable
    @JsonIgnore
    @JSONField(serialize = false)
    private int unAvailableCount; // validations that found the proxy unusable
    private long responseTime; // response time (units not shown here — presumably ms; confirm with validator job)
    private long requestTime; // request time
    private long useTime; // total duration of a request made through the proxy
    @Column(scale=3,precision = 5)
    @JSONField(serialize = false)
    private double availableRate; // availability ratio derived from the counters
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/entity/Result.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.entity;
2 |
3 | import lombok.Data;
4 | import lombok.ToString;
5 |
6 | import java.util.List;
7 |
/**
 * Generic JSON response envelope returned by the REST endpoints
 * (see ProxyIpController).
 *
 * @author chenerzhu
 * @create 2018-09-05 22:09
 **/
@ToString
@Data
public class Result {
    private String message; // "success", "available"/"unavailable", or an error text
    private int code; // HTTP-like status code; 200 on success
    private List data; // payload rows (proxy ips); raw List — generics appear stripped in this copy
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/entity/WebPage.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.entity;
2 |
3 | import lombok.Data;
4 | import lombok.ToString;
5 | import org.jsoup.nodes.Document;
6 |
7 | import java.io.Serializable;
8 | import java.util.Date;
9 |
/**
 * Snapshot of a crawled web page: the raw response body, the jsoup-parsed
 * DOM, the normalized html, and the time of the fetch.
 *
 * @author chenerzhu
 * @create 2018-09-02 15:14
 **/
@Data
@ToString
public class WebPage implements Serializable {
    private static final long serialVersionUID = 23454787L;
    private Date crawlTime; // when the page was fetched
    private String page; // raw response body as returned by the http client
    private Document document; // jsoup-parsed DOM of the page
    private String html; // html re-emitted by jsoup from the parsed document
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/exception/ProxyPoolException.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.exception;
2 |
3 | import lombok.extern.slf4j.Slf4j;
4 | import org.springframework.http.HttpStatus;
5 | import org.springframework.web.bind.annotation.ControllerAdvice;
6 | import org.springframework.web.bind.annotation.ExceptionHandler;
7 | import org.springframework.web.bind.annotation.ResponseStatus;
8 | import org.springframework.web.servlet.ModelAndView;
9 |
/**
 * Unchecked application exception for proxy-pool failures; rendered as the
 * error/500 view by ProxyPoolExceptionHandler.
 *
 * @author chenerzhu
 * @create 2018-05-26 19:46
 **/
public class ProxyPoolException extends RuntimeException {
    // Serializable exceptions should pin their serial version id.
    private static final long serialVersionUID = 1L;

    public ProxyPoolException() {
        super();
    }

    public ProxyPoolException(String message) {
        super(message);
    }

    public ProxyPoolException(String message, Throwable e) {
        super(message, e);
    }

    /** Wraps a root cause without an extra message (new, backward-compatible). */
    public ProxyPoolException(Throwable e) {
        super(e);
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/exception/ProxyPoolExceptionHandler.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.exception;
2 |
3 | import lombok.extern.slf4j.Slf4j;
4 | import org.springframework.http.HttpStatus;
5 | import org.springframework.web.bind.annotation.ControllerAdvice;
6 | import org.springframework.web.bind.annotation.ExceptionHandler;
7 | import org.springframework.web.bind.annotation.ResponseStatus;
8 | import org.springframework.web.servlet.ModelAndView;
9 |
10 | /**
11 | * @author chenerzhu
12 | * @create 2018-08-29 20:29
13 | **/
14 | @Slf4j
15 | @ControllerAdvice
16 | public class ProxyPoolExceptionHandler {
17 | @ExceptionHandler(ProxyPoolException.class)
18 | @ResponseStatus(HttpStatus.OK)
19 | public ModelAndView processProxyPool(Exception e){
20 | log.info("自定义异常处理-ProxyPoolException");
21 | ModelAndView m = new ModelAndView();
22 | log.error("error:",e);
23 | m.addObject("exception", e.getMessage());
24 | m.setViewName("error/500");
25 | return m;
26 | }
27 | @ExceptionHandler(Exception.class)
28 | @ResponseStatus(HttpStatus.OK)
29 | public ModelAndView processException(Exception e){
30 | ModelAndView m = new ModelAndView();
31 | log.error("error:",e);
32 | m.addObject("exception", e.getMessage());
33 | m.setViewName("error/500");
34 | return m;
35 | }
36 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/AbstractCrawler.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.common.HttpMethod;
4 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
5 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
6 | import com.chenerzhu.crawler.proxy.pool.job.scheduler.AbstractSchedulerJob;
7 | import com.chenerzhu.crawler.proxy.pool.util.HttpClientUtils;
8 | import lombok.extern.slf4j.Slf4j;
9 | import org.jsoup.Jsoup;
10 |
11 | import java.util.Date;
12 | import java.util.HashMap;
13 | import java.util.Map;
14 | import java.util.concurrent.ConcurrentLinkedQueue;
15 |
16 | /**
17 | * @author chenerzhu
18 | * @create 2018-09-02 13:40
19 | **/
20 | @Slf4j
21 | public abstract class AbstractCrawler extends AbstractSchedulerJob implements ICrawler, Runnable {
22 | protected ConcurrentLinkedQueue proxyIpQueue;
23 | protected String pageUrl;
24 | protected WebPage webPage;
25 | protected HttpMethod httpMethd=HttpMethod.GET;
26 | protected Map formParamMap;
27 | private Map headerMap = new HashMap() {{
28 | put("Connection", "keep-alive");
29 | put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36");
30 | put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
31 | put("Accept-Encoding", "gzip, deflate, sdch");
32 | put("Accept-Language", "zh-CN,zh;q=0.9");
33 | put("Redis-Control", "max-age=0");
34 | put("Upgrade-Insecure-Requests", "1");
35 | }};
36 |
37 | public AbstractCrawler(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
38 | this.proxyIpQueue = proxyIpQueue;
39 | this.pageUrl = pageUrl;
40 | this.httpMethd=HttpMethod.GET;
41 | }
42 |
43 | public AbstractCrawler(ConcurrentLinkedQueue proxyIpQueue, String pageUrl,HttpMethod httpMethd,Map formParamMap) {
44 | this.proxyIpQueue = proxyIpQueue;
45 | this.pageUrl = pageUrl;
46 | this.httpMethd=httpMethd;
47 | this.formParamMap=formParamMap;
48 | }
49 |
50 | @Override
51 | public void run() {
52 | try {
53 | getPage();
54 | parsePage(webPage);
55 | }catch (Exception e){
56 | log.error("{} page process error",pageUrl,e);
57 | }
58 |
59 | }
60 |
61 | @Override
62 | public WebPage getPage() {
63 | WebPage webPage = null;
64 | try {
65 | log.debug("start get page:{}", pageUrl);
66 | headerMap.put("Referer", pageUrl);
67 | String pageContent="";
68 | if(httpMethd==HttpMethod.GET){
69 | pageContent= HttpClientUtils.sendGet(pageUrl, headerMap);
70 | }else if(httpMethd==HttpMethod.POST){
71 | pageContent= HttpClientUtils.sendPostForm(pageUrl, "",headerMap,formParamMap);
72 | }
73 | webPage = new WebPage();
74 | webPage.setCrawlTime(new Date());
75 | webPage.setPage(pageContent);
76 | webPage.setDocument(Jsoup.parse(pageContent));
77 | webPage.setHtml(Jsoup.parse(pageContent).html());
78 | this.webPage = webPage;
79 | log.debug("end get page:{}", pageUrl);
80 | } catch (Exception e) {
81 | log.error("get page:{}", pageUrl, e);
82 | }
83 | return webPage;
84 | }
85 |
86 | public String getPageUrl() {
87 | return pageUrl;
88 | }
89 |
90 | public void setPageUrl(String pageUrl) {
91 | this.pageUrl = pageUrl;
92 | }
93 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/CrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.job.execute.ISchedulerJobExecutor;
5 | import com.chenerzhu.crawler.proxy.pool.job.execute.impl.SchedulerJobExecutor;
6 | import com.chenerzhu.crawler.proxy.pool.service.IProxyIpService;
7 | import lombok.extern.slf4j.Slf4j;
8 | import org.springframework.beans.factory.annotation.Autowired;
9 | import org.springframework.stereotype.Component;
10 |
11 | import java.util.concurrent.*;
12 | import com.chenerzhu.crawler.proxy.pool.thread.ThreadFactory;
13 |
14 | /**
15 | * @author chenerzhu
16 | * @create 2018-09-02 20:16
17 | **/
18 | @Slf4j
19 | @Component
20 | @SuppressWarnings("unchecked")
21 | public class CrawlerJob implements Runnable {
22 | private volatile static ExecutorService executorService= Executors.newFixedThreadPool(5,new ThreadFactory("crawlerJob-consumer"));
23 |
24 | private ISchedulerJobExecutor schedulerJobExecutor=new SchedulerJobExecutor(30,"crawlerJob-producer");
25 |
26 | @Autowired
27 | private IProxyIpService proxyIpService;
28 |
29 | @Override
30 | public void run() {
31 | try{
32 | ConcurrentLinkedQueue proxyIpQueue = new ConcurrentLinkedQueue<>();
33 | //生产者
34 | //schedulerJobExecutor.execute(new XicidailiCrawlerJob(proxyIpQueue, "http://www.xicidaili.com/nn"), 0, 100, TimeUnit.SECONDS);
35 |
36 | //schedulerJobExecutor.execute(new Data5uCrawlerJob(proxyIpQueue, "http://www.data5u.com/free/index.shtml"), 10, 100, TimeUnit.SECONDS);
37 |
38 | schedulerJobExecutor.execute(new FreeProxyListCrawlerJob(proxyIpQueue, "https://free-proxy-list.net"), 20, 100, TimeUnit.SECONDS);
39 |
40 | schedulerJobExecutor.execute(new MyProxyCrawlerJob(proxyIpQueue, "https://www.my-proxy.com/free-proxy-list.html"), 30, 100, TimeUnit.SECONDS);
41 |
42 | //schedulerJobExecutor.execute(new SpysOneCrawlerJob(proxyIpQueue, "http://spys.one/en/free-proxy-list/"), 40, 100, TimeUnit.SECONDS);
43 |
44 | schedulerJobExecutor.execute(new ProxynovaCrawlerJob(proxyIpQueue, "https://www.proxynova.com/proxy-server-list/"), 50, 100, TimeUnit.SECONDS);
45 |
46 | schedulerJobExecutor.execute(new Proxy4FreeCrawlerJob(proxyIpQueue, "https://www.proxy4free.com/list/webproxy1.html"), 60, 100, TimeUnit.SECONDS);
47 |
48 | schedulerJobExecutor.execute(new GatherproxyCrawlerJob(proxyIpQueue, "http://www.gatherproxy.com/"), 70, 100, TimeUnit.SECONDS);
49 |
50 | //消费者
51 | for (int i = 0; i < 5; i++) {
52 | executorService.execute(new Runnable() {
53 | @Override
54 | public void run() {
55 | while (true && !Thread.currentThread().isInterrupted()) {
56 | try {
57 | log.info("the proxyIpQueue current size:{}", proxyIpQueue.size());
58 | ProxyIp proxyIp = proxyIpQueue.poll();
59 | if (proxyIp != null) {
60 | log.debug("get proxy ip:{}", proxyIp.toString());
61 | if (proxyIpService.findByIpEqualsAndPortEqualsAndTypeEquals(proxyIp.getIp(), proxyIp.getPort(), proxyIp.getType()) == null) {
62 | proxyIpService.save(proxyIp);
63 | } else {
64 | log.debug("the proxy ip exist:{}", proxyIp.toString());
65 | }
66 | }else{
67 | TimeUnit.SECONDS.sleep(3);
68 | }
69 | } catch (Exception e) {
70 | log.error("get the proxy ip failed! error:{}",e.getMessage());
71 | //e.printStackTrace();
72 | try {
73 | TimeUnit.SECONDS.sleep(3);
74 | } catch (InterruptedException e1) {
75 | e1.printStackTrace();
76 | }
77 | }
78 | }
79 | }
80 | });
81 | }
82 | }catch (Exception e){
83 | log.error("crawler error:{}",e);
84 | executorService.shutdown();
85 | schedulerJobExecutor.shutdown();
86 | }finally {
87 |
88 | }
89 | }
90 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/Data5uCrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
5 | import lombok.extern.slf4j.Slf4j;
6 | import org.jsoup.nodes.Element;
7 | import org.jsoup.select.Elements;
8 |
9 | import java.util.Date;
10 | import java.util.concurrent.BlockingQueue;
11 | import java.util.concurrent.ConcurrentLinkedQueue;
12 | import java.util.concurrent.TimeUnit;
13 |
14 | /**
15 | * @author chenerzhu
16 | * @create 2018-09-03 20:11
17 | **/
18 | @Slf4j
19 | public class Data5uCrawlerJob extends AbstractCrawler {
20 | public Data5uCrawlerJob(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
21 | super(proxyIpQueue, pageUrl);
22 | }
23 |
24 | @Override
25 | public void parsePage(WebPage webPage) {
26 | Elements elements = webPage.getDocument().getElementsByClass("l2");
27 | Element element;
28 | ProxyIp proxyIp;
29 | for (int i = 0; i < elements.size(); i++) {
30 | try {
31 | element = elements.get(i);
32 | proxyIp = new ProxyIp();
33 | proxyIp.setIp(element.child(0).text());
34 | proxyIp.setPort(Integer.parseInt(element.child(1).text()));
35 | proxyIp.setLocation(element.child(4).text() + "-" + element.child(5).text());
36 | proxyIp.setType(element.child(3).text());
37 | proxyIp.setAvailable(true);
38 | proxyIp.setCreateTime(new Date());
39 | proxyIp.setLastValidateTime(new Date());
40 | proxyIp.setValidateCount(0);
41 | proxyIpQueue.offer(proxyIp);
42 | } catch (Exception e) {
43 | log.error("data5uCrawlerJob error:{0}",e);
44 | }
45 | }
46 | }
47 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/FreeProxyListCrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
5 | import com.chenerzhu.crawler.proxy.pool.job.crawler.AbstractCrawler;
6 | import lombok.extern.slf4j.Slf4j;
7 | import org.jsoup.nodes.Element;
8 | import org.jsoup.select.Elements;
9 |
10 | import java.util.Date;
11 | import java.util.concurrent.BlockingQueue;
12 | import java.util.concurrent.ConcurrentLinkedQueue;
13 | import java.util.concurrent.TimeUnit;
14 |
15 | /**
16 | * @author chenerzhu
17 | * @create 2018-09-04 14:06
18 | * https://free-proxy-list.net/
19 | **/
20 | @Slf4j
21 | public class FreeProxyListCrawlerJob extends AbstractCrawler {
22 | public FreeProxyListCrawlerJob(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
23 | super(proxyIpQueue, pageUrl);
24 | }
25 |
26 | @Override
27 | public void parsePage(WebPage webPage) {
28 | Elements elements = webPage.getDocument().getElementById("proxylisttable").getElementsByTag("tr");
29 | Element element;
30 | ProxyIp proxyIp;
31 | for (int i = 1; i < elements.size() - 1; i++) {
32 | try {
33 | element = elements.get(i);
34 | proxyIp = new ProxyIp();
35 | proxyIp.setIp(element.child(0).text());
36 | proxyIp.setPort(Integer.parseInt(element.child(1).text()));
37 | proxyIp.setLocation(element.child(2).text() + "-" + element.child(3).text());
38 | proxyIp.setType("yes".equalsIgnoreCase(element.child(6).text()) == true ? "https" : "http");
39 | proxyIp.setAvailable(true);
40 | proxyIp.setCreateTime(new Date());
41 | proxyIp.setLastValidateTime(new Date());
42 | proxyIp.setValidateCount(0);
43 | proxyIpQueue.offer(proxyIp);
44 | } catch (Exception e) {
45 | log.error("freeProxyListCrawlerJob error:{0}",e);
46 | }
47 | }
48 | }
49 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/GatherproxyCrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.alibaba.fastjson.JSONObject;
4 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
5 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
6 | import lombok.extern.slf4j.Slf4j;
7 |
8 | import java.util.Date;
9 | import java.util.concurrent.BlockingQueue;
10 | import java.util.concurrent.ConcurrentLinkedQueue;
11 | import java.util.concurrent.TimeUnit;
12 | import java.util.regex.Matcher;
13 | import java.util.regex.Pattern;
14 |
15 | /**
16 | * @author chenerzhu
17 | * @create 2018-09-09 9:09
18 | * http://www.gatherproxy.com/
19 | **/
20 | @Slf4j
21 | public class GatherproxyCrawlerJob extends AbstractCrawler {
22 | public GatherproxyCrawlerJob(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
23 | super(proxyIpQueue, pageUrl);
24 | }
25 |
26 | @Override
27 | public void parsePage(WebPage webPage) {
28 | Pattern pattern = Pattern.compile("\\{\"PROXY_CITY\".*?\"}");
29 | Matcher matcher = null;
30 | matcher = pattern.matcher(webPage.getHtml());
31 | ProxyIp proxyIp = null;
32 | while (matcher.find()) {
33 | try {
34 | JSONObject jsonObject = JSONObject.parseObject(matcher.group(0));
35 | proxyIp = new ProxyIp();
36 | proxyIp.setIp(jsonObject.getString("PROXY_IP"));
37 | proxyIp.setPort(Integer.parseInt(jsonObject.getString("PROXY_PORT"), 16));
38 | proxyIp.setType("SOCKS");//
39 | proxyIp.setLocation(jsonObject.getString("PROXY_COUNTRY"));
40 | proxyIp.setCountry(jsonObject.getString("PROXY_COUNTRY"));
41 | proxyIp.setAnonymity(jsonObject.getString("PROXY_TYPE"));
42 | proxyIp.setAvailable(true);
43 | proxyIp.setCreateTime(new Date());
44 | proxyIp.setLastValidateTime(new Date());
45 | proxyIp.setValidateCount(0);
46 | proxyIpQueue.offer(proxyIp);
47 | } catch (Exception e) {
48 | log.error("freeProxyListCrawlerJob error:{0}",e);
49 | }
50 |
51 | }
52 | }
53 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/ICrawler.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
4 |
5 | /**
6 | * @author chenerzhu
7 | * @create 2018-09-02 13:40
8 | **/
/**
 * Contract for a single-page proxy crawler: fetch one page, then extract
 * proxy entries from it (implementations in this package push the parsed
 * entries onto a shared queue).
 */
public interface ICrawler {
    /** Downloads the target page and returns it together with its parsed document. */
    WebPage getPage();

    /** Extracts proxy entries from the given fetched page. */
    void parsePage(WebPage webPage);
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/MyProxyCrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
5 | import com.chenerzhu.crawler.proxy.pool.job.crawler.AbstractCrawler;
6 | import lombok.extern.slf4j.Slf4j;
7 |
8 | import java.util.Date;
9 | import java.util.concurrent.BlockingQueue;
10 | import java.util.concurrent.ConcurrentLinkedQueue;
11 | import java.util.concurrent.LinkedBlockingQueue;
12 | import java.util.concurrent.TimeUnit;
13 |
14 | /**
15 | * @author chenerzhu
16 | * @create 2018-09-08 16:35
17 | * https://www.my-proxy.com/free-proxy-list.html
18 | **/
19 | @Slf4j
20 | public class MyProxyCrawlerJob extends AbstractCrawler {
21 | public MyProxyCrawlerJob(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
22 | super(proxyIpQueue, pageUrl);
23 | }
24 |
25 | @Override
26 | public void parsePage(WebPage webPage) {
27 | String[] elements = webPage.getDocument().getElementsByClass("list")
28 | .html().split("
");
29 | ProxyIp proxyIp;
30 | String element;
31 | for (int i = 0; i < 43; i++) {
32 | try {
33 | //185.120.37.186:55143#AL
34 | element = elements[i];
35 | String ipPort = element.split("#")[0];
36 | String ip = ipPort.split(":")[0];
37 | String port = ipPort.split(":")[1];
38 | String country = element.split("#")[1];
39 | proxyIp = new ProxyIp();
40 | proxyIp.setIp(ip);
41 | proxyIp.setPort(Integer.parseInt(port));
42 | proxyIp.setType("http");
43 | proxyIp.setCountry(country);
44 | proxyIp.setLocation(country);
45 | proxyIp.setCreateTime(new Date());
46 | proxyIp.setAvailable(true);
47 | proxyIp.setLastValidateTime(new Date());
48 | proxyIp.setValidateCount(0);
49 | proxyIpQueue.offer(proxyIp);
50 | } catch (Exception e) {
51 | log.error("myProxyCrawlerJob error:{0}",e);
52 | }
53 | }
54 |
55 |
56 | }
57 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/Proxy4FreeCrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
5 | import lombok.extern.slf4j.Slf4j;
6 | import org.jsoup.nodes.Element;
7 | import org.jsoup.select.Elements;
8 |
9 | import java.util.Date;
10 | import java.util.concurrent.BlockingQueue;
11 | import java.util.concurrent.ConcurrentLinkedQueue;
12 | import java.util.concurrent.TimeUnit;
13 |
14 | /**
15 | * @author chenerzhu
16 | * @create 2018-09-09 8:43
17 | * https://www.proxy4free.com/list/webproxy1.html
18 | **/
19 | @Slf4j
20 | public class Proxy4FreeCrawlerJob extends AbstractCrawler {
21 | public Proxy4FreeCrawlerJob(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
22 | super(proxyIpQueue, pageUrl);
23 | }
24 |
25 | @Override
26 | public void parsePage(WebPage webPage) {
27 | Elements elements = webPage.getDocument().getElementsByTag("tr");
28 | Element element;
29 | ProxyIp proxyIp;
30 | for (int i = 2; i < elements.size(); i++) {
31 | try {
32 | element = elements.get(i);
33 | proxyIp = new ProxyIp();
34 | proxyIp.setIp(element.child(0).child(0).attr("href").replaceAll("\"", "").split("=")[1]);
35 | proxyIp.setPort(80);
36 | proxyIp.setLocation(element.child(3).text());
37 | proxyIp.setCountry(element.child(3).text());
38 | proxyIp.setAnonymity(element.child(9).text());
39 | proxyIp.setType("unKnow");
40 | proxyIp.setAvailable(true);
41 | proxyIp.setCreateTime(new Date());
42 | proxyIp.setLastValidateTime(new Date());
43 | proxyIp.setValidateCount(0);
44 | proxyIpQueue.offer(proxyIp);
45 | } catch (Exception e) {
46 | log.error("proxy4FreeCrawlerJob error:{0}",e);
47 | }
48 | }
49 |
50 | }
51 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/ProxynovaCrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
5 | import lombok.extern.slf4j.Slf4j;
6 | import org.jsoup.nodes.Element;
7 | import org.jsoup.select.Elements;
8 |
9 | import javax.script.ScriptEngine;
10 | import javax.script.ScriptEngineManager;
11 | import javax.script.ScriptException;
12 | import java.util.Date;
13 | import java.util.concurrent.BlockingQueue;
14 | import java.util.concurrent.ConcurrentLinkedQueue;
15 | import java.util.concurrent.TimeUnit;
16 | import java.util.regex.Matcher;
17 | import java.util.regex.Pattern;
18 |
19 | /**
20 | * @author chenerzhu
21 | * @create 2018-09-08 23:25
22 | * https://www.proxynova.com/proxy-server-list/
23 | **/
24 | @Slf4j
25 | public class ProxynovaCrawlerJob extends AbstractCrawler {
26 | public ProxynovaCrawlerJob(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
27 | super(proxyIpQueue, pageUrl);
28 | }
29 |
30 | @Override
31 | public void parsePage(WebPage webPage) {
32 | Elements elements = webPage.getDocument().getElementsByTag("tbody")
33 | .get(0).getElementsByTag("tr");
34 | Element element;
35 | ProxyIp proxyIp;
36 | for (int i = 0; i < elements.size(); i++) {
37 | try {
38 | element = elements.get(i);
39 | proxyIp = new ProxyIp();
40 | String ip = getIp(element);
41 | if ("".equals(ip)) {
42 | continue;
43 | }
44 | proxyIp.setIp(ip);
45 | proxyIp.setPort(Integer.parseInt(element.child(1).text()));
46 | proxyIp.setLocation(element.child(5).text());
47 | proxyIp.setCountry(element.child(5).text().split("-")[0]);
48 | proxyIp.setAnonymity(element.child(6).text());
49 | proxyIp.setType("unKnow");
50 | proxyIp.setAvailable(true);
51 | proxyIp.setCreateTime(new Date());
52 | proxyIp.setLastValidateTime(new Date());
53 | proxyIp.setValidateCount(0);
54 | proxyIpQueue.offer(proxyIp);
55 | } catch (Exception e) {
56 | log.error("proxynovaCrawlerJob error:{0}",e);
57 | }
58 | }
59 | }
60 |
61 | private String getIp(Element element) throws ScriptException {
62 | String ip = "";
63 | ScriptEngineManager manager = new ScriptEngineManager();
64 | ScriptEngine engine = manager.getEngineByName("js");
65 | Pattern pattern = Pattern.compile("\\(.*?\\);<");
66 | Matcher matcher = null;
67 | matcher = pattern.matcher(element.child(0).html());
68 | if (matcher.find()) {
69 | String ipScript = matcher.group(0).substring(1, matcher.group(0).length() - 1);
70 | ip = (String) engine.eval(ipScript.replaceAll("\\);", ""));
71 | }
72 | return ip;
73 | }
74 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/SpysOneCrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.common.HttpMethod;
4 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
5 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
6 | import lombok.extern.slf4j.Slf4j;
7 | import org.jsoup.nodes.Document;
8 | import org.jsoup.nodes.Element;
9 | import org.jsoup.select.Elements;
10 |
11 | import javax.script.ScriptEngine;
12 | import javax.script.ScriptEngineManager;
13 | import javax.script.ScriptException;
14 | import java.util.Date;
15 | import java.util.HashMap;
16 | import java.util.Map;
17 | import java.util.concurrent.BlockingQueue;
18 | import java.util.concurrent.ConcurrentLinkedQueue;
19 | import java.util.concurrent.TimeUnit;
20 | import java.util.regex.Matcher;
21 | import java.util.regex.Pattern;
22 |
23 | /**
24 | * @author chenerzhu
25 | * @create 2018-09-08 17:25
26 | * http://spys.one/en/free-proxy-list/
27 | * form:xpp=5&xf1=0&xf2=0&xf4=0&xf5=1
28 | **/
29 | @Slf4j
30 | public class SpysOneCrawlerJob extends AbstractCrawler {
31 | public SpysOneCrawlerJob(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
32 | super(proxyIpQueue, pageUrl);
33 | this.httpMethd=HttpMethod.POST;
34 | this.formParamMap=new HashMap(){{
35 | put("xpp","5");
36 | put("xf1","0");
37 | put("xf2","0");
38 | put("xf4","0");
39 | put("xf5","1");
40 | }};
41 | }
42 |
43 | @Override
44 | public void parsePage(WebPage webPage) {
45 | Elements elements = webPage.getDocument().getElementsByClass("spy1xx");
46 | Element element;
47 | ProxyIp proxyIp;
48 | for (int i = 1; i < elements.size(); i++) {
49 | try {
50 | element = elements.get(i);
51 | proxyIp = new ProxyIp();
52 | proxyIp.setIp(element.child(0).selectFirst(".spy14").text());
53 | int port = getPort(element);
54 | if (port == -1) {
55 | continue;
56 | }
57 | proxyIp.setPort(port);
58 | proxyIp.setCountry(element.child(3).selectFirst(".spy14").text());
59 | proxyIp.setLocation(element.child(3).text());
60 | proxyIp.setType(element.child(1).text());
61 | proxyIp.setAnonymity(element.child(2).text());
62 | proxyIp.setAvailable(true);
63 | proxyIp.setCreateTime(new Date());
64 | proxyIp.setLastValidateTime(new Date());
65 | proxyIp.setValidateCount(0);
66 | proxyIpQueue.offer(proxyIp);
67 | } catch (Exception e) {
68 | log.error("spysOneCrawlerJob error:{0}",e);
69 | }
70 | }
71 | }
72 |
73 | private int getPort(Element element) throws ScriptException {
74 | int port = -1;
75 | ScriptEngineManager manager = new ScriptEngineManager();
76 | ScriptEngine engine = manager.getEngineByName("js");
77 | Pattern pattern = Pattern.compile("\\+.*?<");
78 | Matcher matcher = null;
79 | Document document = webPage.getDocument();
80 | String scrpit = document.getElementsByTag("script").get(2).data();
81 | engine.eval(scrpit);
82 | matcher = pattern.matcher(element.child(0).html());
83 | if (matcher.find()) {
84 | String portScript = matcher.group(0).substring(1, matcher.group(0).length() - 2);
85 | Object obj=engine.eval(portScript.replaceAll("\\+", "+''+"));
86 | port = Integer.parseInt((String)obj);
87 | }
88 | return port;
89 | }
90 |
91 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/crawler/XicidailiCrawlerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.crawler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.entity.WebPage;
5 | import com.chenerzhu.crawler.proxy.pool.job.crawler.AbstractCrawler;
6 | import lombok.extern.slf4j.Slf4j;
7 | import org.jsoup.nodes.Element;
8 | import org.jsoup.select.Elements;
9 |
10 | import java.util.Date;
11 | import java.util.concurrent.BlockingQueue;
12 | import java.util.concurrent.ConcurrentLinkedQueue;
13 | import java.util.concurrent.TimeUnit;
14 |
15 | /**
16 | * @author chenerzhu
17 | * @create 2018-09-02 15:23
18 | * http://www.xicidaili.com
19 | **/
20 | @Slf4j
21 | public class XicidailiCrawlerJob extends AbstractCrawler {
22 | public XicidailiCrawlerJob(ConcurrentLinkedQueue proxyIpQueue, String pageUrl) {
23 | super(proxyIpQueue, pageUrl);
24 | }
25 |
26 | @Override
27 | public void parsePage(WebPage webPage) {
28 | Elements elements = webPage.getDocument().getElementsByTag("tr");
29 | Element element;
30 | ProxyIp proxyIp;
31 | for (int i = 1; i < elements.size(); i++) {
32 | try {
33 | element = elements.get(i);
34 | proxyIp = new ProxyIp();
35 | proxyIp.setIp(element.child(1).text());
36 | proxyIp.setPort(Integer.parseInt(element.child(2).text()));
37 | proxyIp.setLocation(element.child(3).text());
38 | proxyIp.setType(element.child(5).text());
39 | proxyIp.setAvailable(true);
40 | proxyIp.setCreateTime(new Date());
41 | proxyIp.setLastValidateTime(new Date());
42 | proxyIp.setValidateCount(0);
43 | proxyIpQueue.offer(proxyIp);
44 | } catch (Exception e) {
45 | log.error("xicidailiCrawlerJob error:{0}",e);
46 | }
47 | }
48 | }
49 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/execute/ISchedulerJobExecutor.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.execute;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.job.scheduler.AbstractSchedulerJob;
4 |
5 | import java.util.concurrent.TimeUnit;
6 |
7 | /**
8 | * @author chenerzhu
9 | * @create 2018-08-30 12:14
10 | **/
/**
 * Executor abstraction for running {@link AbstractSchedulerJob}s periodically.
 */
public interface ISchedulerJobExecutor {
    /** Schedules the job at a fixed rate: first run after delayTime, then every intervalTime. */
    void execute(AbstractSchedulerJob schedulerJob, long delayTime, long intervalTime, TimeUnit timeUnit);

    /** Schedules the job with a fixed delay between the end of one run and the start of the next. */
    void executeDelay(AbstractSchedulerJob schedulerJob, long delayTime, long intervalTime, TimeUnit timeUnit);

    /** Initiates an orderly shutdown of the underlying scheduler. */
    void shutdown();
    //void execute(Runnable runnable);
}
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/execute/impl/SchedulerJobExecutor.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.execute.impl;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.job.execute.ISchedulerJobExecutor;
4 | import com.chenerzhu.crawler.proxy.pool.job.scheduler.AbstractSchedulerJob;
5 | import com.chenerzhu.crawler.proxy.pool.thread.ThreadFactory;
6 |
7 | import java.util.concurrent.*;
8 |
9 | /**
10 | * @author chenerzhu
11 | * @create 2018-08-30 12:15
12 | **/
13 | public class SchedulerJobExecutor implements ISchedulerJobExecutor {
14 |
15 | private ScheduledExecutorService scheduledExecutorService;
16 | public SchedulerJobExecutor(){}
17 |
18 | public SchedulerJobExecutor(String threadFactory){
19 | scheduledExecutorService=Executors.newScheduledThreadPool(10,new ThreadFactory(threadFactory));
20 | }
21 |
22 | public SchedulerJobExecutor(int corePoolSize,String threadFactory){
23 | scheduledExecutorService=Executors.newScheduledThreadPool(corePoolSize,new ThreadFactory(threadFactory));
24 | }
25 |
26 |
27 | public void execute(AbstractSchedulerJob schedulerJob, long delayTime, long intervalTime, TimeUnit timeUnit){
28 | scheduledExecutorService.scheduleAtFixedRate(schedulerJob,delayTime,intervalTime,timeUnit);
29 | }
30 | public void executeDelay(AbstractSchedulerJob schedulerJob, long delayTime, long intervalTime, TimeUnit timeUnit){
31 | scheduledExecutorService.scheduleWithFixedDelay(schedulerJob,delayTime,intervalTime,timeUnit);
32 | }
33 |
34 | public void shutdown(){
35 | scheduledExecutorService.shutdown();
36 | }
37 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/scheduler/AbstractSchedulerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.scheduler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.job.execute.ISchedulerJobExecutor;
5 | import com.chenerzhu.crawler.proxy.pool.job.execute.impl.SchedulerJobExecutor;
6 | import com.chenerzhu.crawler.proxy.pool.thread.ThreadFactory;
7 | import com.chenerzhu.crawler.proxy.pool.util.ProxyUtils;
8 |
9 | import java.util.concurrent.*;
10 |
11 |
12 | /**
13 | * @author chenerzhu
14 | * @create 2018-08-30 10:27
15 | **/
16 | public abstract class AbstractSchedulerJob implements Runnable {
17 | private volatile transient ExecutorService executorService = Executors.newCachedThreadPool(new ThreadFactory("validate"));
18 |
19 | public Future> execute(Callable> callable) {
20 | initInstance();
21 | return executorService.submit(callable);
22 | }
23 |
24 | public Future> execute(FutureTask> task) {
25 | initInstance();
26 | return executorService.submit(task);
27 | }
28 |
29 | private void initInstance() {
30 | if (executorService.isShutdown()) {
31 | synchronized (AbstractSchedulerJob.class) {
32 | if (executorService.isShutdown()) {
33 | executorService = Executors.newCachedThreadPool(new ThreadFactory("validate"));
34 | }
35 | }
36 | }
37 | }
38 |
39 | public void shutdown() {
40 | executorService.shutdown();
41 | }
42 |
43 | public boolean validateIp(ProxyIp proxyIp) {
44 | boolean available = false;
45 | if (proxyIp.getType().toUpperCase().contains("HTTPS")) {
46 | available = ProxyUtils.validateHttps(proxyIp.getIp(), proxyIp.getPort());
47 | } else if (proxyIp.getType().toUpperCase().contains("HTTP")) {
48 | available = ProxyUtils.validateHttp(proxyIp.getIp(), proxyIp.getPort());
49 | } else if (proxyIp.getType().equalsIgnoreCase("unKnow")) {
50 | available = ProxyUtils.validateHttp(proxyIp.getIp(), proxyIp.getPort());
51 | if (!available) {
52 | available = ProxyUtils.validateHttps(proxyIp.getIp(), proxyIp.getPort());
53 | }
54 | /*if(!available){
55 | available = ProxyUtils.validateHttps(proxyIp.getIp(), proxyIp.getPort());
56 | proxyIp.setType("https");
57 | }
58 | if(!available){
59 | proxyIp.setType("unKnow");
60 | }*/
61 | } else if (proxyIp.getType().toUpperCase().contains("SOCKS")) {
62 | available = ProxyUtils.validateHttp(proxyIp.getIp(), proxyIp.getPort());
63 | if (!available) {
64 | available = ProxyUtils.validateHttps(proxyIp.getIp(), proxyIp.getPort());
65 | }
66 | /*if(!available){
67 | available = ProxyUtils.validateHttps(proxyIp.getIp(), proxyIp.getPort());
68 | proxyIp.setType("https");
69 | }
70 | if(!available){
71 | proxyIp.setType("socks");
72 | }*/
73 | }
74 | return available;
75 | }
76 | }
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/scheduler/SchedulerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.scheduler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.job.execute.ISchedulerJobExecutor;
4 | import com.chenerzhu.crawler.proxy.pool.job.execute.impl.SchedulerJobExecutor;
5 | import lombok.extern.slf4j.Slf4j;
6 | import org.springframework.beans.factory.annotation.Qualifier;
7 | import org.springframework.stereotype.Component;
8 |
9 | import javax.annotation.Resource;
10 | import java.text.SimpleDateFormat;
11 | import java.util.Date;
12 | import java.util.concurrent.TimeUnit;
13 |
14 | /**
15 | * @author chenerzhu
16 | * @create 2018-09-21 15:03
17 | **/
18 | @Slf4j
19 | @Component
20 | public class SchedulerJob implements Runnable {
21 | private static ISchedulerJobExecutor schedulerJobExecutor = new SchedulerJobExecutor(10, "schedulerJob");
22 | @Resource
23 | @Qualifier("syncDbSchedulerJob")
24 | private AbstractSchedulerJob syncDbSchedulerJob;
25 | @Resource
26 | @Qualifier("syncRedisSchedulerJob")
27 | private AbstractSchedulerJob syncRedisSchedulerJob;
28 | @Resource
29 | @Qualifier("validateRedisSchedulerJob")
30 | private AbstractSchedulerJob validateRedisSchedulerJob;
31 | @Override
32 | public void run() {
33 | try{
34 | schedulerJobExecutor.execute(syncDbSchedulerJob,10, 5, TimeUnit.SECONDS);
35 | schedulerJobExecutor.execute(syncRedisSchedulerJob,50, 30, TimeUnit.SECONDS);
36 | schedulerJobExecutor.execute(validateRedisSchedulerJob,100, 30, TimeUnit.SECONDS);
37 | }catch (Exception e){
38 | log.error("schedulerJob error:{}",e);
39 | schedulerJobExecutor.shutdown();
40 | }finally {
41 |
42 | }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/java/com/chenerzhu/crawler/proxy/pool/job/scheduler/SyncDbSchedulerJob.java:
--------------------------------------------------------------------------------
1 | package com.chenerzhu.crawler.proxy.pool.job.scheduler;
2 |
3 | import com.chenerzhu.crawler.proxy.pool.entity.ProxyIp;
4 | import com.chenerzhu.crawler.proxy.pool.service.IProxyIpRedisService;
5 | import com.chenerzhu.crawler.proxy.pool.service.IProxyIpService;
6 | import lombok.extern.slf4j.Slf4j;
7 | import org.springframework.beans.factory.annotation.Autowired;
8 | import org.springframework.stereotype.Component;
9 |
10 | import java.util.ArrayList;
11 | import java.util.Date;
12 | import java.util.List;
13 | import java.util.concurrent.Callable;
14 | import java.util.concurrent.CopyOnWriteArrayList;
15 | import java.util.concurrent.FutureTask;
16 | import java.util.concurrent.TimeUnit;
17 | import java.util.concurrent.atomic.AtomicInteger;
18 | import java.util.stream.IntStream;
19 |
20 | /**
21 | * @author chenerzhu
22 | * @create 2018-09-07 17:25
23 | **/
24 | @Slf4j
25 | @Component
26 | @SuppressWarnings("unchecked")
27 | public class SyncDbSchedulerJob extends AbstractSchedulerJob {
28 |
29 | @Autowired
30 | private IProxyIpRedisService proxyIpRedisService;
31 | @Autowired
32 | private IProxyIpService proxyIpService;
33 |
34 |
@Override
public void run() {
    // One validation sweep: page through candidate proxies from the DB, validate
    // each on the worker pool, mirror available ones into redis, then batch the
    // results back to the DB.
    // NOTE(review): several generic type arguments appear stripped in this
    // extraction (e.g. "List availableIpList", "List> taskList") — code kept
    // byte-identical; confirm against the repository.
    try {
        List availableIpList = new CopyOnWriteArrayList();
        List unAvailableIpList = new CopyOnWriteArrayList();
        // only proxies with 3..100 prior validations are re-checked
        int validateCountBefore = 3;
        int validateCountAfter = 100;
        double availableRate=0.5;// re-select proxies whose availability rate is above 0.5
        long totalCount = proxyIpService.totalCount(validateCountBefore,validateCountAfter,availableRate);
        log.info("proxyIp total count:{}", totalCount);
        AtomicInteger availableIpCount=new AtomicInteger(0);
        AtomicInteger unAvailableIpCount=new AtomicInteger(0);
        int pageSize = 200;
        // ceiling division: last partial page still gets processed
        int pageCount = (int) ((int) (totalCount % pageSize) == 0 ? totalCount / pageSize : totalCount / pageSize + 1);
        List> taskList = new ArrayList<>();
        long start = System.currentTimeMillis();
        IntStream.range(0, pageCount).forEach(pageNumber -> {
            List proxyIpList = proxyIpService.findAllByPage(pageNumber, pageSize, validateCountBefore,validateCountAfter ,availableRate);
            proxyIpList.forEach(proxyIp -> {
                // one validation task per proxy, submitted to the inherited pool
                FutureTask task = new FutureTask(new Callable() {
                    @Override
                    public ProxyIp call() {
                        try{
                            long startTime = System.currentTimeMillis();
                            boolean available = validateIp(proxyIp);
                            long endTime = System.currentTimeMillis();
                            log.info("validateIp ==> ip:{} port:{} available:{} total time:{}", proxyIp.getIp(), proxyIp.getPort(), available, (endTime - startTime));
                            if (available) {
                                // re-add (remove first) so redis holds the refreshed stats
                                if (proxyIpRedisService.isExist(proxyIp)) {
                                    log.info("redis exist ip:{} port:{}", proxyIp.getIp(), proxyIp.getPort());
                                    proxyIpRedisService.remove(proxyIp);
                                }
                                proxyIp.setLastValidateTime(new Date());
                                proxyIp.setAvailable(available);
                                proxyIp.setValidateCount(proxyIp.getValidateCount() + 1);
                                proxyIp.setAvailableCount(proxyIp.getAvailableCount()+1);
                                proxyIp.setAvailableRate(proxyIp.getAvailableCount()/(double)proxyIp.getValidateCount());
                                proxyIp.setUseTime(endTime - startTime);
                                proxyIpRedisService.add(proxyIp);
                                log.info("redis add or update ip:{} port:{}", proxyIp.getIp(), proxyIp.getPort());
                                availableIpList.add(proxyIp);
                                availableIpCount.incrementAndGet();
                            } else {
                                //proxyIpRedisService.remove(proxyIp);// first-pass validation does not evict the cache; eviction happens in the second validation pass
                                //log.info("redis remove ip:{} port:{}", proxyIp.getIp(), proxyIp.getPort());
                                proxyIp.setLastValidateTime(new Date());
                                proxyIp.setAvailable(available);
                                proxyIp.setValidateCount(proxyIp.getValidateCount() + 1);
                                proxyIp.setUnAvailableCount(proxyIp.getUnAvailableCount()+1);
                                proxyIp.setAvailableRate(proxyIp.getAvailableCount()/(double)proxyIp.getValidateCount());
                                proxyIp.setUseTime(endTime - startTime);
                                unAvailableIpList.add(proxyIp);
                                unAvailableIpCount.incrementAndGet();
                            }
                            return proxyIp;
                        }catch (Exception e){
                            log.error("syncDb task proxyIP:{}",proxyIp.getIp(),e);
                            try {
                                TimeUnit.SECONDS.sleep(1);
                            } catch (InterruptedException e1) {
                                e1.printStackTrace();
                            }
                        }
                        return null;
                    }
                });
                taskList.add(task);
                execute(task);
            });
            // brief pause between pages — presumably to throttle the validation burst
            try {
                TimeUnit.SECONDS.sleep(1);
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }
        });
        // drain all task results; a slow task is abandoned after 6 seconds
        List proxyIpList = new ArrayList<>();
        taskList.forEach(proxyIpFuture -> {
            try {
                ProxyIp proxyIp = proxyIpFuture.get(6, TimeUnit.SECONDS);
                if(proxyIp!=null){
                    proxyIpList.add(proxyIp);
                }
            } catch (InterruptedException e) {
                log.error("Interrupted ", e);
            } catch (Exception e) {
                log.error("error:", e);
            }
        });
        // persist refreshed stats for both outcome lists in one batched pass
        refreshDataBase(availableIpList,unAvailableIpList);
        long end = System.currentTimeMillis();
        log.info("validate over total time:{}", (end - start));
        log.info("availableIp size:{}", availableIpCount.get());
        log.info("unAvailableIp size:{}", unAvailableIpCount.get());
    } catch (Exception e) {
        log.error("error:", e);
    } finally {
        // release the inherited worker pool; it is lazily re-created on the next run
        shutdown();
    }
}
134 |
// Persists both result lists to the DB in batches of 100 and waits for every
// batch task to finish (up to 10 minutes each) before returning.
// NOTE(review): generic type arguments appear stripped in this extraction
// ("List availableIpList", "List> taskList") — code kept byte-identical.
private void refreshDataBase(List availableIpList,List unAvailableIpList) {
    int batchSize = 100;
    List> taskList = new ArrayList<>();
    long startTime=System.currentTimeMillis();
    log.info("refreshDataBase start...");
    batchUpdate(availableIpList, batchSize, taskList);
    batchUpdate(unAvailableIpList, batchSize, taskList);

    // block until every batch-update task completes (or times out)
    taskList.forEach(proxyIpFuture -> {
        try {
            ProxyIp proxyIp = proxyIpFuture.get(10, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            log.error("refreshDataBase Interrupted ", e);
        } catch (Exception e) {
            log.error("refreshDataBase error:", e);
        }
    });
    long endTime=System.currentTimeMillis();
    log.info("refreshDataBase time:{}",endTime-startTime);
    log.info("refreshDataBase proxyIp size:{}", availableIpList.size()+unAvailableIpList.size());
}
156 |
157 | private void batchUpdate(List ipList, int batchSize, List> taskList) {
158 | CopyOnWriteArrayList cowIpList=new CopyOnWriteArrayList(ipList);
159 | for (int i = 0; i < cowIpList.size(); i++) {
160 | if ((i != 0) && i % batchSize == 0 || (i + 1 == cowIpList.size())) {
161 | if(i() {
163 | @Override
164 | public Object call() throws Exception {
165 | proxyIpService.batchUpdate(cowIpList);
166 | return null;
167 | }
168 | });
169 | taskList.add(task);
170 | execute(task);
171 | }else{
172 | final int start=i;
173 | FutureTask task = new FutureTask(new Callable