├── .gitignore ├── README.md ├── img ├── getProxy.png └── proxyStatistic.png ├── pom.xml └── src ├── main ├── java │ └── com │ │ └── meow │ │ └── proxy │ │ ├── Proxyservice.java │ │ ├── appcontext │ │ └── AppcontextUtil.java │ │ ├── base │ │ └── Const.java │ │ ├── check │ │ ├── ProxyCheck.java │ │ ├── ProxyIp2Addr.java │ │ ├── ProxyRecheckCallBack.java │ │ ├── ProxyRecheckHandler.java │ │ └── ProxyRecheckSender.java │ │ ├── configure │ │ ├── Configure.java │ │ ├── ScheduleConfig.java │ │ └── TaskHolder.java │ │ ├── controller │ │ └── ProxyControllor.java │ │ ├── crawl │ │ └── ProxyCrawl.java │ │ ├── dao │ │ └── ProxyDao.java │ │ ├── deduplicate │ │ └── SimpleBloomFilter.java │ │ ├── download │ │ ├── BaseDownLoader.java │ │ ├── DownLoader.java │ │ ├── WebDriverFactory.java │ │ └── impl │ │ │ ├── CoderbusyDownLoader.java │ │ │ ├── Data5uDownLoader.java │ │ │ ├── FreeProxyListDownLoader.java │ │ │ ├── GoubanjiaDownLoader.java │ │ │ ├── Ip3366DownLoader.java │ │ │ ├── KxdailiDownLoader.java │ │ │ ├── NianshaoDownLoader.java │ │ │ ├── ProxydbDownLoader.java │ │ │ └── XicidailiDownLoader.java │ │ ├── entity │ │ ├── IPAddr.java │ │ ├── Proxy.java │ │ ├── ProxyQueryResult.java │ │ └── Task.java │ │ ├── enums │ │ ├── CountryType.java │ │ ├── ProxyAnonymousType.java │ │ ├── ProxyProtocolType.java │ │ └── ProxySite.java │ │ ├── extract │ │ ├── Extractor.java │ │ └── impl │ │ │ ├── CoderbusyExtractor.java │ │ │ ├── Data5uExtractor.java │ │ │ ├── FreeProxyListExtractor.java │ │ │ ├── GoubanjiaExtractor.java │ │ │ ├── Ip3366Extractor.java │ │ │ ├── KxdailiExtractor.java │ │ │ ├── NianshaoExtractor.java │ │ │ ├── ProxydbExtractor.java │ │ │ ├── XdailiExtractor.java │ │ │ └── XicidailiExtractor.java │ │ ├── jobs │ │ └── ScheduleJobs.java │ │ ├── request │ │ ├── HttpClientUtil.java │ │ ├── Request.java │ │ ├── RequestRetryHandler.java │ │ └── Response.java │ │ └── service │ │ ├── ProxyService.java │ │ └── impl │ │ └── ProxyServiceImpl.java └── resources │ ├── application.properties 
│ ├── conf │ └── logback.xml │ ├── mapper │ └── ProxyMapper.xml │ ├── proxyservice.sh │ └── sql │ └── Proxy.sql └── test └── java └── com └── meow └── proxy ├── ProxyserviceApplicationTests.java ├── check └── ProxyCheckTest.java ├── conigure └── Task.java └── request └── Request.java /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | 12 | ### IntelliJ IDEA ### 13 | .idea 14 | *.iws 15 | *.iml 16 | *.ipr 17 | 18 | ### NetBeans ### 19 | nbproject/private/ 20 | build/ 21 | nbbuild/ 22 | dist/ 23 | nbdist/ 24 | .nb-gradle/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 爬取网上公开代理 2 | 3 | ## 已支持爬取的代理网站 4 | * 西刺: http://www.xicidaili.com/ 5 | * 全网代理: http://www.goubanjia.com/ 6 | * 云代理: http://www.ip3366.net/ 7 | * Free Proxy List: https://www.us-proxy.org/ 8 | * 无忧代理: http://www.data5u.com/ 9 | * 讯代理: http://www.xdaili.cn/freeproxy 10 | * 年少HTTP PROXY: http://www.nianshao.me/ 11 | * proxydb: http://proxydb.net/ 12 | * 开心代理: http://www.kxdaili.com/dailiip.html 13 | * coderbusy: https://proxy.coderbusy.com/ 14 | 15 | 16 | ## 代理查询接口 17 | * http://localhost:8888/proxy/getProxy?isDemostic=true&anonymousType=elite&protocolType=https
18 | 默认返回前一百条可用代理;
19 | 参数说明:
20 | (1) isDemostic: 可选参数,是否为国内代理,取值为 true 或 false(参数名以接口实际拼写 isDemostic 为准);
21 | (2) anonymousType: 可选参数,代理的匿名类型,分为四种:transparent(透明)、anonymous(匿名)、distorting(混淆)、elite(高匿);
22 | (3) protocolType: 可选参数,代理的协议类型,取值为 http、https、socks4、socks5 或 socks(未细分为 socks4/socks5 的代理统称为 socks)。
23 | 24 | 示例数据: 25 | ![返回数据](/img/getProxy.png) 26 | 27 | * http://localhost:8888/proxy/proxyStatistic
28 | 查询代理数量,按代理站点统计
29 | 示例数据: 30 | ![返回数据](/img/proxyStatistic.png) -------------------------------------------------------------------------------- /img/getProxy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jwnie/proxyservice/0ab1009e24ceffbf5588c85cfe7556f70114c72b/img/getProxy.png -------------------------------------------------------------------------------- /img/proxyStatistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jwnie/proxyservice/0ab1009e24ceffbf5588c85cfe7556f70114c72b/img/proxyStatistic.png -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | com.meow.proxy 7 | proxyservice 8 | 0.0.1-SNAPSHOT 9 | jar 10 | 11 | proxyservice 12 | Free proxy from network. 13 | 14 | 15 | org.springframework.boot 16 | spring-boot-starter-parent 17 | 1.5.9.RELEASE 18 | 19 | 20 | 21 | 22 | UTF-8 23 | UTF-8 24 | 1.8 25 | 1.2.0 26 | 5.1.39 27 | 28 | 29 | 30 | 31 | org.springframework.boot 32 | spring-boot-starter-web 33 | 34 | 35 | org.springframework.boot 36 | spring-boot-starter-aop 37 | 38 | 39 | org.springframework.boot 40 | spring-boot-starter-test 41 | test 42 | 43 | 44 | org.mybatis.spring.boot 45 | mybatis-spring-boot-starter 46 | ${mybatis-spring-boot.version} 47 | 48 | 49 | mysql 50 | mysql-connector-java 51 | ${mysql-connector.version} 52 | 53 | 54 | com.alibaba 55 | fastjson 56 | 1.2.24 57 | 58 | 59 | org.jsoup 60 | jsoup 61 | 1.7.2 62 | 63 | 64 | org.apache.httpcomponents 65 | httpclient 66 | 4.5.3 67 | 68 | 69 | org.seleniumhq.selenium 70 | selenium-java 71 | 2.53.1 72 | 73 | 74 | 75 | io.netty 76 | netty 77 | 78 | 79 | org.apache.commons 80 | commons-lang3 81 | 82 | 83 | commons-logging 84 | commons-logging 85 | 86 | 87 | 88 | 93 | 94 | 105 | 106 | ch.qos.logback 107 | logback-classic 
108 | 1.2.3 109 | 110 | 111 | commons-collections 112 | commons-collections 113 | 3.2 114 | 115 | 116 | commons-io 117 | commons-io 118 | 2.4 119 | 120 | 121 | 122 | commons-lang 123 | commons-lang 124 | 2.6 125 | 126 | 127 | org.apache.commons 128 | commons-lang3 129 | 3.6 130 | 131 | 132 | com.google.guava 133 | guava 134 | 19.0 135 | 136 | 137 | 138 | org.apache.commons 139 | commons-compress 140 | 1.14 141 | 142 | 143 | 144 | org.brotli 145 | dec 146 | 0.1.2 147 | 148 | 149 | 150 | 151 | 152 | src/main/java 153 | ${project.build.directory}/proxyservice 154 | 155 | 156 | 157 | src/main/resources 158 | ${project.build.directory}/proxyservice 159 | 160 | 161 | 162 | 163 | org.springframework.boot 164 | spring-boot-maven-plugin 165 | 166 | 167 | org.apache.maven.plugins 168 | maven-surefire-plugin 169 | 170 | true 171 | 172 | 173 | 174 | org.apache.maven.plugins 175 | maven-compiler-plugin 176 | 177 | 1.8 178 | 1.8 179 | UTF-8 180 | 181 | -verbose 182 | -Xlint:unchecked 183 | -Xlint:deprecation 184 | 185 | 186 | 187 | 188 | 189 | org.apache.maven.plugins 190 | maven-jar-plugin 191 | 192 | 193 | 194 | ${project.build.directory}/proxyservice/lib 195 | 196 | 197 | true 198 | lib/ 199 | com.meow.proxy.Proxyservice 200 | true 201 | true 202 | 203 | 204 | 205 | 207 | *.sh 208 | *.jar 209 | *.properties 210 | conf/*.xml 211 | sql/*.sql 212 | 213 | 214 | 215 | 216 | 217 | maven-source-plugin 218 | 219 | 220 | true 221 | 222 | ${project.build.directory}/proxyservice 223 | 224 | 225 | 226 | compile 227 | 228 | jar 229 | 230 | 231 | 232 | 233 | 234 | 235 | org.apache.maven.plugins 236 | maven-dependency-plugin 237 | 238 | 239 | 240 | copy-dependencies 241 | package 242 | 243 | copy-dependencies 244 | 245 | 246 | 247 | ${project.build.directory}/proxyservice/lib 248 | false 249 | false 250 | true 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | -------------------------------------------------------------------------------- 
/src/main/java/com/meow/proxy/Proxyservice.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy; 2 | 3 | import org.mybatis.spring.annotation.MapperScan; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.boot.SpringApplication; 7 | import org.springframework.boot.autoconfigure.SpringBootApplication; 8 | import org.springframework.context.ApplicationContext; 9 | import org.springframework.scheduling.annotation.EnableScheduling; 10 | 11 | /** 12 | * @author alex 13 | */ 14 | @MapperScan("com.meow.proxy.dao") 15 | @EnableScheduling 16 | @SpringBootApplication 17 | public class Proxyservice { 18 | private final static Logger LOG = LoggerFactory.getLogger(Proxyservice.class); 19 | 20 | public static void main(String[] args) { 21 | LOG.info("Proxyservice start >>>>>>"); 22 | SpringApplication.run(Proxyservice.class, args); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/appcontext/AppcontextUtil.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.appcontext; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | import org.springframework.beans.BeansException; 6 | import org.springframework.context.ApplicationContext; 7 | import org.springframework.context.ApplicationContextAware; 8 | import org.springframework.stereotype.Component; 9 | 10 | /** 11 | * 系统初始化时需要进行设置全局Spring上下文 12 | * 13 | * @author Alex 14 | * date:2017/12/22 15 | * email:jwnie@foxmail.com 16 | */ 17 | @Component 18 | public class AppcontextUtil implements ApplicationContextAware { 19 | private final static Logger LOG = LoggerFactory.getLogger(AppcontextUtil.class); 20 | private static ApplicationContext applicationContext = null; 21 | 22 | /** 23 | * 根据beanName生成对应的bean 24 | * 25 | * @param beanName 26 | * @return 27 | */ 28 | public static 
Object getBean(String beanName) { 29 | return applicationContext.getBean(beanName); 30 | } 31 | 32 | public static ApplicationContext getContext() { 33 | return applicationContext; 34 | } 35 | 36 | 37 | @Override 38 | public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { 39 | if (AppcontextUtil.applicationContext == null) { 40 | AppcontextUtil.applicationContext = applicationContext; 41 | LOG.info("Set applicationContext success!"); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/base/Const.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.base; 2 | 3 | /** 4 | * @author Alex 5 | * date:2017/12/14 6 | * email:jwnie@foxmail.com 7 | */ 8 | public class Const { 9 | /** 10 | * http响应状态码 11 | */ 12 | public final static int REDICT_301 = 301; 13 | public final static int REDICT_302 = 302; 14 | 15 | /** 16 | * userAgent 17 | */ 18 | public final static String USERAGENT = "User-Agent"; 19 | public final static String USER_AGENT[] = { 20 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; CIBA; MAXTHON 2.0)", 21 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36", 22 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36", 23 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", 24 | "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0", 25 | "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3", 26 | "Opera/9.80 (Windows NT 6.1; WOW64; U; en) Presto/2.10.229 Version/11.62", 27 | "Mozilla/5.0 (X11; Linux x86_64) 
AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36", 28 | "Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19" 29 | }; 30 | 31 | /** 32 | * 请求类型(httpGet/httpPost) 33 | */ 34 | public final static String METHOD_HTTPGET = "httpGet"; 35 | public final static String METHOD_HTTPPOST = "httpPost"; 36 | 37 | public static String CHARSET_PATTERN = "charset\\s?=\\s?((\\w+-?)?\\w+)\\s?"; 38 | 39 | /** 40 | * zip压缩的内容 41 | */ 42 | public static String SYMBOL_ZIP = "zip"; 43 | /** 44 | * 返回的内容为Brotli 算法压缩 45 | */ 46 | public static String SYMBOL_BROTLI = "br"; 47 | 48 | public static String CHINESE_CHAR = "[\u4e00-\u9fa5]"; 49 | 50 | public static final int CPU_AVAILABLEPROCESSORS = Runtime.getRuntime().availableProcessors(); 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/check/ProxyCheck.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.check; 2 | 3 | import com.meow.proxy.deduplicate.SimpleBloomFilter; 4 | import com.meow.proxy.request.HttpClientUtil; 5 | import com.meow.proxy.request.Response; 6 | import org.apache.http.HttpHost; 7 | import org.apache.http.impl.client.CloseableHttpClient; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.io.IOException; 12 | import java.net.InetSocketAddress; 13 | import java.net.Socket; 14 | 15 | /** 16 | * @author Alex 17 | * date:2017/12/15 18 | * email:jwnie@foxmail.com 19 | */ 20 | public class ProxyCheck { 21 | private final static Logger LOG = LoggerFactory.getLogger(ProxyCheck.class); 22 | private final static int CHECKPROXY_TIMEOUT = 30000; 23 | /** 24 | * 使用布隆过滤器进行去重 25 | */ 26 | private SimpleBloomFilter simpleBloomFilter = SimpleBloomFilter.getInstance(); 27 | 28 | public static ProxyCheck getInstance() { 29 | 
return ProxyCheckSingleton.PROXY_CHECK; 30 | } 31 | 32 | 33 | /** 34 | * 检测代理前检查之前是否已经爬取过 35 | * 36 | * @param proxy 37 | * @return 38 | */ 39 | private boolean isHadCheck(HttpHost proxy) { 40 | String value = new StringBuilder().append(proxy.getHostName()).append(":").append(proxy.getPort()).toString(); 41 | if (simpleBloomFilter.contains(value)) { 42 | // LOG.info("已经检验过的代理: " + value); 43 | return true; 44 | } 45 | return false; 46 | } 47 | 48 | /** 49 | * 检测过的代理加入布隆过滤器 50 | * 51 | * @param proxy 52 | */ 53 | private void addChecked(HttpHost proxy) { 54 | String value = new StringBuilder().append(proxy.getHostName()).append(":").append(proxy.getPort()).toString(); 55 | simpleBloomFilter.add(value); 56 | } 57 | 58 | /** 59 | * @param proxy 60 | * @return 61 | */ 62 | public boolean checkProxyBySocket(HttpHost proxy, boolean deduplicate) { 63 | if (proxy == null) { 64 | return false; 65 | } 66 | if (deduplicate) { 67 | if (isHadCheck(proxy)) { 68 | return false; 69 | } 70 | } 71 | Socket socket = null; 72 | try { 73 | //失败重试三次 74 | for (int i = 0; i < 2; i++) { 75 | try { 76 | socket = new Socket(); 77 | InetSocketAddress endpointSocketAddr = new InetSocketAddress(proxy.getHostName(), proxy.getPort()); 78 | socket.connect(endpointSocketAddr, CHECKPROXY_TIMEOUT); 79 | return true; 80 | } catch (Exception e) { 81 | LOG.warn("连接失败, remote: " + proxy.getHostName() + ":" + proxy.getPort()); 82 | } finally { 83 | if (socket != null) { 84 | try { 85 | socket.close(); 86 | } catch (IOException e) { 87 | LOG.warn("Socket关闭异常:", e); 88 | } 89 | } 90 | } 91 | } 92 | return false; 93 | } finally { 94 | if (deduplicate) { 95 | addChecked(proxy); 96 | } 97 | } 98 | } 99 | 100 | public boolean checkProxyByRequestBaidu(HttpHost proxy, boolean deduplicate) { 101 | if (proxy == null) { 102 | return false; 103 | } 104 | if (deduplicate) { 105 | if (isHadCheck(proxy)) { 106 | return false; 107 | } 108 | } 109 | String url = "https://www.baidu.com/"; 110 | CloseableHttpClient 
closeableHttpClient = null; 111 | HttpClientUtil httpClientUtil = HttpClientUtil.getInstance(); 112 | try { 113 | closeableHttpClient = httpClientUtil.createHttpClient(CHECKPROXY_TIMEOUT, proxy, null); 114 | Response response = httpClientUtil.getResponse(closeableHttpClient, url); 115 | if (response != null) { 116 | System.out.println(response.getStatusCode()); 117 | // System.out.println(response.getContent()); 118 | if (response.getContent().contains("百度一下,你就知道")) { 119 | return true; 120 | } 121 | } 122 | return false; 123 | } catch (Exception e) { 124 | LOG.warn("验证代理请求出错:", e); 125 | return false; 126 | } finally { 127 | if (deduplicate) { 128 | addChecked(proxy); 129 | } 130 | } 131 | } 132 | 133 | 134 | private static class ProxyCheckSingleton { 135 | private final static ProxyCheck PROXY_CHECK = new ProxyCheck(); 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/check/ProxyIp2Addr.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.check; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import com.alibaba.fastjson.JSONObject; 5 | import com.meow.proxy.entity.IPAddr; 6 | import com.meow.proxy.request.HttpClientUtil; 7 | import com.meow.proxy.request.Response; 8 | import org.apache.http.impl.client.CloseableHttpClient; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | /** 13 | * @author Alex 14 | * date:2017/12/22 15 | * email:jwnie@foxmail.com 16 | */ 17 | public class ProxyIp2Addr { 18 | private final static Logger LOG = LoggerFactory.getLogger(ProxyIp2Addr.class); 19 | private static ProxyIp2Addr ourInstance = new ProxyIp2Addr(); 20 | private static String GETIPINFO_URL = "http://ip.taobao.com/service/getIpInfo.php?ip="; 21 | 22 | public static ProxyIp2Addr getInstance() { 23 | return ourInstance; 24 | } 25 | 26 | private ProxyIp2Addr() { 27 | } 28 | 29 | public IPAddr 
getIPAddrBYTaobaoAPI(String ip) { 30 | IPAddr ipAddr = new IPAddr(); 31 | HttpClientUtil httpClientUtil = HttpClientUtil.getInstance(); 32 | CloseableHttpClient closeableHttpClient = null; 33 | String url = new StringBuilder().append(GETIPINFO_URL).append(ip).toString(); 34 | try { 35 | closeableHttpClient = httpClientUtil.createHttpClient(); 36 | Response response = httpClientUtil.getResponse(closeableHttpClient, url); 37 | if (response != null) { 38 | JSONObject jsonObject = JSON.parseObject(response.getContent()); 39 | if (jsonObject != null) { 40 | ipAddr.setCountry(jsonObject.getString("country")); 41 | ipAddr.setProvince(jsonObject.getString("region")); 42 | ipAddr.setCity(jsonObject.getString("city")); 43 | ipAddr.setIsp(jsonObject.getString("isp")); 44 | } 45 | } 46 | } catch (Exception e) { 47 | LOG.warn("验证代理请求出错:", e); 48 | } 49 | return ipAddr; 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/check/ProxyRecheckCallBack.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.check; 2 | 3 | /** 4 | * @author Alex 5 | * date:2017/12/19 6 | * email:jwnie@foxmail.com 7 | */ 8 | public interface ProxyRecheckCallBack { 9 | public void process(String handleStatus); 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/check/ProxyRecheckHandler.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.check; 2 | 3 | import com.meow.proxy.base.Const; 4 | import com.meow.proxy.entity.Proxy; 5 | import com.meow.proxy.service.ProxyService; 6 | import org.apache.commons.collections.CollectionUtils; 7 | import org.apache.commons.lang3.concurrent.BasicThreadFactory; 8 | import org.apache.http.HttpHost; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import 
org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.stereotype.Component; 13 | 14 | import javax.annotation.PostConstruct; 15 | import javax.annotation.PreDestroy; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | import java.util.concurrent.BlockingDeque; 19 | import java.util.concurrent.LinkedBlockingDeque; 20 | import java.util.concurrent.ScheduledExecutorService; 21 | import java.util.concurrent.ScheduledThreadPoolExecutor; 22 | 23 | /** 24 | * @author Alex 25 | * date:2017/12/19 26 | * email:jwnie@foxmail.com 27 | */ 28 | @Component 29 | public class ProxyRecheckHandler { 30 | private final static Logger LOG = LoggerFactory.getLogger(ProxyRecheckHandler.class); 31 | @Autowired 32 | ProxyService proxyService; 33 | ScheduledExecutorService scheduledExecutorService = null; 34 | BlockingDeque proxyBlockingDeque = null; 35 | private volatile boolean queueIsEmpty = Boolean.TRUE; 36 | 37 | public void handleMessage(ProxyRecheckSender proxyRecheckSender, List proxies) { 38 | int total = proxies.size(); 39 | LOG.info("代理总数:" + total); 40 | if (total > 0) { 41 | synchronized (proxyBlockingDeque) { 42 | proxyBlockingDeque.addAll(proxies); 43 | queueIsEmpty = Boolean.FALSE; 44 | } 45 | } 46 | 47 | while (!queueIsEmpty) { 48 | try { 49 | Thread.sleep(30000); 50 | if (proxyBlockingDeque.size() <= 0) { 51 | queueIsEmpty = Boolean.TRUE; 52 | } 53 | } catch (Exception e) { 54 | LOG.warn("", e); 55 | } 56 | } 57 | proxyRecheckSender.process("完成!"); 58 | 59 | } 60 | 61 | /** 62 | * bean初始化后执行指定操作 63 | */ 64 | @PostConstruct 65 | public void initMethod() { 66 | int size = 2 * Const.CPU_AVAILABLEPROCESSORS; 67 | scheduledExecutorService = new ScheduledThreadPoolExecutor(size + 1, new BasicThreadFactory.Builder().namingPattern("定时检测入库代理线程池").daemon(Boolean.TRUE).build()); 68 | proxyBlockingDeque = new LinkedBlockingDeque(); 69 | for (int i = 0; i < Const.CPU_AVAILABLEPROCESSORS + 1; i++) { 70 | scheduledExecutorService.execute(new 
ProxyRecheckHandlerThread("校验代理线程【" + (i + 1) + "】")); 71 | } 72 | LOG.info("初始化完成!"); 73 | } 74 | 75 | 76 | /** 77 | * bean销毁时释放资源 78 | */ 79 | @PreDestroy 80 | public void destory() { 81 | //关闭线程池 82 | scheduledExecutorService.shutdown(); 83 | LOG.info("关闭资源完毕"); 84 | } 85 | 86 | 87 | class ProxyRecheckHandlerThread implements Runnable { 88 | private final Logger LOG = LoggerFactory.getLogger(ProxyRecheckHandlerThread.class); 89 | private final static int BATCH_UPDATE_SIEZ = 50; 90 | private String threadName; 91 | private List proxies = new ArrayList<>(100); 92 | 93 | 94 | public ProxyRecheckHandlerThread(String threadName) { 95 | this.threadName = threadName; 96 | } 97 | 98 | /** 99 | * When an object implementing interface Runnable is used 100 | * to create a thread, starting the thread causes the object's 101 | * run method to be called in that separately executing 102 | * thread. 103 | *

104 | * The general contract of the method run is that it may 105 | * take any action whatsoever. 106 | * 107 | * @see Thread#run() 108 | */ 109 | @Override 110 | public void run() { 111 | while (true) { 112 | doReCheckProxies(); 113 | } 114 | } 115 | 116 | public void doReCheckProxies() { 117 | ProxyCheck proxyCheck = ProxyCheck.getInstance(); 118 | try { 119 | while (!queueIsEmpty) { 120 | Proxy proxy = proxyBlockingDeque.poll(); 121 | if (proxy != null) { 122 | long begin = System.currentTimeMillis(); 123 | boolean valid = proxyCheck.checkProxyBySocket(new HttpHost(proxy.getIp(), proxy.getPort()), false); 124 | long end = System.currentTimeMillis(); 125 | proxy.setValid(valid); 126 | if (!valid) { 127 | proxy.setInvalidTime(end); 128 | if (proxy.getLastSurviveTime() == null || proxy.getLastSurviveTime() <= 0) { 129 | proxy.setLastSurviveTime(end - proxy.getCheckTime()); 130 | } 131 | } 132 | proxy.setCheckStatus(1); 133 | proxy.setCheckTime(begin); 134 | proxy.setResponseTime(end - begin); 135 | if (proxy.getLastSurviveTime() == null) { 136 | proxy.setLastSurviveTime(-1L); 137 | } 138 | if (proxy.getInvalidTime() == null) { 139 | proxy.setInvalidTime(-1L); 140 | } 141 | if (proxy.getValidTime() == null) { 142 | proxy.setValidTime(1); 143 | } else { 144 | proxy.setValidTime(proxy.getValidTime() + 1); 145 | } 146 | 147 | proxies.add(proxy); 148 | if (proxies.size() >= BATCH_UPDATE_SIEZ) { 149 | int size = proxies.size(); 150 | proxyService.updateProxies(proxies); 151 | proxies.clear(); 152 | LOG.info("批量检测代理成功!数量:" + size); 153 | } 154 | // LOG.info(this.threadName + " 校验代理结果>>> " + proxy.getIp() + ":" + proxy.getProtocolType() + ",是否有效: " + proxy.isValid()); 155 | } 156 | } 157 | if (CollectionUtils.isNotEmpty(proxies)) { 158 | proxyService.updateProxies(proxies); 159 | proxies.clear(); 160 | } 161 | Thread.sleep(10000); 162 | } catch (Exception e) { 163 | LOG.warn("检验代理异常:", e); 164 | } 165 | } 166 | 167 | } 168 | 169 | } 170 | 
-------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/check/ProxyRecheckSender.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.check; 2 | 3 | import com.meow.proxy.entity.Proxy; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.stereotype.Component; 8 | 9 | import java.util.List; 10 | 11 | /** 12 | * @author Alex 13 | * date:2017/12/19 14 | * email:jwnie@foxmail.com 15 | */ 16 | @Component 17 | public class ProxyRecheckSender implements ProxyRecheckCallBack { 18 | private final static Logger LOG = LoggerFactory.getLogger(ProxyRecheckSender.class); 19 | 20 | @Autowired 21 | ProxyRecheckHandler proxyRecheckHandler; 22 | 23 | public void sendRecheckProxies(List proxyList){ 24 | proxyRecheckHandler.handleMessage(this,proxyList); 25 | } 26 | 27 | 28 | @Override 29 | public void process(String handleStatus) { 30 | LOG.info("代理重新检测状态:"+handleStatus); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/configure/Configure.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.configure; 2 | 3 | import org.springframework.beans.factory.annotation.Value; 4 | import org.springframework.stereotype.Component; 5 | 6 | /** 7 | * Created by Jwnie on 2017/12/17. 
8 | */ 9 | @Component 10 | public class Configure { 11 | @Value("${com.meow.proxy.configure.chromedriver.path}") 12 | private String chromeDriverPath; 13 | 14 | public String getChromeDriverPath() 15 | { 16 | return this.chromeDriverPath; 17 | } 18 | 19 | public void setChromeDriverPath(String chromeDriverPath) { 20 | this.chromeDriverPath = chromeDriverPath; 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/configure/ScheduleConfig.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.configure; 2 | 3 | import org.springframework.context.annotation.Bean; 4 | import org.springframework.context.annotation.Configuration; 5 | import org.springframework.scheduling.annotation.EnableScheduling; 6 | import org.springframework.scheduling.annotation.SchedulingConfigurer; 7 | import org.springframework.scheduling.config.ScheduledTaskRegistrar; 8 | 9 | import java.util.concurrent.Executor; 10 | import java.util.concurrent.Executors; 11 | 12 | /** 13 | * Springboot本身默认的执行方式是串行执行,使用线程池使之并行 14 | * @author Alex 15 | * date:2017/12/19 16 | * email:jwnie@foxmail.com 17 | */ 18 | @Configuration 19 | @EnableScheduling 20 | public class ScheduleConfig implements SchedulingConfigurer { 21 | @Override 22 | public void configureTasks(ScheduledTaskRegistrar scheduledTaskRegistrar) { 23 | scheduledTaskRegistrar.setScheduler(taskExecutor()); 24 | } 25 | 26 | @Bean(destroyMethod="shutdown") 27 | public Executor taskExecutor() { 28 | return Executors.newScheduledThreadPool(10); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/configure/TaskHolder.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.configure; 2 | 3 | import com.meow.proxy.entity.Task; 4 | import com.meow.proxy.enums.ProxySite; 5 | 6 | import 
java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * 待爬取代理网站的配置 11 | * Created by Jwnie on 2017/12/17. 12 | */ 13 | public class TaskHolder { 14 | private static TaskHolder ourInstance = new TaskHolder(); 15 | 16 | public static TaskHolder getInstance() { 17 | return ourInstance; 18 | } 19 | 20 | private List taskList = new ArrayList<>(50); 21 | 22 | private TaskHolder() { 23 | this.taskList.add(new Task("http://www.xicidaili.com/", true, 2, "xicidailiDownLoader", "xicidailiExtractor", ProxySite.xicidaili.getProxySiteName())); 24 | this.taskList.add(new Task("http://www.goubanjia.com/", true, 10, "goubanjiaDownLoader", "goubanjiaExtractor", ProxySite.goubanjia.getProxySiteName())); 25 | this.taskList.add(new Task("http://www.ip3366.net", true, 4, "ip3366DownLoader", "ip3366Extractor", ProxySite.ip3366.getProxySiteName())); 26 | this.taskList.add(new Task("http://www.data5u.com/", true, 10, "data5uDownLoader", "data5uExtractor", ProxySite.data5u.getProxySiteName())); 27 | this.taskList.add(new Task("http://www.xdaili.cn/ipagent/freeip/getFreeIps", false, 1, "baseDownLoader", "xdailiExtractor", ProxySite.xdaili.getProxySiteName())); 28 | this.taskList.add(new Task("http://www.nianshao.me/", true, 8, "nianshaoDownLoader", "nianshaoExtractor", ProxySite.nianshao.getProxySiteName())); 29 | this.taskList.add(new Task("http://proxydb.net/", true, 6, "proxydbDownLoader", "proxydbExtractor", ProxySite.proxydb.getProxySiteName())); 30 | this.taskList.add(new Task("http://www.kxdaili.com/dailiip.html", true, 8, "kxdailiDownLoader", "kxdailiExtractor", ProxySite.kxdaili.getProxySiteName())); 31 | this.taskList.add(new Task("https://proxy.coderbusy.com/", true, 6, "coderbusyDownLoader", "coderbusyExtractor", ProxySite.coderbusy.getProxySiteName())); 32 | 33 | //境外的代理網站(部分url需要VPN) 34 | this.taskList.add(new Task("https://free-proxy-list.net", false, 1, "freeProxyListDownLoader", "freeProxyListExtractor", ProxySite.freeProxyList.getProxySiteName())); 35 | } 36 | 37 
| public List getTaskList() { 38 | return taskList; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/controller/ProxyControllor.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.controller; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.meow.proxy.entity.Proxy; 5 | import com.meow.proxy.entity.ProxyQueryResult; 6 | import com.meow.proxy.service.ProxyService; 7 | import org.apache.commons.collections.CollectionUtils; 8 | import org.apache.ibatis.annotations.Param; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.web.bind.annotation.RequestMapping; 13 | import org.springframework.web.bind.annotation.RequestMethod; 14 | import org.springframework.web.bind.annotation.RestController; 15 | 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | import java.util.Map; 19 | 20 | /** 21 | * @author Alex 22 | * date:2017/12/20 23 | * email:jwnie@foxmail.com 24 | */ 25 | @RestController 26 | @RequestMapping(value = "/proxy") 27 | public class ProxyControllor { 28 | private final static Logger LOG = LoggerFactory.getLogger(ProxyControllor.class); 29 | 30 | @Autowired 31 | ProxyService proxyService; 32 | 33 | @RequestMapping(value = "/getProxy", method = RequestMethod.GET) 34 | public ProxyQueryResult getProxy(@Param("protocolType") String protocolType, @Param("isDemostic") String isDemostic, @Param("anonymousType") String anonymousType) { 35 | ProxyQueryResult proxyQueryResult = new ProxyQueryResult(); 36 | List proxies = new ArrayList(); 37 | try { 38 | proxies = proxyService.queryProxy(protocolType, isDemostic, anonymousType); 39 | if (CollectionUtils.isNotEmpty(proxies)) { 40 | int totalCount = proxyService.queryValidProxyCount(protocolType,isDemostic,anonymousType); 41 | 
proxyQueryResult.setTotalProxyCount(totalCount); 42 | proxyQueryResult.setProxies(proxies); 43 | proxyQueryResult.setResProxyCount(proxies.size()); 44 | } 45 | proxyQueryResult.setStatus("success"); 46 | } catch (Exception e) { 47 | LOG.error("查询代理异常:", e); 48 | proxyQueryResult.setProxies(proxies); 49 | proxyQueryResult.setResProxyCount(proxies.size()); 50 | proxyQueryResult.setStatus("failed"); 51 | } 52 | return proxyQueryResult; 53 | } 54 | 55 | @RequestMapping(value = "proxyStatistic", method = RequestMethod.GET) 56 | public JSONArray proxyStatistic() { 57 | JSONArray js = new JSONArray(); 58 | try { 59 | List> list = proxyService.proxyStatisticBySite(); 60 | if (CollectionUtils.isNotEmpty(list)) { 61 | js.addAll(list); 62 | } 63 | } catch (Exception e) { 64 | LOG.error("统计代理异常:",e); 65 | } 66 | return js; 67 | } 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/crawl/ProxyCrawl.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.crawl; 2 | 3 | import com.meow.proxy.appcontext.AppcontextUtil; 4 | import com.meow.proxy.download.BaseDownLoader; 5 | import com.meow.proxy.entity.Proxy; 6 | import com.meow.proxy.entity.Task; 7 | import com.meow.proxy.extract.Extractor; 8 | import org.apache.commons.collections.CollectionUtils; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import org.springframework.stereotype.Component; 12 | 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | 16 | /** 17 | * Created by Jwnie on 2017/12/17. 
18 | */ 19 | @Component 20 | public class ProxyCrawl { 21 | private final static Logger LOG = LoggerFactory.getLogger(ProxyCrawl.class); 22 | public List crawl(Task task){ 23 | List proxies = new ArrayList<>(50); 24 | try { 25 | BaseDownLoader downLoader = (BaseDownLoader)AppcontextUtil.getBean(task.getDownLoadClassName()); 26 | Extractor extractor = (Extractor) AppcontextUtil.getBean(task.getExtractClassName()); 27 | List htmlContentList = downLoader.downLoad(task); 28 | proxies.addAll(extractor.extract(htmlContentList)); 29 | } catch (Exception e) { 30 | LOG.error("代理抽取失败",e); 31 | } 32 | return proxies; 33 | } 34 | 35 | 36 | public List crawl(List tasks){ 37 | List proxies = new ArrayList<>(500); 38 | if(CollectionUtils.isNotEmpty(tasks)){ 39 | for (Task task : tasks){ 40 | proxies.addAll(crawl(task)); 41 | } 42 | } 43 | return proxies; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/dao/ProxyDao.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.dao; 2 | 3 | import com.meow.proxy.entity.Proxy; 4 | import org.apache.ibatis.annotations.Param; 5 | 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | /** 10 | * @author Alex 11 | * date:2017/12/18 12 | * email:jwnie@foxmail.com 13 | */ 14 | public interface ProxyDao { 15 | 16 | void saveProxies(List proxyList); 17 | 18 | void updateProxies(List proxyList); 19 | 20 | List queryValidProxies(); 21 | 22 | /** 23 | * 查询前一百条有效的代理 24 | * 25 | * @return 26 | */ 27 | List queryProxy(@Param("protocolType") String protocolType,@Param("isDemostic") String isDemostic,@Param("anonymousType") String anonymousType); 28 | 29 | List> proxyStatisticBySite(); 30 | 31 | int queryValidProxyCount(@Param("protocolType") String protocolType,@Param("isDemostic") String isDemostic,@Param("anonymousType") String anonymousType); 32 | 33 | } 34 | 
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/deduplicate/SimpleBloomFilter.java:
--------------------------------------------------------------------------------
package com.meow.proxy.deduplicate;

import org.apache.commons.collections.CollectionUtils;

import java.util.BitSet;
import java.util.List;

/**
 * Minimal Bloom filter over strings, exposed as an eagerly-created singleton.
 *
 * <p>NOTE(review): the backing {@link BitSet} is not synchronized; if the singleton is
 * shared between threads, callers need external synchronization.
 *
 * @author Alex
 * date:2017/12/18
 * email:jwnie@foxmail.com
 */
public class SimpleBloomFilter {
    /**
     * Number of bits in the filter: {@code 2 << 24} is 2^25 (33,554,432). It must stay a
     * power of two because {@link SimpleHash#hash(String)} reduces values with a
     * {@code (cap - 1)} bit mask.
     */
    private static final int DEFAULT_SIZE = 2 << 24;
    /**
     * Multipliers of the hash functions; one hash function is created per seed (six).
     */
    private static final int[] seeds = new int[]{7, 11, 13, 31, 37, 61};
    /**
     * Bit storage of the filter.
     */
    private final BitSet bits = new BitSet(DEFAULT_SIZE);
    /**
     * The hash functions, one per entry of {@link #seeds}.
     */
    private final SimpleHash[] func = new SimpleHash[seeds.length];
    // Must be declared after `seeds`: the constructor reads that static array.
    private static final SimpleBloomFilter ourInstance = new SimpleBloomFilter();

    /**
     * @return the single shared filter
     */
    public static SimpleBloomFilter getInstance() {
        return ourInstance;
    }

    /**
     * Creates the hash functions, one per seed.
     */
    private SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Records a value in the filter.
     *
     * @param value the value to record; {@code null} is ignored, mirroring
     *              {@link #contains(String)}, which reports {@code null} as absent
     */
    public void add(String value) {
        if (value == null) {
            return;
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Records every value of the list in the filter.
     *
     * @param valueList values to record; {@code null} or empty is a no-op
     */
    public void addValueList(List<String> valueList) {
        if (CollectionUtils.isEmpty(valueList)) {
            return;
        }
        for (String s : valueList) {
            add(s);
        }
    }


    /**
     * Tests whether a value may already have been added.
     *
     * @param value the value to look up
     * @return {@code false} when the value is {@code null} or definitely absent;
     *         {@code true} when every hash position is set (false positives are possible)
     */
    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        // The value counts as present only if the bit of every hash function is set.
        for (SimpleHash f : func) {
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }


    /**
     * Multiplicative string hash reduced to the range {@code [0, cap)}.
     */
    public static class SimpleHash {
        private final int cap;
        private final int seed;

        /**
         * @param cap  size of the target bit range; expected to be a power of two
         * @param seed multiplier applied at every character
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * @param value the string to hash; must not be {@code null}
         * @return a bit index in {@code [0, cap)}
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                // Folds every character into the running hash.
                result = seed * result + value.charAt(i);
            }
            // Masking keeps the index non-negative and inside the bit range.
            return (cap - 1) & result;
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/BaseDownLoader.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download;

import com.meow.proxy.base.Const;
import com.meow.proxy.entity.Task;
import com.meow.proxy.request.HttpClientUtil;
import com.meow.proxy.request.Request;
import com.meow.proxy.request.Response;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * @author Alex
 * date:2017/12/15
 * email:jwnie@foxmail.com
 */
@Component(value = "baseDownLoader")
public class BaseDownLoader implements DownLoader {
    private final static Logger LOG = LoggerFactory.getLogger(BaseDownLoader.class);

    /** How many times a page load through the browser is attempted before giving up. */
    private static final int WEB_DRIVER_MAX_ATTEMPTS = 3;
    /** Pause between two browser attempts, in milliseconds. */
    private static final long WEB_DRIVER_RETRY_DELAY_MS = 3000L;

    @Autowired
    protected WebDriverFactory webDriverFactory;

    /**
     * Downloads the task's start URL over HTTP. Subclasses override this to add paging.
     *
     * <p>NOTE(review): the client obtained from {@code createHttpClient()} is never closed.
     * Whether closing it here is safe depends on how {@code HttpClientUtil} shares its
     * connection pool, which is not visible from this class - confirm and close if owned.
     *
     * @param task the site to download; {@code null} yields an empty list
     * @return the non-blank page contents; never {@code null}
     */
    @Override
    public List<String> downLoad(Task task) {
        List<String> htmlContentList = new ArrayList<String>(20);
        if (task == null) {
            return htmlContentList;
        }
        HttpClientUtil httpClientUtil = HttpClientUtil.getInstance();
        try {
            CloseableHttpClient closeableHttpClient = httpClientUtil.createHttpClient();
            Request request = new Request();
            setRequestParam(request);
            String htmlContent = downLoad(httpClientUtil, closeableHttpClient, request, task.getUrl());
            if (StringUtils.isNotBlank(htmlContent)) {
                htmlContentList.add(htmlContent);
            }
        } catch (Exception e) {
            LOG.warn("下载异常,任务url:" + task.getUrl(), e);
        }
        return htmlContentList;
    }

    /**
     * Fills in the default request parameters and browser-like headers.
     *
     * @param request the request to configure; {@code null} is ignored
     */
    protected void setRequestParam(Request request) {
        if (request == null) {
            return;
        }
        // Defaults to HTTP GET; subclasses that need POST must override this method.
        request.setMethod(Const.METHOD_HTTPGET);
        // A charset chosen by the caller beforehand is kept.
        if (StringUtils.isEmpty(request.getCharSet())) {
            request.setCharSet("utf-8");
        }
        request.setHeader("User-Agent", Const.USER_AGENT[new Random().nextInt(Const.USER_AGENT.length)]);
        request.setHeader("Connection", "keep-alive");
        request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        request.setHeader("Accept-Encoding", "gzip, deflate, br");
        request.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
    }

    /**
     * Downloads a single URL over HTTP.
     *
     * @param httpClientUtil      helper that executes the request
     * @param closeableHttpClient client to execute with
     * @param request             request parameters
     * @param url                 address to fetch
     * @return the response content, or {@code null} when no response was obtained
     */
    protected String downLoad(HttpClientUtil httpClientUtil, CloseableHttpClient closeableHttpClient, Request request, String url) {
        String htmlContent = null;
        Response response = httpClientUtil.getResponse(closeableHttpClient, request, url);
        if (response != null) {
            htmlContent = response.getContent();
        }
        return htmlContent;
    }

    /**
     * Downloads several URLs over HTTP, one after the other.
     *
     * @param httpClientUtil      helper that executes the requests
     * @param closeableHttpClient client to execute with
     * @param request             request parameters shared by all URLs
     * @param urlList             addresses to fetch; {@code null} or empty yields an empty list
     * @return the non-blank page contents in URL order; never {@code null}
     */
    protected List<String> downLoad(HttpClientUtil httpClientUtil, CloseableHttpClient closeableHttpClient, Request request, List<String> urlList) {
        List<String> htmlContentList = new ArrayList<String>(50);
        if (CollectionUtils.isEmpty(urlList)) {
            return htmlContentList;
        }
        for (String url : urlList) {
            String htmlContent = downLoad(httpClientUtil, closeableHttpClient, request, url);
            if (StringUtils.isNotBlank(htmlContent)) {
                htmlContentList.add(htmlContent);
            }
        }
        return htmlContentList;
    }

    /**
     * Loads a page in the browser and returns its rendered HTML, retrying on failure.
     *
     * <p>Returns as soon as one attempt succeeds; previously every page was loaded three
     * times because the loop never left on success.
     *
     * @param webDriver the browser session to use
     * @param url       address to load
     * @return the {@code outerHTML} of the root element, or {@code null} when every attempt
     *         failed or the thread was interrupted while waiting
     */
    protected String downLoad(WebDriver webDriver, String url) {
        for (int attempt = 1; attempt <= WEB_DRIVER_MAX_ATTEMPTS; attempt++) {
            try {
                webDriver.get(url);
                WebElement webElement = webDriver.findElement(By.xpath("/html"));
                return webElement.getAttribute("outerHTML");
            } catch (Exception e) {
                if (attempt == WEB_DRIVER_MAX_ATTEMPTS) {
                    LOG.warn("下载失败", e);
                    break;
                }
                try {
                    Thread.sleep(WEB_DRIVER_RETRY_DELAY_MS);
                } catch (InterruptedException e1) {
                    // Keep the interrupt visible to callers and stop retrying.
                    Thread.currentThread().interrupt();
                    LOG.warn("", e1);
                    break;
                }
            }
        }
        return null;
    }

    /**
     * Loads several pages in one browser session, which is shut down afterwards.
     *
     * @param urlList addresses to load; {@code null} or empty yields an empty list
     * @return the non-blank page contents in URL order; never {@code null}
     */
    protected List<String> downLoad(List<String> urlList) {
        List<String> htmlContentList = new ArrayList<String>(50);
        if (CollectionUtils.isEmpty(urlList)) {
            return htmlContentList;
        }
        WebDriver webDriver = null;
        try {
            webDriver = this.webDriverFactory.getWebDriver();
            if (webDriver == null) {
                // The factory logs and returns null when the browser cannot start; without
                // this guard every URL would fail and sleep through all of its retries.
                return htmlContentList;
            }
            for (String url : urlList) {
                String htmlContent = downLoad(webDriver, url);
                if (StringUtils.isNotBlank(htmlContent)) {
                    htmlContentList.add(htmlContent);
                }
            }
        } catch (Exception e) {
            LOG.warn("下载异常:", e);
        } finally {
            closeResource(webDriver);
        }
        return htmlContentList;
    }

    /**
     * Ends the browser session.
     *
     * <p>Uses {@code quit()} rather than {@code close()}: {@code close()} only closes the
     * current window, while {@code quit()} also ends the session and its driver process.
     *
     * @param webDriver the session to end; {@code null} is ignored
     */
    protected void closeResource(WebDriver webDriver) {
        if (webDriver != null) {
            webDriver.quit();
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/DownLoader.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download;

import com.meow.proxy.entity.Task;

import java.util.List;

/**
 * Download contract.
 *
 * @author Alex
 * date:2017/12/15
 * email:jwnie@foxmail.com
 */
public interface DownLoader {

    /**
     * Downloads the pages of a task, including paging, and returns their contents.
     *
     * @param task the site to download
     * @return the downloaded page contents
     */
    public List<String> downLoad(Task task);

}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/WebDriverFactory.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download;

import com.meow.proxy.configure.Configure;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Creates Chrome browser sessions for the Selenium-based downloaders.
 *
 * @author Alex
 * date:2017/12/22
 * email:jwnie@foxmail.com
 */
@Component
public class WebDriverFactory {
    private static final Logger LOG = LoggerFactory.getLogger(WebDriverFactory.class);
    @Autowired
    Configure configure;

    /**
     * Starts a new Chrome session with image loading disabled and a 120 second page-load
     * timeout.
     *
     * @return the new session, or {@code null} when Chrome could not be started (the error
     *         is logged); callers are responsible for ending the session
     */
    public WebDriver getWebDriver() {
        WebDriver mDriver = null;
        try {
            System.setProperty("webdriver.chrome.driver", configure.getChromeDriverPath());
            ChromeOptions options = new ChromeOptions();
            // Run without opening a window (currently disabled).
            // options.addArguments("--headless");

            // Tell Chrome not to load images.
            Map<String, Object> contentSettings = new HashMap<String, Object>();
            contentSettings.put("images", 2);
            Map<String, Object> preferences = new HashMap<String, Object>();
            preferences.put("profile.default_content_setting_values", contentSettings);
            options.setExperimentalOption("prefs", preferences);
            mDriver = new ChromeDriver(options);
            mDriver.manage().timeouts().pageLoadTimeout(120, TimeUnit.SECONDS);
            return mDriver;
        } catch (Exception e) {
            LOG.error("启动Chrome发生异常:{}", e);
        }
        return mDriver;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/impl/CoderbusyDownLoader.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download.impl;

import com.meow.proxy.download.BaseDownLoader;
import com.meow.proxy.download.DownLoader;
import com.meow.proxy.entity.Task;
import com.meow.proxy.request.HttpClientUtil;
import com.meow.proxy.request.Request;
import org.apache.commons.collections.CollectionUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * coderbusy:https://proxy.coderbusy.com/
 *
 * @author Alex
 * date:2017/12/28
 * email:jwnie@foxmail.com
 */
@Component(value = "coderbusyDownLoader")
public class CoderbusyDownLoader extends BaseDownLoader implements DownLoader {
    private final static Logger LOG = LoggerFactory.getLogger(CoderbusyDownLoader.class);

    /**
     * Downloads the coderbusy front page plus the paged listings over HTTP.
     *
     * <p>The null check now runs first, so no HTTP client is created for a {@code null}
     * task and the catch block can no longer dereference a {@code null} task.
     *
     * @param task the site to download; {@code null} yields an empty list
     * @return the non-blank page contents; never {@code null}
     */
    @Override
    public List<String> downLoad(Task task) {
        List<String> htmlContentList = new ArrayList<String>(20);
        if (task == null) {
            return htmlContentList;
        }
        HttpClientUtil httpClientUtil = HttpClientUtil.getInstance();
        try {
            CloseableHttpClient closeableHttpClient = httpClientUtil.createHttpClient();
            Request request = new Request();
            setRequestParam(request);
            int pageSize = 1;
            if (task.isSubPageCrawl()) {
                pageSize = task.getSubPageSize() + 1;
            }
            List<String> proxyUrlList = new ArrayList<>(pageSize * 2);
            proxyUrlList.add("https://proxy.coderbusy.com/");
            // Build the listing URLs for every page.
            for (int i = 1; i <= pageSize; i++) {
                // Transparent proxies
                proxyUrlList.add("https://proxy.coderbusy.com/zh-cn/classical/anonymous-type/transparent/p" + i + ".aspx");
                // Anonymous proxies
                proxyUrlList.add("https://proxy.coderbusy.com/zh-cn/classical/anonymous-type/anonymous/p" + i + ".aspx");
                // High-anonymity proxies
                proxyUrlList.add("https://proxy.coderbusy.com/zh-cn/classical/anonymous-type/highanonymous/p" + i + ".aspx");
                // HTTPS proxies
                proxyUrlList.add("https://proxy.coderbusy.com/zh-cn/classical/https-ready/p" + i + ".aspx");
            }
            if (CollectionUtils.isNotEmpty(proxyUrlList)) {
                htmlContentList.addAll(downLoad(httpClientUtil, closeableHttpClient, request, proxyUrlList));
            }
        } catch (Exception e) {
            LOG.warn("下载异常,任务url:" + task.getUrl(), e);
        }
        return htmlContentList;
    }

}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/impl/Data5uDownLoader.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download.impl;

import com.meow.proxy.download.BaseDownLoader;
import com.meow.proxy.download.DownLoader;
import com.meow.proxy.entity.Task;
import com.meow.proxy.request.Request; 7 | import org.apache.commons.collections.CollectionUtils; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.stereotype.Component; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | /** 16 | * 无忧代理IP http://www.data5u.com/ 17 | * 18 | * @author Alex 19 | * date:2017/12/27 20 | * email:jwnie@foxmail.com 21 | */ 22 | @Component(value = "data5uDownLoader") 23 | public class Data5uDownLoader extends BaseDownLoader implements DownLoader { 24 | private final static Logger LOG = LoggerFactory.getLogger(Data5uDownLoader.class); 25 | 26 | /** 27 | * 包括翻页下载,返回List 28 | * 29 | * @param task 30 | * @return 31 | */ 32 | @Override 33 | public List downLoad(Task task) { 34 | List htmlContentList = new ArrayList(20); 35 | try { 36 | if (task != null) { 37 | String origUrl = task.getUrl(); 38 | Request request = new Request(); 39 | setRequestParam(request); 40 | int pageSize = 1; 41 | if (task.isSubPageCrawl()) { 42 | pageSize = task.getSubPageSize() + 1; 43 | } 44 | List proxyUrlList = new ArrayList<>(pageSize * 2); 45 | //代理url拼接 46 | for (int i = 1; i <= pageSize; i++) { 47 | //国内高匿代理 48 | proxyUrlList.add("http://www.data5u.com/free/gngn/index" + i + ".shtml"); 49 | //国内普通代理 50 | proxyUrlList.add("http://www.data5u.com/free/gnpt/index" + i + ".shtml"); 51 | //国外高匿代理 52 | proxyUrlList.add("http://www.data5u.com/free/gwgn/index" + i + ".shtml"); 53 | //国外普通代理 54 | proxyUrlList.add("http://www.data5u.com/free/gwpt/index" + i + ".shtml"); 55 | } 56 | if (CollectionUtils.isNotEmpty(proxyUrlList)) { 57 | htmlContentList.addAll(downLoad(proxyUrlList)); 58 | } 59 | } 60 | } catch (Exception e) { 61 | LOG.warn("下载异常,任务url:" + task.getUrl(), e); 62 | } 63 | return htmlContentList; 64 | } 65 | 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/download/impl/FreeProxyListDownLoader.java: 
--------------------------------------------------------------------------------
package com.meow.proxy.download.impl;

import com.meow.proxy.download.BaseDownLoader;
import com.meow.proxy.download.DownLoader;
import com.meow.proxy.entity.Task;
import com.meow.proxy.request.HttpClientUtil;
import com.meow.proxy.request.Request;
import org.apache.commons.collections.CollectionUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * free-proxy-list.net: proxies hosted abroad.
 *
 * @author Alex
 * date:2017/12/15
 * email:jwnie@foxmail.com
 */
@Component(value = "freeProxyListDownLoader")
public class FreeProxyListDownLoader extends BaseDownLoader implements DownLoader {
    private final static Logger LOG = LoggerFactory.getLogger(FreeProxyListDownLoader.class);

    /**
     * Downloads the fixed set of free-proxy-list sister pages over HTTP. These pages are
     * not paged, so the task's sub-page settings are not consulted.
     *
     * @param task the site to download; {@code null} yields an empty list
     * @return the non-blank page contents; never {@code null}
     */
    @Override
    public List<String> downLoad(Task task) {
        List<String> htmlContentList = new ArrayList<String>(20);
        if (task == null) {
            return htmlContentList;
        }
        HttpClientUtil httpClientUtil = HttpClientUtil.getInstance();
        try {
            CloseableHttpClient closeableHttpClient = httpClientUtil.createHttpClient();
            Request request = new Request();
            request.setCharSet("utf-8");
            setRequestParam(request);
            List<String> proxyUrlList = new ArrayList<>();
            // UK proxies (disabled)
            // proxyUrlList.add("https://free-proxy-list.net/uk-proxy.html");
            // Anonymous proxies (disabled)
            // proxyUrlList.add("https://free-proxy-list.net/anonymous-proxy.html");
            // US proxies
            proxyUrlList.add("https://www.us-proxy.org/");
            // SOCKS proxies
            proxyUrlList.add("https://www.socks-proxy.net/");
            if (CollectionUtils.isNotEmpty(proxyUrlList)) {
                htmlContentList.addAll(downLoad(httpClientUtil, closeableHttpClient, request, proxyUrlList));
            }
        } catch (Exception e) {
            LOG.warn("下载异常,任务url:" + task.getUrl(), e);
        }
        return htmlContentList;
    }

}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/impl/GoubanjiaDownLoader.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download.impl;

import com.meow.proxy.download.BaseDownLoader;
import com.meow.proxy.download.DownLoader;
import com.meow.proxy.entity.Task;
import com.meow.proxy.request.HttpClientUtil;
import com.meow.proxy.request.Request;
import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * goubanjia ("Quanwang") proxy IPs: http://www.goubanjia.com
 *
 * @author Alex
 * date:2017/12/22
 * email:jwnie@foxmail.com
 */
@Component(value = "goubanjiaDownLoader")
public class GoubanjiaDownLoader extends BaseDownLoader implements DownLoader {
    private final static Logger LOG = LoggerFactory.getLogger(GoubanjiaDownLoader.class);

    /**
     * Downloads the paged goubanjia listings through the browser-based downloader.
     *
     * <p>The HTTP {@code Request} the original built here was never used on this path and
     * has been dropped together with the unused start-URL local.
     *
     * @param task the site to download; {@code null} yields an empty list
     * @return the non-blank page contents; never {@code null}
     */
    @Override
    public List<String> downLoad(Task task) {
        List<String> htmlContentList = new ArrayList<String>(20);
        if (task == null) {
            return htmlContentList;
        }
        try {
            int pageSize = 1;
            if (task.isSubPageCrawl()) {
                pageSize = task.getSubPageSize() + 1;
            }
            List<String> proxyUrlList = new ArrayList<>(pageSize * 2);
            // Build the listing URLs for every page.
            for (int i = 1; i <= pageSize; i++) {
                // Domestic high-anonymity proxies
                proxyUrlList.add("http://www.goubanjia.com/free/gngn/index" + i + ".shtml");
                // Domestic transparent proxies
                proxyUrlList.add("http://www.goubanjia.com/free/gnpt/index" + i + ".shtml");
                // Labeled "HTTPS proxies" in the original.
                // NOTE(review): the gwgn/gwpt paths match data5u's foreign listings, so the
                // labels on these two entries look wrong - confirm against the site.
                proxyUrlList.add("http://www.goubanjia.com/free/gwgn/index" + i + ".shtml");
                // Labeled "HTTP proxies" in the original.
                proxyUrlList.add("http://www.goubanjia.com/free/gwpt/index" + i + ".shtml");
            }
            if (CollectionUtils.isNotEmpty(proxyUrlList)) {
                htmlContentList.addAll(downLoad(proxyUrlList));
            }
        } catch (Exception e) {
            LOG.warn("下载异常,任务url:" + task.getUrl(), e);
        }
        return htmlContentList;
    }


}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/impl/Ip3366DownLoader.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download.impl;

import com.meow.proxy.download.BaseDownLoader;
import com.meow.proxy.download.DownLoader;
import com.meow.proxy.entity.Task;
import com.meow.proxy.request.HttpClientUtil;
import com.meow.proxy.request.Request;
import org.apache.commons.collections.CollectionUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * ip3366 ("Yun") proxy IPs: www.ip3366.net
 *
 * @author Alex
 * date:2017/12/15
 * email:jwnie@foxmail.com
 */
@Component(value = "ip3366DownLoader")
public class Ip3366DownLoader extends BaseDownLoader implements DownLoader {
    private final static Logger LOG = LoggerFactory.getLogger(Ip3366DownLoader.class);

    /**
     * Downloads the paged ip3366 listings over HTTP, decoding them as GBK.
     *
     * @param task the site to download; {@code null} yields an empty list
     * @return the non-blank page contents; never {@code null}
     */
    @Override
    public List<String> downLoad(Task task) {
        List<String> htmlContentList = new ArrayList<String>(20);
        if (task == null) {
            return htmlContentList;
        }
        HttpClientUtil httpClientUtil = HttpClientUtil.getInstance();
        try {
            CloseableHttpClient closeableHttpClient = httpClientUtil.createHttpClient();
            Request request = new Request();
            // Set before setRequestParam so the utf-8 default is not applied.
            request.setCharSet("gbk");
            setRequestParam(request);
            int pageSize = 1;
            if (task.isSubPageCrawl()) {
                pageSize = task.getSubPageSize() + 1;
            }
            List<String> proxyUrlList = new ArrayList<>(pageSize * 2);
            // Build the listing URLs for every page.
            for (int i = 1; i <= pageSize; i++) {
                // Domestic high-anonymity proxies
                proxyUrlList.add("http://www.ip3366.net/free/?stype=1&page=" + i);
                // Domestic transparent proxies
                proxyUrlList.add("http://www.ip3366.net/free/?stype=2&page=" + i);
                // Foreign high-anonymity proxies
                proxyUrlList.add("http://www.ip3366.net/free/?stype=3&page=" + i);
                // Foreign ordinary proxies
                proxyUrlList.add("http://www.ip3366.net/free/?stype=4&page=" + i);
            }
            if (CollectionUtils.isNotEmpty(proxyUrlList)) {
                htmlContentList.addAll(downLoad(httpClientUtil, closeableHttpClient, request, proxyUrlList));
            }
        } catch (Exception e) {
            LOG.warn("下载异常,任务url:" + task.getUrl(), e);
        }
        return htmlContentList;
    }

}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/impl/KxdailiDownLoader.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download.impl;

import com.meow.proxy.download.BaseDownLoader;
import com.meow.proxy.download.DownLoader;
import com.meow.proxy.entity.Task;
import com.meow.proxy.request.HttpClientUtil;
import com.meow.proxy.request.Request;
import org.apache.commons.collections.CollectionUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import
org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * kxdaili ("Kaixin") proxies: http://www.kxdaili.com/dailiip.html
 *
 * @author Alex
 * date:2017/12/28
 * email:jwnie@foxmail.com
 */
@Component(value = "kxdailiDownLoader")
public class KxdailiDownLoader extends BaseDownLoader implements DownLoader {
    private final static Logger LOG = LoggerFactory.getLogger(KxdailiDownLoader.class);

    /**
     * Downloads the paged kxdaili listings through the browser-based downloader.
     *
     * <p>Both categories used to point at {@code /dailiip/1/}, so the first listing was
     * fetched twice and the second never; the second entry now uses {@code /dailiip/2/}.
     * The unused HTTP helper, {@code Request} and start-URL locals have been dropped.
     *
     * @param task the site to download; {@code null} yields an empty list
     * @return the non-blank page contents; never {@code null}
     */
    @Override
    public List<String> downLoad(Task task) {
        List<String> htmlContentList = new ArrayList<String>(20);
        if (task == null) {
            return htmlContentList;
        }
        try {
            int pageSize = 1;
            if (task.isSubPageCrawl()) {
                pageSize = task.getSubPageSize() + 1;
            }
            List<String> proxyUrlList = new ArrayList<>(pageSize * 2);
            // Build the listing URLs for every page.
            for (int i = 1; i <= pageSize; i++) {
                // Domestic high-anonymity proxies
                proxyUrlList.add("http://www.kxdaili.com/dailiip/1/" + i + ".html");
                // Domestic ordinary-anonymity proxies
                // NOTE(review): category id 2 follows the site's /dailiip/<category>/<page>
                // scheme - confirm the listing still resolves.
                proxyUrlList.add("http://www.kxdaili.com/dailiip/2/" + i + ".html");
            }
            if (CollectionUtils.isNotEmpty(proxyUrlList)) {
                htmlContentList.addAll(downLoad(proxyUrlList));
            }
        } catch (Exception e) {
            LOG.warn("下载异常,任务url:" + task.getUrl(), e);
        }
        return htmlContentList;
    }

}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/download/impl/NianshaoDownLoader.java:
--------------------------------------------------------------------------------
package com.meow.proxy.download.impl;

import com.meow.proxy.download.BaseDownLoader;
import com.meow.proxy.download.DownLoader;
import com.meow.proxy.entity.Task;
import com.meow.proxy.request.HttpClientUtil;
import com.meow.proxy.request.Request; 8 | import org.apache.commons.collections.CollectionUtils; 9 | import org.apache.http.impl.client.CloseableHttpClient; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.springframework.stereotype.Component; 13 | 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | 17 | /** 18 | * 年少代理:http://www.nianshao.me 19 | * 20 | * @author Alex 21 | * date:2017/12/28 22 | * email:jwnie@foxmail.com 23 | */ 24 | @Component(value = "nianshaoDownLoader") 25 | public class NianshaoDownLoader extends BaseDownLoader implements DownLoader { 26 | private final static Logger LOG = LoggerFactory.getLogger(NianshaoDownLoader.class); 27 | 28 | /** 29 | * 包括翻页下载,返回List 30 | * 31 | * @param task 32 | * @return 33 | */ 34 | @Override 35 | public List downLoad(Task task) { 36 | HttpClientUtil httpClientUtil = HttpClientUtil.getInstance(); 37 | CloseableHttpClient closeableHttpClient = null; 38 | List htmlContentList = new ArrayList(20); 39 | try { 40 | closeableHttpClient = HttpClientUtil.getInstance().createHttpClient(); 41 | if (task != null) { 42 | String origUrl = task.getUrl(); 43 | Request request = new Request(); 44 | request.setCharSet("gbk"); 45 | setRequestParam(request); 46 | int pageSize = 1; 47 | if (task.isSubPageCrawl()) { 48 | pageSize = task.getSubPageSize() + 1; 49 | } 50 | List proxyUrlList = new ArrayList<>(pageSize * 2); 51 | //代理url拼接 52 | for (int i = 1; i <= pageSize; i++) { 53 | //HTTP代理 54 | proxyUrlList.add("http://www.nianshao.me/?stype=1&page=" + i); 55 | //HTTPS代理 56 | proxyUrlList.add("http://www.nianshao.me/?stype=2&page=" + i); 57 | //随机端口代理 58 | proxyUrlList.add("http://www.nianshao.me/?stype=5&page=" + i); 59 | } 60 | if (CollectionUtils.isNotEmpty(proxyUrlList)) { 61 | htmlContentList.addAll(downLoad(httpClientUtil, closeableHttpClient, request, proxyUrlList)); 62 | } 63 | } 64 | } catch (Exception e) { 65 | LOG.warn("下载异常,任务url:" + task.getUrl(), e); 66 | } 67 | return 
htmlContentList; 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/download/impl/ProxydbDownLoader.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.download.impl; 2 | 3 | import com.meow.proxy.download.BaseDownLoader; 4 | import com.meow.proxy.download.DownLoader; 5 | import com.meow.proxy.entity.Task; 6 | import com.meow.proxy.request.HttpClientUtil; 7 | import com.meow.proxy.request.Request; 8 | import org.apache.commons.collections.CollectionUtils; 9 | import org.apache.http.impl.client.CloseableHttpClient; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.springframework.stereotype.Component; 13 | 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | 17 | /** 18 | * proxydb:http://proxydb.net/ 19 | * 20 | * @author Alex 21 | * date:2017/12/28 22 | * email:jwnie@foxmail.com 23 | */ 24 | @Component(value = "proxydbDownLoader") 25 | public class ProxydbDownLoader extends BaseDownLoader implements DownLoader { 26 | private final static Logger LOG = LoggerFactory.getLogger(ProxydbDownLoader.class); 27 | private final int offset = 15; 28 | 29 | /** 30 | * 包括翻页下载,返回List 31 | * 32 | * @param task 33 | * @return 34 | */ 35 | @Override 36 | public List downLoad(Task task) { 37 | List htmlContentList = new ArrayList(20); 38 | try { 39 | if (task != null) { 40 | String origUrl = task.getUrl(); 41 | Request request = new Request(); 42 | setRequestParam(request); 43 | int pageSize = 1; 44 | if (task.isSubPageCrawl()) { 45 | pageSize = task.getSubPageSize() + 1; 46 | } 47 | List proxyUrlList = new ArrayList<>(pageSize * 2); 48 | //代理url拼接 49 | for (int i = 0; i < pageSize; i++) { 50 | //HTTP代理 51 | proxyUrlList.add("http://proxydb.net/?offset=" + (i * offset)); 52 | } 53 | if (CollectionUtils.isNotEmpty(proxyUrlList)) { 54 | htmlContentList.addAll(downLoad(proxyUrlList)); 55 | } 56 | 
} 57 | } catch (Exception e) { 58 | LOG.warn("下载异常,任务url:" + task.getUrl(), e); 59 | } 60 | return htmlContentList; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/download/impl/XicidailiDownLoader.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.download.impl; 2 | 3 | import com.meow.proxy.download.BaseDownLoader; 4 | import com.meow.proxy.download.DownLoader; 5 | import com.meow.proxy.entity.Task; 6 | import com.meow.proxy.request.HttpClientUtil; 7 | import com.meow.proxy.request.Request; 8 | import org.apache.commons.collections.CollectionUtils; 9 | import org.apache.http.impl.client.CloseableHttpClient; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.springframework.stereotype.Component; 13 | 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | 17 | /** 18 | * 西刺免费代理IP:http://www.xicidaili.com/ 19 | * 20 | * @author Alex 21 | * date:2017/12/15 22 | * email:jwnie@foxmail.com 23 | */ 24 | @Component(value = "xicidailiDownLoader") 25 | public class XicidailiDownLoader extends BaseDownLoader implements DownLoader { 26 | private final static Logger LOG = LoggerFactory.getLogger(XicidailiDownLoader.class); 27 | 28 | /** 29 | * 包括翻页下载,返回List 30 | * 31 | * @param task 32 | * @return 33 | */ 34 | @Override 35 | public List downLoad(Task task) { 36 | HttpClientUtil httpClientUtil = HttpClientUtil.getInstance(); 37 | CloseableHttpClient closeableHttpClient = null; 38 | List htmlContentList = new ArrayList(20); 39 | try { 40 | closeableHttpClient = HttpClientUtil.getInstance().createHttpClient(); 41 | if (task != null) { 42 | String origUrl = task.getUrl(); 43 | Request request = new Request(); 44 | setRequestParam(request); 45 | int pageSize = 1; 46 | if (task.isSubPageCrawl()) { 47 | pageSize = task.getSubPageSize() + 1; 48 | } 49 | List proxyUrlList = new 
ArrayList<>(pageSize * 2); 50 | //代理url拼接 51 | for (int i = 1; i <= pageSize; i++) { 52 | //国内高匿代理 53 | proxyUrlList.add("http://www.xicidaili.com/nn/" + i); 54 | //国内透明代理 55 | proxyUrlList.add("http://www.xicidaili.com/nt/" + i); 56 | //HTTPS代理 57 | proxyUrlList.add("http://www.xicidaili.com/wn/" + i); 58 | //HTTP代理 59 | proxyUrlList.add("http://www.xicidaili.com/wt/" + i); 60 | } 61 | if (CollectionUtils.isNotEmpty(proxyUrlList)) { 62 | htmlContentList.addAll(downLoad(httpClientUtil, closeableHttpClient, request, proxyUrlList)); 63 | } 64 | } 65 | } catch (Exception e) { 66 | LOG.warn("下载异常,任务url:" + task.getUrl(), e); 67 | } 68 | return htmlContentList; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/entity/IPAddr.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.entity; 2 | 3 | import com.alibaba.fastjson.JSONObject; 4 | 5 | /** 6 | * @author Alex 7 | * date:2017/12/22 8 | * email:jwnie@foxmail.com 9 | */ 10 | public class IPAddr { 11 | private String country; 12 | private String province; 13 | private String city; 14 | private String isp; 15 | 16 | public String getCountry() { 17 | return country; 18 | } 19 | 20 | public void setCountry(String country) { 21 | this.country = country; 22 | } 23 | 24 | public String getProvince() { 25 | return province; 26 | } 27 | 28 | public void setProvince(String province) { 29 | this.province = province; 30 | } 31 | 32 | public String getCity() { 33 | return city; 34 | } 35 | 36 | public void setCity(String city) { 37 | this.city = city; 38 | } 39 | 40 | public String getIsp() { 41 | return isp; 42 | } 43 | 44 | public void setIsp(String isp) { 45 | this.isp = isp; 46 | } 47 | 48 | @Override 49 | public String toString() { 50 | return JSONObject.toJSON(this).toString(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- 
/src/main/java/com/meow/proxy/entity/Proxy.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.entity; 2 | 3 | import com.alibaba.fastjson.JSONObject; 4 | import com.sun.javafx.beans.IDProperty; 5 | import org.springframework.stereotype.Component; 6 | 7 | /** 8 | * @author Alex 9 | * date:2017/12/13 10 | * email:jwnie@foxmail.com 11 | */ 12 | @Component 13 | public class Proxy { 14 | private Integer id; 15 | private String ip; 16 | private int port; 17 | /** 18 | * 代理匿名类型 19 | */ 20 | private String anonymousType; 21 | /** 22 | * 代理协议类型 23 | */ 24 | private String protocolType; 25 | /** 26 | * 代理所在国家 27 | */ 28 | private String country; 29 | /** 30 | * 代理所在地区 31 | */ 32 | private String area; 33 | /** 34 | * 是否有效 35 | */ 36 | private boolean valid; 37 | /** 38 | * 代理失效时间(时间戳) 39 | */ 40 | private Long invalidTime; 41 | /** 42 | * 上次存活时长 43 | */ 44 | private Long lastSurviveTime; 45 | /** 46 | * 代理验证时间 47 | */ 48 | private Long checkTime; 49 | /** 50 | * 代理验证状态(0:未验证;1:已验证) 51 | */ 52 | private Integer checkStatus; 53 | /** 54 | * 代理评分 55 | */ 56 | private float score; 57 | /** 58 | * 代理来源站点 59 | */ 60 | private String sourceSite; 61 | /** 62 | * 代理有效次数 63 | */ 64 | private Integer validTime; 65 | /** 66 | * 代理采集时间 67 | */ 68 | private Long crawlTime; 69 | /** 70 | * 代理响应时间 71 | */ 72 | private Long responseTime; 73 | 74 | public Integer getId() { 75 | return id; 76 | } 77 | 78 | public void setId(Integer id) { 79 | this.id = id; 80 | } 81 | 82 | public String getIp() { 83 | return ip; 84 | } 85 | 86 | public void setIp(String ip) { 87 | this.ip = ip; 88 | } 89 | 90 | public int getPort() { 91 | return port; 92 | } 93 | 94 | public void setPort(int port) { 95 | this.port = port; 96 | } 97 | 98 | public String getAnonymousType() { 99 | return anonymousType; 100 | } 101 | 102 | public void setAnonymousType(String anonymousType) { 103 | this.anonymousType = anonymousType; 104 | } 105 | 106 | public String 
getProtocolType() { 107 | return protocolType; 108 | } 109 | 110 | public void setProtocolType(String protocolType) { 111 | this.protocolType = protocolType; 112 | } 113 | 114 | public String getCountry() { 115 | return country; 116 | } 117 | 118 | public void setCountry(String country) { 119 | this.country = country; 120 | } 121 | 122 | public String getArea() { 123 | return area; 124 | } 125 | 126 | public void setArea(String area) { 127 | this.area = area; 128 | } 129 | 130 | public boolean isValid() { 131 | return valid; 132 | } 133 | 134 | public void setValid(boolean valid) { 135 | this.valid = valid; 136 | } 137 | 138 | public Long getInvalidTime() { 139 | return invalidTime; 140 | } 141 | 142 | public void setInvalidTime(Long invalidTime) { 143 | this.invalidTime = invalidTime; 144 | } 145 | 146 | public Long getLastSurviveTime() { 147 | return lastSurviveTime; 148 | } 149 | 150 | public void setLastSurviveTime(Long lastSurviveTime) { 151 | this.lastSurviveTime = lastSurviveTime; 152 | } 153 | 154 | public Long getCheckTime() { 155 | return checkTime; 156 | } 157 | 158 | public void setCheckTime(Long checkTime) { 159 | this.checkTime = checkTime; 160 | } 161 | 162 | public Integer getCheckStatus() { 163 | return checkStatus; 164 | } 165 | 166 | public void setCheckStatus(Integer checkStatus) { 167 | this.checkStatus = checkStatus; 168 | } 169 | 170 | public float getScore() { 171 | return score; 172 | } 173 | 174 | public void setScore(float score) { 175 | this.score = score; 176 | } 177 | 178 | public String getSourceSite() { 179 | return sourceSite; 180 | } 181 | 182 | public void setSourceSite(String sourceSite) { 183 | this.sourceSite = sourceSite; 184 | } 185 | 186 | public Integer getValidTime() { 187 | return validTime; 188 | } 189 | 190 | public void setValidTime(Integer validTime) { 191 | this.validTime = validTime; 192 | } 193 | 194 | public Long getCrawlTime() { 195 | return crawlTime; 196 | } 197 | 198 | public void setCrawlTime(Long crawlTime) 
{ 199 | this.crawlTime = crawlTime; 200 | } 201 | 202 | public Long getResponseTime() { 203 | return responseTime; 204 | } 205 | 206 | public void setResponseTime(Long responseTime) { 207 | this.responseTime = responseTime; 208 | } 209 | 210 | @Override 211 | public String toString() { 212 | return JSONObject.toJSONString(this); 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/entity/ProxyQueryResult.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.entity; 2 | 3 | import com.alibaba.fastjson.JSONObject; 4 | 5 | import java.io.Serializable; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * @author Alex 11 | * date:2017/12/20 12 | * email:jwnie@foxmail.com 13 | */ 14 | public class ProxyQueryResult implements Serializable { 15 | /** 16 | * 请求状态(success/failed) 17 | */ 18 | private String status = "failed"; 19 | /** 20 | * 返回的代理数量 21 | */ 22 | private int resProxyCount; 23 | /** 24 | * 代理总数 25 | */ 26 | private int totalProxyCount; 27 | /** 28 | * 返回的代理详情 29 | */ 30 | private List proxies = new ArrayList(200); 31 | 32 | 33 | public String getStatus() { 34 | return status; 35 | } 36 | 37 | public void setStatus(String status) { 38 | this.status = status; 39 | } 40 | 41 | public int getResProxyCount() { 42 | return resProxyCount; 43 | } 44 | 45 | public void setResProxyCount(int resProxyCount) { 46 | this.resProxyCount = resProxyCount; 47 | } 48 | 49 | public int getTotalProxyCount() { 50 | return totalProxyCount; 51 | } 52 | 53 | public void setTotalProxyCount(int totalProxyCount) { 54 | this.totalProxyCount = totalProxyCount; 55 | } 56 | 57 | public List getProxies() { 58 | return proxies; 59 | } 60 | 61 | public void setProxies(List proxies) { 62 | this.proxies = proxies; 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return JSONObject.toJSON(this).toString(); 68 | } 69 | } 70 
| -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/entity/Task.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.entity; 2 | 3 | import com.alibaba.fastjson.JSONObject; 4 | import org.springframework.stereotype.Component; 5 | 6 | /** 7 | * 任务类 8 | * @author Alex 9 | * date:2017/12/15 10 | * email:jwnie@foxmail.com 11 | */ 12 | public class Task { 13 | /** 14 | * 任務url 15 | */ 16 | private String url; 17 | /** 18 | * 是否翻頁採集 19 | */ 20 | private boolean subPageCrawl; 21 | /** 22 | * 翻頁數量 23 | */ 24 | private int subPageSize; 25 | /** 26 | * 下載類class路徑 27 | */ 28 | private String downLoadClassName; 29 | /** 30 | * 抽取类class路径 31 | */ 32 | private String extractClassName; 33 | /** 34 | * 站点名称 35 | */ 36 | private String siteName; 37 | 38 | public String getUrl() 39 | { 40 | return this.url; 41 | } 42 | 43 | public void setUrl(String url) { 44 | this.url = url; 45 | } 46 | 47 | public boolean isSubPageCrawl() { 48 | return this.subPageCrawl; 49 | } 50 | 51 | public void setSubPageCrawl(boolean subPageCrawl) { 52 | this.subPageCrawl = subPageCrawl; 53 | } 54 | 55 | public int getSubPageSize() { 56 | return this.subPageSize; 57 | } 58 | 59 | public void setSubPageSize(int subPageSize) { 60 | this.subPageSize = subPageSize; 61 | } 62 | 63 | public String getDownLoadClassName() { 64 | return this.downLoadClassName; 65 | } 66 | 67 | public void setDownLoadClassName(String downLoadClassName) { 68 | this.downLoadClassName = downLoadClassName; 69 | } 70 | 71 | public String getExtractClassName() { 72 | return this.extractClassName; 73 | } 74 | 75 | public void setExtractClassName(String extractClassName) { 76 | this.extractClassName = extractClassName; 77 | } 78 | 79 | public String toString() 80 | { 81 | return JSONObject.toJSONString(this); 82 | } 83 | 84 | public Task(String url, boolean subPageCrawl, int subPageSize, String downLoadClassName, String 
extractClassName, String siteName) { 85 | this.url = url; 86 | this.subPageCrawl = subPageCrawl; 87 | this.subPageSize = subPageSize; 88 | this.downLoadClassName = downLoadClassName; 89 | this.extractClassName = extractClassName; 90 | this.siteName = siteName; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/enums/CountryType.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.enums; 2 | 3 | /** 4 | * Created by Jwnie on 2017/12/16. 5 | */ 6 | public enum CountryType { 7 | china(1,"china"), 8 | ; 9 | private int key; 10 | private String countryName; 11 | 12 | public int getKey() { 13 | return key; 14 | } 15 | 16 | public void setKey(int key) { 17 | this.key = key; 18 | } 19 | 20 | public String getCountryName() { 21 | return countryName; 22 | } 23 | 24 | public void setCountryName(String countryName) { 25 | this.countryName = countryName; 26 | } 27 | 28 | CountryType(int key, String countryName) { 29 | this.key = key; 30 | this.countryName = countryName; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/enums/ProxyAnonymousType.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.enums; 2 | 3 | /** 4 | * @author Alex 5 | * date:2017/12/13 6 | * email:jwnie@foxmail.com 7 | */ 8 | public enum ProxyAnonymousType { 9 | transparent(1, "transparent"), 10 | anonymous(2, "anonymous"), 11 | distorting(3, "distorting"), 12 | elite(4, "elite"); 13 | 14 | private int anonymousKey; 15 | private String anonymousType; 16 | 17 | public int getAnonymousKey() { 18 | return anonymousKey; 19 | } 20 | 21 | public void setAnonymousKey(int anonymousKey) { 22 | this.anonymousKey = anonymousKey; 23 | } 24 | 25 | public String getAnonymousType() { 26 | return anonymousType; 27 | } 28 | 29 | public void 
setAnonymousType(String anonymousType) { 30 | this.anonymousType = anonymousType; 31 | } 32 | 33 | ProxyAnonymousType(int anonymousKey, String anonymousType) { 34 | this.anonymousKey = anonymousKey; 35 | this.anonymousType = anonymousType; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/enums/ProxyProtocolType.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.enums; 2 | 3 | /** 4 | * @author Alex 5 | * date:2017/12/13 6 | * email:jwnie@foxmail.com 7 | */ 8 | public enum ProxyProtocolType { 9 | http(1, "http"), 10 | https(2, "https"), 11 | socks4(3, "socks4"), 12 | socks5(4, "socks5"), 13 | //不区分socks4或5 14 | socks(5, "socks"); 15 | 16 | private int key; 17 | private String requestType; 18 | 19 | public int getKey() { 20 | return key; 21 | } 22 | 23 | public void setKey(int key) { 24 | this.key = key; 25 | } 26 | 27 | public String getRequestType() { 28 | return requestType; 29 | } 30 | 31 | public void setRequestType(String requestType) { 32 | this.requestType = requestType; 33 | } 34 | 35 | ProxyProtocolType(int key, String requestType) { 36 | this.key = key; 37 | this.requestType = requestType; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/enums/ProxySite.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.enums; 2 | 3 | /** 4 | * Created by Jwnie on 2017/12/16. 
5 | */ 6 | public enum ProxySite { 7 | xicidaili("www.xicidaili.com", "西刺免费代理IP"), 8 | goubanjia("www.goubanjia.com", "全网代理IP"), 9 | ip3366("www.ip3366.net", "云代理IP"), 10 | freeProxyList("free-proxy-list.net", "Free Proxy List"), 11 | data5u("www.data5u.com", "无忧代理IP"), 12 | xdaili("www.xdaili.cn", "讯代理IP"), 13 | nianshao("www.nianshao.me", "年少HTTP PROXY"), 14 | proxydb("proxydb.net", "proxydb"), 15 | kxdaili("kxdaili.com", "开心代理"), 16 | coderbusy("proxy.coderbusy.com", "coderbusy"), 17 | 18 | ; 19 | 20 | private String proxySiteDomain; 21 | private String proxySiteName; 22 | 23 | public String getProxySiteDomain() { 24 | return proxySiteDomain; 25 | } 26 | 27 | public void setProxySiteDomain(String proxySiteDomain) { 28 | this.proxySiteDomain = proxySiteDomain; 29 | } 30 | 31 | public String getProxySiteName() { 32 | return proxySiteName; 33 | } 34 | 35 | public void setProxySiteName(String proxySiteName) { 36 | this.proxySiteName = proxySiteName; 37 | } 38 | 39 | ProxySite(String proxySiteDomain, String proxySiteName) { 40 | this.proxySiteDomain = proxySiteDomain; 41 | this.proxySiteName = proxySiteName; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/extract/Extractor.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.extract; 2 | 3 | import com.meow.proxy.entity.Proxy; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * 代理抽取类 9 | * @author Alex 10 | * date:2017/12/15 11 | * email:jwnie@foxmail.com 12 | */ 13 | public interface Extractor { 14 | 15 | public List extract(String htmlContent); 16 | 17 | public List extract(List htmlContentList); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/extract/impl/CoderbusyExtractor.java: -------------------------------------------------------------------------------- 1 | package 
com.meow.proxy.extract.impl; 2 | 3 | import com.meow.proxy.check.ProxyCheck; 4 | import com.meow.proxy.entity.Proxy; 5 | import com.meow.proxy.enums.ProxyAnonymousType; 6 | import com.meow.proxy.enums.ProxyProtocolType; 7 | import com.meow.proxy.enums.ProxySite; 8 | import com.meow.proxy.extract.Extractor; 9 | import org.apache.commons.collections.CollectionUtils; 10 | import org.apache.commons.lang3.StringUtils; 11 | import org.apache.http.HttpHost; 12 | import org.jsoup.Jsoup; 13 | import org.jsoup.nodes.Document; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | import org.springframework.stereotype.Component; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | /** 24 | * @author Alex 25 | * date:2017/12/28 26 | * email:jwnie@foxmail.com 27 | */ 28 | @Component("coderbusyExtractor") 29 | public class CoderbusyExtractor implements Extractor { 30 | private final static Logger LOG = LoggerFactory.getLogger(CoderbusyExtractor.class); 31 | 32 | @Override 33 | public List extract(String htmlContent) { 34 | List proxies = new ArrayList(100); 35 | Document document = Jsoup.parse(htmlContent); 36 | ProxyCheck proxyCheck = ProxyCheck.getInstance(); 37 | if (document != null) { 38 | Elements elements = document.select("tbody tr"); 39 | if (CollectionUtils.isNotEmpty(elements)) { 40 | for (Element element : elements) { 41 | long beginTime = System.currentTimeMillis(); 42 | Element ipEle = element.select("td").first(); 43 | if (ipEle != null) { 44 | Element portELe = ipEle.nextElementSibling(); 45 | String ip = ipEle.text(); 46 | int port = Integer.parseInt(portELe.text()); 47 | boolean valid = proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true); 48 | if (valid) { 49 | long end = System.currentTimeMillis(); 50 | Element countryEle = portELe.nextElementSibling(); 51 | Element protocolEle = countryEle.nextElementSibling().nextElementSibling(); 52 | 
Element anonymousEle = protocolEle.nextElementSibling().nextElementSibling(); 53 | String isHttps = anonymousEle.nextElementSibling().text(); 54 | String protocol = protocolEle.text(); 55 | if (StringUtils.isNotEmpty(isHttps) && "check".equals(isHttps)) { 56 | protocol = ProxyProtocolType.https.getRequestType(); 57 | } 58 | 59 | Proxy proxy = new Proxy(); 60 | proxy.setCountry(countryEle.text()); 61 | proxy.setIp(ip); 62 | proxy.setPort(port); 63 | proxy.setCheckStatus(1); 64 | proxy.setAnonymousType(getAnonymousType(anonymousEle)); 65 | proxy.setProtocolType(protocol); 66 | proxy.setSourceSite(ProxySite.coderbusy.getProxySiteName()); 67 | proxy.setCheckTime(beginTime); 68 | proxy.setCrawlTime(beginTime); 69 | proxy.setValidTime(1); 70 | proxy.setLastSurviveTime(-1L); 71 | proxy.setInvalidTime(-1L); 72 | proxy.setValid(true); 73 | proxy.setResponseTime(end - beginTime); 74 | LOG.info("Valid proxy:" + proxy.toString()); 75 | proxies.add(proxy); 76 | } 77 | } else { 78 | LOG.error("coderbusyExtractor can not extract anything..., please check."); 79 | } 80 | } 81 | } 82 | } 83 | return proxies; 84 | } 85 | 86 | @Override 87 | public List extract(List htmlContentList) { 88 | List proxies = new ArrayList(200); 89 | if (CollectionUtils.isNotEmpty(htmlContentList)) { 90 | for (String htmlContent : htmlContentList) { 91 | proxies.addAll(extract(htmlContent)); 92 | } 93 | } 94 | return proxies; 95 | } 96 | 97 | /** 98 | * 代理匿名类型清洗 99 | * 100 | * @param element 101 | * @return 102 | */ 103 | private String getAnonymousType(Element element) { 104 | String text = element.text(); 105 | if (StringUtils.isNoneBlank(text)) { 106 | switch (text) { 107 | case "高匿名": 108 | return ProxyAnonymousType.elite.getAnonymousType(); 109 | case "透明": 110 | return ProxyAnonymousType.transparent.getAnonymousType(); 111 | case "匿名": 112 | return ProxyAnonymousType.anonymous.getAnonymousType(); 113 | default: 114 | LOG.error("Can not verify the anonymousType of proxy from ip3366>>>:" + text); 115 
| } 116 | } 117 | return text; 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/extract/impl/Data5uExtractor.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.extract.impl; 2 | 3 | import com.meow.proxy.check.ProxyCheck; 4 | import com.meow.proxy.check.ProxyIp2Addr; 5 | import com.meow.proxy.entity.IPAddr; 6 | import com.meow.proxy.entity.Proxy; 7 | import com.meow.proxy.enums.ProxyAnonymousType; 8 | import com.meow.proxy.enums.ProxySite; 9 | import com.meow.proxy.extract.Extractor; 10 | import org.apache.commons.collections.CollectionUtils; 11 | import org.apache.commons.lang3.StringUtils; 12 | import org.apache.http.HttpHost; 13 | import org.jsoup.Jsoup; 14 | import org.jsoup.nodes.Document; 15 | import org.jsoup.nodes.Element; 16 | import org.jsoup.select.Elements; 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | import org.springframework.stereotype.Component; 20 | 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | 24 | /** 25 | * @author Alex 26 | * date:2017/12/27 27 | * email:jwnie@foxmail.com 28 | */ 29 | @Component(value = "data5uExtractor") 30 | public class Data5uExtractor implements Extractor { 31 | private final static Logger LOG = LoggerFactory.getLogger(Data5uExtractor.class); 32 | 33 | @Override 34 | public List extract(String htmlContent) { 35 | List proxies = new ArrayList(100); 36 | Document document = Jsoup.parse(htmlContent); 37 | ProxyCheck proxyCheck = ProxyCheck.getInstance(); 38 | if (document != null) { 39 | Elements elements = document.select("ul.l2"); 40 | if (CollectionUtils.isNotEmpty(elements)) { 41 | for (Element element : elements) { 42 | long beginTime = System.currentTimeMillis(); 43 | Element ipEle = element.select("span").first(); 44 | if (ipEle != null) { 45 | Element portEle = ipEle.nextElementSibling(); 46 | String ip = ipEle.text(); 47 | int 
port = Integer.parseInt(portEle.text()); 48 | boolean valid = proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true); 49 | if (valid) { 50 | long end = System.currentTimeMillis(); 51 | Element anonymousEle = portEle.nextElementSibling(); 52 | Element protocolEle = anonymousEle.nextElementSibling(); 53 | Element coutryEle = protocolEle.nextElementSibling(); 54 | Element areaEle = coutryEle.nextElementSibling(); 55 | String area = areaEle.text(); 56 | String country = coutryEle.text(); 57 | 58 | if ("香港".equals(country) || "澳门".equals(country) || "台湾".equals(country)) { 59 | country = "中国 " + country; 60 | } 61 | 62 | 63 | if (StringUtils.isEmpty(country)) { 64 | IPAddr ipAddr = ProxyIp2Addr.getInstance().getIPAddrBYTaobaoAPI(ip); 65 | country = ipAddr.getCountry(); 66 | StringBuilder sb = new StringBuilder(); 67 | sb.append(ipAddr.getProvince()).append(" ").append(ipAddr.getCity()); 68 | area = sb.toString(); 69 | } 70 | 71 | Proxy proxy = new Proxy(); 72 | proxy.setCountry(country); 73 | proxy.setIp(ip); 74 | proxy.setPort(port); 75 | proxy.setArea(area); 76 | proxy.setCheckStatus(1); 77 | proxy.setAnonymousType(getAnonymousType(anonymousEle)); 78 | proxy.setProtocolType(protocolEle.text()); 79 | proxy.setSourceSite(ProxySite.data5u.getProxySiteName()); 80 | proxy.setCheckTime(beginTime); 81 | proxy.setCrawlTime(beginTime); 82 | proxy.setValidTime(1); 83 | //默认值 84 | proxy.setLastSurviveTime(-1L); 85 | //默认值 86 | proxy.setInvalidTime(-1L); 87 | proxy.setValid(true); 88 | proxy.setResponseTime(end - beginTime); 89 | LOG.info("Valid proxy:" + proxy.toString()); 90 | proxies.add(proxy); 91 | } 92 | } else { 93 | LOG.error("data5uExtractor can not extract anything..., please check."); 94 | } 95 | } 96 | } 97 | } 98 | return proxies; 99 | } 100 | 101 | @Override 102 | public List extract(List htmlContentList) { 103 | List proxies = new ArrayList(200); 104 | if (CollectionUtils.isNotEmpty(htmlContentList)) { 105 | for (String htmlContent : htmlContentList) { 106 | 
proxies.addAll(extract(htmlContent)); 107 | } 108 | } 109 | return proxies; 110 | } 111 | 112 | /** 113 | * 代理匿名类型清洗 114 | * 115 | * @param element 116 | * @return 117 | */ 118 | private String getAnonymousType(Element element) { 119 | String text = element.text(); 120 | if (StringUtils.isNoneBlank(text)) { 121 | switch (text) { 122 | case "高匿": 123 | return ProxyAnonymousType.elite.getAnonymousType(); 124 | case "匿名": 125 | return ProxyAnonymousType.anonymous.getAnonymousType(); 126 | case "透明": 127 | return ProxyAnonymousType.transparent.getAnonymousType(); 128 | default: 129 | LOG.error("Can not verify the anonymousType of proxy from data5u>>>:" + text); 130 | } 131 | } 132 | return text; 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/extract/impl/FreeProxyListExtractor.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.extract.impl; 2 | 3 | import com.meow.proxy.check.ProxyCheck; 4 | import com.meow.proxy.entity.Proxy; 5 | import com.meow.proxy.enums.ProxyAnonymousType; 6 | import com.meow.proxy.enums.ProxyProtocolType; 7 | import com.meow.proxy.enums.ProxySite; 8 | import com.meow.proxy.extract.Extractor; 9 | import org.apache.commons.collections.CollectionUtils; 10 | import org.apache.commons.lang3.StringUtils; 11 | import org.apache.http.HttpHost; 12 | import org.jsoup.Jsoup; 13 | import org.jsoup.nodes.Document; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | import org.springframework.stereotype.Component; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | /** 24 | * @author Alex 25 | * date:2017/12/27 26 | * email:jwnie@foxmail.com 27 | */ 28 | @Component("freeProxyListExtractor") 29 | public class FreeProxyListExtractor implements Extractor { 30 | private final static Logger LOG = 
LoggerFactory.getLogger(FreeProxyListExtractor.class); 31 | 32 | @Override 33 | public List extract(String htmlContent) { 34 | List proxies = new ArrayList(100); 35 | Document document = Jsoup.parse(htmlContent); 36 | ProxyCheck proxyCheck = ProxyCheck.getInstance(); 37 | if (document != null) { 38 | Elements elements = document.select("tbody tr"); 39 | if (CollectionUtils.isNotEmpty(elements)) { 40 | for (Element element : elements) { 41 | long beginTime = System.currentTimeMillis(); 42 | Element ipEle = element.select("td").first(); 43 | if (ipEle != null) { 44 | Element portELe = ipEle.nextElementSibling(); 45 | String ip = ipEle.text(); 46 | int port = Integer.parseInt(portELe.text()); 47 | boolean valid = proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true); 48 | if (valid) { 49 | long end = System.currentTimeMillis(); 50 | Element countryEle = portELe.nextElementSibling().nextElementSibling(); 51 | Element versionEle = countryEle.nextElementSibling(); 52 | Element anonymousEle = countryEle.nextElementSibling(); 53 | String country = countryEle.text(); 54 | if (country.contains("Hong Kong") || country.contains("Taiwan") || country.contains("Macao")) { 55 | country += ", CN"; 56 | } 57 | 58 | Proxy proxy = new Proxy(); 59 | proxy.setCountry(country); 60 | proxy.setIp(ip); 61 | proxy.setPort(port); 62 | 63 | if (versionEle.text().contains("Socks")) { 64 | anonymousEle = versionEle.nextElementSibling(); 65 | proxy.setProtocolType(versionEle.text()); 66 | } else { 67 | Element protocolEle = anonymousEle.nextElementSibling().nextElementSibling(); 68 | if (protocolEle.text().contains("no")) { 69 | proxy.setProtocolType(ProxyProtocolType.http.getRequestType()); 70 | } else { 71 | proxy.setProtocolType(ProxyProtocolType.https.getRequestType()); 72 | } 73 | } 74 | 75 | proxy.setArea(""); 76 | proxy.setCheckStatus(1); 77 | proxy.setAnonymousType(getAnonymousType(anonymousEle)); 78 | proxy.setSourceSite(ProxySite.freeProxyList.getProxySiteName()); 79 | 
proxy.setCheckTime(beginTime); 80 | proxy.setCrawlTime(beginTime); 81 | proxy.setValidTime(1); 82 | proxy.setLastSurviveTime(-1L); 83 | proxy.setInvalidTime(-1L); 84 | proxy.setValid(true); 85 | proxy.setResponseTime(end - beginTime); 86 | LOG.info("Valid proxy:" + proxy.toString()); 87 | proxies.add(proxy); 88 | } 89 | } else { 90 | LOG.error("freeProxyListExtractor can not extract anything..., please check."); 91 | } 92 | } 93 | } 94 | } 95 | return proxies; 96 | } 97 | 98 | @Override 99 | public List extract(List htmlContentList) { 100 | List proxies = new ArrayList(200); 101 | if (CollectionUtils.isNotEmpty(htmlContentList)) { 102 | for (String htmlContent : htmlContentList) { 103 | proxies.addAll(extract(htmlContent)); 104 | } 105 | } 106 | return proxies; 107 | } 108 | 109 | /** 110 | * 代理匿名类型清洗 111 | * 112 | * @param element 113 | * @return 114 | */ 115 | private String getAnonymousType(Element element) { 116 | String text = element.text(); 117 | if (StringUtils.isNoneBlank(text)) { 118 | switch (text) { 119 | case "elite proxy": 120 | return ProxyAnonymousType.elite.getAnonymousType(); 121 | case "transparent": 122 | return ProxyAnonymousType.transparent.getAnonymousType(); 123 | case "anonymous": 124 | return ProxyAnonymousType.anonymous.getAnonymousType(); 125 | default: 126 | LOG.error("Can not verify the anonymousType of proxy from free-Proxy-List>>>:" + text); 127 | } 128 | } 129 | return text; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/extract/impl/GoubanjiaExtractor.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.extract.impl; 2 | 3 | import com.meow.proxy.check.ProxyCheck; 4 | import com.meow.proxy.check.ProxyIp2Addr; 5 | import com.meow.proxy.entity.IPAddr; 6 | import com.meow.proxy.entity.Proxy; 7 | import com.meow.proxy.enums.ProxyAnonymousType; 8 | import com.meow.proxy.enums.ProxySite; 9 | 
import com.meow.proxy.extract.Extractor; 10 | import org.apache.commons.collections.CollectionUtils; 11 | import org.apache.commons.lang3.StringUtils; 12 | import org.apache.http.HttpHost; 13 | import org.jsoup.Jsoup; 14 | import org.jsoup.nodes.Document; 15 | import org.jsoup.nodes.Element; 16 | import org.jsoup.select.Elements; 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | import org.springframework.stereotype.Component; 20 | 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | 24 | /** 25 | * @author Alex 26 | * date:2017/12/15 27 | * email:jwnie@foxmail.com 28 | */ 29 | @Component(value = "goubanjiaExtractor") 30 | public class GoubanjiaExtractor implements Extractor { 31 | private final static Logger LOG = LoggerFactory.getLogger(GoubanjiaExtractor.class); 32 | 33 | @Override 34 | public List extract(String htmlContent) { 35 | List proxies = new ArrayList(100); 36 | Document document = Jsoup.parse(htmlContent); 37 | ProxyCheck proxyCheck = ProxyCheck.getInstance(); 38 | if (document != null) { 39 | Elements elements = document.select("td.ip").parents(); 40 | if (CollectionUtils.isNotEmpty(elements)) { 41 | for (Element element : elements) { 42 | long beginTime = System.currentTimeMillis(); 43 | Element hostEle = element.getElementsByClass("ip").first(); 44 | hostEle.getElementsByTag("p").remove(); 45 | if (hostEle != null) { 46 | String host[] = hostEle.text().replaceAll("\\s+", "").split(":"); 47 | String ip = host[0]; 48 | int port = Integer.parseInt(host[1]); 49 | boolean valid = proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true); 50 | if (valid) { 51 | long end = System.currentTimeMillis(); 52 | Element anonymousEle = hostEle.nextElementSibling(); 53 | Element protocolEle = anonymousEle.nextElementSibling(); 54 | Element areaEle = protocolEle.nextElementSibling(); 55 | String area = areaEle.text(); 56 | Element coutryEle = areaEle.select("a[href]").first(); 57 | Element provinceEle = 
coutryEle.nextElementSibling(); 58 | Element cityEle = provinceEle.nextElementSibling(); 59 | String country = coutryEle.text(); 60 | String province = provinceEle.text(); 61 | String city = cityEle.text(); 62 | 63 | if (StringUtils.isEmpty(country)) { 64 | IPAddr ipAddr = ProxyIp2Addr.getInstance().getIPAddrBYTaobaoAPI(ip); 65 | country = ipAddr.getCountry(); 66 | if (StringUtils.isEmpty(province)) { 67 | province = ipAddr.getProvince(); 68 | } 69 | if (StringUtils.isEmpty(city)) { 70 | city = ipAddr.getCity(); 71 | } 72 | } 73 | 74 | 75 | Proxy proxy = new Proxy(); 76 | proxy.setCountry(country); 77 | proxy.setIp(ip); 78 | proxy.setPort(port); 79 | proxy.setArea(areaEle.text()); 80 | proxy.setCheckStatus(1); 81 | proxy.setAnonymousType(getAnonymousType(anonymousEle)); 82 | proxy.setProtocolType(protocolEle.text()); 83 | proxy.setSourceSite(ProxySite.goubanjia.getProxySiteName()); 84 | proxy.setCheckTime(beginTime); 85 | proxy.setCrawlTime(beginTime); 86 | proxy.setValidTime(1); 87 | //默认值 88 | proxy.setLastSurviveTime(-1L); 89 | //默认值 90 | proxy.setInvalidTime(-1L); 91 | proxy.setValid(true); 92 | proxy.setResponseTime(end - beginTime); 93 | LOG.info("Valid proxy:" + proxy.toString()); 94 | proxies.add(proxy); 95 | } 96 | } else { 97 | LOG.error("XicidailiExtractor can not extract anything..., please check."); 98 | } 99 | } 100 | } 101 | } 102 | return proxies; 103 | } 104 | 105 | @Override 106 | public List extract(List htmlContentList) { 107 | List proxies = new ArrayList(200); 108 | if (CollectionUtils.isNotEmpty(htmlContentList)) { 109 | for (String htmlContent : htmlContentList) { 110 | proxies.addAll(extract(htmlContent)); 111 | } 112 | } 113 | return proxies; 114 | } 115 | 116 | /** 117 | * 代理匿名类型清洗 118 | * 119 | * @param element 120 | * @return 121 | */ 122 | private String getAnonymousType(Element element) { 123 | String text = element.text(); 124 | if (StringUtils.isNoneBlank(text)) { 125 | switch (text) { 126 | case "高匿": 127 | return 
ProxyAnonymousType.elite.getAnonymousType(); 128 | case "匿名": 129 | return ProxyAnonymousType.anonymous.getAnonymousType(); 130 | case "透明": 131 | return ProxyAnonymousType.transparent.getAnonymousType(); 132 | default: 133 | LOG.error("Can not verify the anonymousType of proxy from goubanjia>>>:" + text); 134 | } 135 | } 136 | return text; 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/extract/impl/Ip3366Extractor.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.extract.impl; 2 | 3 | import com.meow.proxy.check.ProxyCheck; 4 | import com.meow.proxy.check.ProxyIp2Addr; 5 | import com.meow.proxy.entity.IPAddr; 6 | import com.meow.proxy.entity.Proxy; 7 | import com.meow.proxy.enums.CountryType; 8 | import com.meow.proxy.enums.ProxyAnonymousType; 9 | import com.meow.proxy.enums.ProxySite; 10 | import com.meow.proxy.extract.Extractor; 11 | import org.apache.commons.collections.CollectionUtils; 12 | import org.apache.commons.lang3.StringUtils; 13 | import org.apache.http.HttpHost; 14 | import org.jsoup.Jsoup; 15 | import org.jsoup.nodes.Document; 16 | import org.jsoup.nodes.Element; 17 | import org.jsoup.select.Elements; 18 | import org.slf4j.Logger; 19 | import org.slf4j.LoggerFactory; 20 | import org.springframework.stereotype.Component; 21 | 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | 25 | /** 26 | * @author Alex 27 | * date:2017/12/27 28 | * email:jwnie@foxmail.com 29 | */ 30 | @Component("ip3366Extractor") 31 | public class Ip3366Extractor implements Extractor { 32 | private final static Logger LOG = LoggerFactory.getLogger(Ip3366Extractor.class); 33 | 34 | @Override 35 | public List extract(String htmlContent) { 36 | List proxies = new ArrayList(100); 37 | Document document = Jsoup.parse(htmlContent); 38 | ProxyCheck proxyCheck = ProxyCheck.getInstance(); 39 | if (document != null) { 40 | 
Elements elements = document.select("div#list tbody tr"); 41 | if (CollectionUtils.isNotEmpty(elements)) { 42 | for (Element element : elements) { 43 | long beginTime = System.currentTimeMillis(); 44 | Element ipEle = element.select("td").first(); 45 | if (ipEle != null) { 46 | Element portELe = ipEle.nextElementSibling(); 47 | String ip = ipEle.text(); 48 | int port = Integer.parseInt(portELe.text()); 49 | boolean valid = proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true); 50 | if (valid) { 51 | long end = System.currentTimeMillis(); 52 | Element anonymousEle = portELe.nextElementSibling(); 53 | Element protocolEle = anonymousEle.nextElementSibling(); 54 | Element areaEle = protocolEle.nextElementSibling(); 55 | 56 | IPAddr ipAddr = ProxyIp2Addr.getInstance().getIPAddrBYTaobaoAPI(ip); 57 | String country = ipAddr.getCountry(); 58 | if (StringUtils.isEmpty(country)) { 59 | country = areaEle.text().replaceAll(".*_", ""); 60 | if (country.contains("省") || country.contains("市")) { 61 | country = CountryType.china.getCountryName(); 62 | } 63 | } 64 | 65 | Proxy proxy = new Proxy(); 66 | proxy.setCountry(country); 67 | proxy.setIp(ip); 68 | proxy.setPort(port); 69 | proxy.setArea(areaEle.text()); 70 | proxy.setCheckStatus(1); 71 | proxy.setAnonymousType(getAnonymousType(anonymousEle)); 72 | proxy.setProtocolType(protocolEle.text()); 73 | proxy.setSourceSite(ProxySite.ip3366.getProxySiteName()); 74 | proxy.setCheckTime(beginTime); 75 | proxy.setCrawlTime(beginTime); 76 | proxy.setValidTime(1); 77 | proxy.setLastSurviveTime(-1L); 78 | proxy.setInvalidTime(-1L); 79 | proxy.setValid(true); 80 | proxy.setResponseTime(end - beginTime); 81 | LOG.info("Valid proxy:" + proxy.toString()); 82 | proxies.add(proxy); 83 | } 84 | } else { 85 | LOG.error("Ip3366Extractor can not extract anything..., please check."); 86 | } 87 | } 88 | } 89 | } 90 | return proxies; 91 | } 92 | 93 | @Override 94 | public List extract(List htmlContentList) { 95 | List proxies = new 
ArrayList(200); 96 | if (CollectionUtils.isNotEmpty(htmlContentList)) { 97 | for (String htmlContent : htmlContentList) { 98 | proxies.addAll(extract(htmlContent)); 99 | } 100 | } 101 | return proxies; 102 | } 103 | 104 | /** 105 | * 代理匿名类型清洗 106 | * 107 | * @param element 108 | * @return 109 | */ 110 | private String getAnonymousType(Element element) { 111 | String text = element.text(); 112 | if (StringUtils.isNoneBlank(text)) { 113 | switch (text) { 114 | case "高匿代理IP": 115 | return ProxyAnonymousType.elite.getAnonymousType(); 116 | case "透明代理IP": 117 | return ProxyAnonymousType.transparent.getAnonymousType(); 118 | case "普通代理IP": 119 | return ProxyAnonymousType.anonymous.getAnonymousType(); 120 | default: 121 | LOG.error("Can not verify the anonymousType of proxy from ip3366>>>:" + text); 122 | } 123 | } 124 | return text; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/extract/impl/KxdailiExtractor.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.extract.impl; 2 | 3 | import com.meow.proxy.check.ProxyCheck; 4 | import com.meow.proxy.entity.Proxy; 5 | import com.meow.proxy.enums.CountryType; 6 | import com.meow.proxy.enums.ProxyAnonymousType; 7 | import com.meow.proxy.enums.ProxyProtocolType; 8 | import com.meow.proxy.enums.ProxySite; 9 | import com.meow.proxy.extract.Extractor; 10 | import org.apache.commons.collections.CollectionUtils; 11 | import org.apache.commons.lang3.StringUtils; 12 | import org.apache.http.HttpHost; 13 | import org.jsoup.Jsoup; 14 | import org.jsoup.nodes.Document; 15 | import org.jsoup.nodes.Element; 16 | import org.jsoup.select.Elements; 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | import org.springframework.stereotype.Component; 20 | 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | 24 | /** 25 | * @author Alex 26 | * date:2017/12/28 27 | * 
email:jwnie@foxmail.com
 * <p>
 * Extracts proxies from kxdaili listing pages ({@code tbody tr} rows). Each row is probed
 * with a socket check and only reachable proxies are returned; the country is always
 * recorded as China.
 */
@Component("kxdailiExtractor")
public class KxdailiExtractor implements Extractor {
    private final static Logger LOG = LoggerFactory.getLogger(KxdailiExtractor.class);

    /**
     * Parses one listing page and returns the proxies that pass the socket check.
     *
     * @param htmlContent raw HTML of a listing page; blank input yields an empty list
     * @return the reachable proxies found on the page, never {@code null}
     */
    @Override
    public List<Proxy> extract(String htmlContent) {
        List<Proxy> proxies = new ArrayList<>(100);
        if (StringUtils.isBlank(htmlContent)) {
            return proxies;
        }
        Document document = Jsoup.parse(htmlContent);
        ProxyCheck proxyCheck = ProxyCheck.getInstance();
        for (Element element : document.select("tbody tr")) {
            long beginTime = System.currentTimeMillis();
            Element ipEle = element.select("td").first();
            if (ipEle == null) {
                LOG.error("kxdailiExtractor can not extract anything..., please check.");
                continue;
            }
            Element portELe = ipEle.nextElementSibling();
            String ip = ipEle.text();
            int port = parsePort(textOf(portELe));
            if (port < 0 || StringUtils.isBlank(ip)) {
                // One bad row must not abort the rest of the page.
                LOG.warn("kxdailiExtractor skips malformed row: {}", element.text());
                continue;
            }
            if (!proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true)) {
                continue;
            }
            long end = System.currentTimeMillis();
            Element anonymousEle = next(portELe);
            Element protocolEle = next(anonymousEle);
            // The area cell is two columns after the protocol cell.
            Element areaEle = next(next(protocolEle));

            String protocol = textOf(protocolEle);
            if (protocol.contains("HTTP,HTTPS")) {
                protocol = ProxyProtocolType.https.getRequestType();
            } else if ("HTTP".equals(protocol)) {
                protocol = ProxyProtocolType.http.getRequestType();
            }

            Proxy proxy = new Proxy();
            proxy.setCountry(CountryType.china.getCountryName());
            proxy.setIp(ip);
            proxy.setPort(port);
            proxy.setArea(textOf(areaEle));
            proxy.setCheckStatus(1);
            proxy.setAnonymousType(getAnonymousType(anonymousEle));
            proxy.setProtocolType(protocol);
            proxy.setSourceSite(ProxySite.kxdaili.getProxySiteName());
            proxy.setCheckTime(beginTime);
            proxy.setCrawlTime(beginTime);
            proxy.setValidTime(1);
            proxy.setLastSurviveTime(-1L);
            proxy.setInvalidTime(-1L);
            proxy.setValid(true);
            proxy.setResponseTime(end - beginTime);
            LOG.info("Valid proxy:{}", proxy);
            proxies.add(proxy);
        }
        return proxies;
    }

    /**
     * Extracts proxies from several pages and concatenates the results.
     *
     * @param htmlContentList raw HTML pages; {@code null} or empty yields an empty list
     * @return all reachable proxies found, never {@code null}
     */
    @Override
    public List<Proxy> extract(List<String> htmlContentList) {
        List<Proxy> proxies = new ArrayList<>(200);
        if (CollectionUtils.isNotEmpty(htmlContentList)) {
            for (String htmlContent : htmlContentList) {
                proxies.addAll(extract(htmlContent));
            }
        }
        return proxies;
    }

    /**
     * Normalizes the anonymity label shown on the page.
     *
     * @param element cell holding the label; may be {@code null}
     * @return the matching {@link ProxyAnonymousType} value, or the raw text when it is not recognized
     */
    private String getAnonymousType(Element element) {
        String text = textOf(element);
        if (StringUtils.isNotBlank(text)) {
            switch (text) {
                case "高匿":
                    return ProxyAnonymousType.elite.getAnonymousType();
                case "透明":
                    return ProxyAnonymousType.transparent.getAnonymousType();
                case "普匿":
                    return ProxyAnonymousType.anonymous.getAnonymousType();
                default:
                    LOG.error("Can not verify the anonymousType of proxy from kxdaili>>>:" + text);
            }
        }
        return text;
    }

    /** Returns the next sibling element, or {@code null} when {@code element} is {@code null}. */
    private static Element next(Element element) {
        return element == null ? null : element.nextElementSibling();
    }

    /** Returns the element text, or an empty string when {@code element} is {@code null}. */
    private static String textOf(Element element) {
        return element == null ? "" : element.text();
    }

    /** Parses a port number; returns -1 when the text is missing or not a number. */
    private static int parsePort(String text) {
        if (text == null) {
            return -1;
        }
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException ignored) {
            // The caller logs and skips the row.
            return -1;
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/extract/impl/NianshaoExtractor.java:
--------------------------------------------------------------------------------
package com.meow.proxy.extract.impl;

import com.meow.proxy.check.ProxyCheck;
import com.meow.proxy.entity.Proxy;
import com.meow.proxy.enums.CountryType;
import com.meow.proxy.enums.ProxyAnonymousType;
import com.meow.proxy.enums.ProxySite;
import com.meow.proxy.extract.Extractor;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | import org.springframework.stereotype.Component; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | /** 24 | * @author Alex 25 | * date:2017/12/27 26 | * email:jwnie@foxmail.com 27 | */ 28 | @Component("nianshaoExtractor") 29 | public class NianshaoExtractor implements Extractor { 30 | private final static Logger LOG = LoggerFactory.getLogger(NianshaoExtractor.class); 31 | 32 | @Override 33 | public List extract(String htmlContent) { 34 | List proxies = new ArrayList(100); 35 | Document document = Jsoup.parse(htmlContent); 36 | ProxyCheck proxyCheck = ProxyCheck.getInstance(); 37 | if (document != null) { 38 | Elements elements = document.select("table.table tbody tr"); 39 | if (CollectionUtils.isNotEmpty(elements)) { 40 | for (Element element : elements) { 41 | long beginTime = System.currentTimeMillis(); 42 | Element ipEle = element.select("td").first(); 43 | if (ipEle != null) { 44 | Element portELe = ipEle.nextElementSibling(); 45 | String ip = ipEle.text(); 46 | int port = Integer.parseInt(portELe.text()); 47 | boolean valid = proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true); 48 | if (valid) { 49 | long end = System.currentTimeMillis(); 50 | Element areaEle = portELe.nextElementSibling(); 51 | Element anonymousEle = areaEle.nextElementSibling(); 52 | Element protocolEle = anonymousEle.nextElementSibling(); 53 | 54 | String area = areaEle.text(); 55 | 56 | // IPAddr ipAddr = ProxyIp2Addr.getInstance().getIPAddrBYTaobaoAPI(ip); 57 | String country = ""; 58 | if (area.contains("香港") || area.contains("澳门") || area.contains("台湾")) { 59 | country = "中国 " + country; 60 | } 61 | 62 | if (area.contains("省") || area.contains("市") || area.contains("中国")) { 63 | country = CountryType.china.getCountryName(); 64 | } else { 65 | country = area; 66 | } 67 | 68 | 69 | Proxy proxy = new Proxy(); 
70 | proxy.setCountry(country); 71 | proxy.setIp(ip); 72 | proxy.setPort(port); 73 | proxy.setArea(area); 74 | proxy.setCheckStatus(1); 75 | proxy.setAnonymousType(getAnonymousType(anonymousEle)); 76 | proxy.setProtocolType(protocolEle.text()); 77 | proxy.setSourceSite(ProxySite.nianshao.getProxySiteName()); 78 | proxy.setCheckTime(beginTime); 79 | proxy.setCrawlTime(beginTime); 80 | proxy.setValidTime(1); 81 | proxy.setLastSurviveTime(-1L); 82 | proxy.setInvalidTime(-1L); 83 | proxy.setValid(true); 84 | proxy.setResponseTime(end - beginTime); 85 | LOG.info("Valid proxy:" + proxy.toString()); 86 | proxies.add(proxy); 87 | } 88 | } else { 89 | LOG.error("nianshaoExtractor can not extract anything..., please check."); 90 | } 91 | } 92 | } 93 | } 94 | return proxies; 95 | } 96 | 97 | @Override 98 | public List extract(List htmlContentList) { 99 | List proxies = new ArrayList(200); 100 | if (CollectionUtils.isNotEmpty(htmlContentList)) { 101 | for (String htmlContent : htmlContentList) { 102 | proxies.addAll(extract(htmlContent)); 103 | } 104 | } 105 | return proxies; 106 | } 107 | 108 | /** 109 | * 代理匿名类型清洗 110 | * 111 | * @param element 112 | * @return 113 | */ 114 | private String getAnonymousType(Element element) { 115 | String text = element.text(); 116 | if (StringUtils.isNoneBlank(text)) { 117 | switch (text) { 118 | case "高匿": 119 | return ProxyAnonymousType.elite.getAnonymousType(); 120 | case "透明": 121 | return ProxyAnonymousType.transparent.getAnonymousType(); 122 | case "普通": 123 | return ProxyAnonymousType.anonymous.getAnonymousType(); 124 | case "混淆": 125 | return ProxyAnonymousType.distorting.getAnonymousType(); 126 | default: 127 | LOG.error("Can not verify the anonymousType of proxy from nianshao>>>:" + text); 128 | } 129 | } 130 | return text; 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/extract/impl/ProxydbExtractor.java: 
--------------------------------------------------------------------------------
package com.meow.proxy.extract.impl;

import com.alibaba.fastjson.JSONObject;
import com.meow.proxy.check.ProxyCheck;
import com.meow.proxy.entity.Proxy;
import com.meow.proxy.enums.CountryType;
import com.meow.proxy.enums.ProxyAnonymousType;
import com.meow.proxy.enums.ProxySite;
import com.meow.proxy.extract.Extractor;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Extracts proxies from proxydb listing pages ({@code tbody tr} rows).
 * <p>
 * Country codes shown in the table are translated through a code-to-name map built from
 * the page's country {@code <option>} list. The map is kept on the instance and reused
 * across pages.
 * NOTE(review): the map is a plain {@link HashMap}; confirm this bean is never used from
 * several threads at once.
 *
 * @author Alex
 * date:2017/12/28
 * email:jwnie@foxmail.com
 */
@Component("proxydbExtractor")
public class ProxydbExtractor implements Extractor {
    private final static Logger LOG = LoggerFactory.getLogger(ProxydbExtractor.class);
    /** Country code (option value) to display name, filled by {@link #reflectCountry}. */
    private final Map<String, String> countryMap = new HashMap<>(100);

    /**
     * Parses one listing page and returns the proxies that pass the socket check.
     *
     * @param htmlContent raw HTML of a listing page; blank input yields an empty list
     * @return the reachable proxies found on the page, never {@code null}
     */
    @Override
    public List<Proxy> extract(String htmlContent) {
        List<Proxy> proxies = new ArrayList<>(100);
        if (StringUtils.isBlank(htmlContent)) {
            return proxies;
        }
        Document document = Jsoup.parse(htmlContent);
        ProxyCheck proxyCheck = ProxyCheck.getInstance();
        Elements elements = document.select("tbody tr");
        // Tracks whether this document's option list has already been read, so an
        // unknown code does not trigger a full re-scan for every remaining row.
        boolean countriesLoaded = false;
        if (countryMap.isEmpty()) {
            reflectCountry(document, countryMap);
            countriesLoaded = true;
        }
        for (Element element : elements) {
            long beginTime = System.currentTimeMillis();
            Element hostEle = element.select("td").first();
            if (hostEle == null) {
                LOG.error("proxydbExtractor can not extract anything..., please check.");
                continue;
            }
            String[] host = hostEle.text().split(":");
            int port = host.length >= 2 ? parsePort(host[1]) : -1;
            if (port < 0 || StringUtils.isBlank(host[0])) {
                // One bad row must not abort the rest of the page.
                LOG.warn("proxydbExtractor skips malformed host cell: {}", hostEle.text());
                continue;
            }
            String ip = host[0];
            if (!proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true)) {
                continue;
            }
            long end = System.currentTimeMillis();
            Element protocolEle = hostEle.nextElementSibling();
            Element coutryEle = next(protocolEle);
            Element anonymousEle = next(coutryEle);

            String countryCode = textOf(coutryEle);
            String country = countryMap.get(countryCode);
            if (country == null && !countriesLoaded) {
                // The page structure may have changed: refresh the map once from this document.
                reflectCountry(document, countryMap);
                countriesLoaded = true;
                country = countryMap.get(countryCode);
            }
            if (country == null) {
                country = countryCode;
            }
            if ("CN".equals(country)) {
                country = CountryType.china.getCountryName();
            }

            Proxy proxy = new Proxy();
            proxy.setCountry(country);
            proxy.setIp(ip);
            proxy.setPort(port);
            proxy.setCheckStatus(1);
            proxy.setAnonymousType(getAnonymousType(anonymousEle));
            proxy.setProtocolType(textOf(protocolEle));
            proxy.setSourceSite(ProxySite.proxydb.getProxySiteName());
            proxy.setCheckTime(beginTime);
            proxy.setCrawlTime(beginTime);
            proxy.setValidTime(1);
            proxy.setLastSurviveTime(-1L);
            proxy.setInvalidTime(-1L);
            proxy.setValid(true);
            proxy.setResponseTime(end - beginTime);
            LOG.info("Valid proxy:{}", proxy);
            proxies.add(proxy);
        }
        return proxies;
    }

    /**
     * Extracts proxies from several pages and concatenates the results.
     *
     * @param htmlContentList raw HTML pages; {@code null} or empty yields an empty list
     * @return all reachable proxies found, never {@code null}
     */
    @Override
    public List<Proxy> extract(List<String> htmlContentList) {
        List<Proxy> proxies = new ArrayList<>(200);
        if (CollectionUtils.isNotEmpty(htmlContentList)) {
            for (String htmlContent : htmlContentList) {
                proxies.addAll(extract(htmlContent));
            }
        }
        return proxies;
    }

    /**
     * Normalizes the anonymity label shown on the page.
     *
     * @param element cell holding the label; may be {@code null}
     * @return the matching {@link ProxyAnonymousType} value, or the raw text when it is not recognized
     */
    private String getAnonymousType(Element element) {
        String text = textOf(element);
        if (StringUtils.isNotBlank(text)) {
            switch (text) {
                case "Elite":
                    return ProxyAnonymousType.elite.getAnonymousType();
                case "Transparent":
                    return ProxyAnonymousType.transparent.getAnonymousType();
                case "Anonymous":
                    return ProxyAnonymousType.anonymous.getAnonymousType();
                case "Distorting":
                    return ProxyAnonymousType.distorting.getAnonymousType();
                default:
                    LOG.error("Can not verify the anonymousType of proxy from proxydb>>>:" + text);
            }
        }
        return text;
    }

    /**
     * Fills {@code countryMap} from the page's country {@code <option>} elements.
     * <p>
     * The key is the option's {@code value}; the name is the option text with the
     * {@code "<key> - "} prefix, any {@code (digits)} group and all whitespace removed.
     *
     * @param document   parsed listing page
     * @param countryMap map to update in place
     */
    private void reflectCountry(Document document, Map<String, String> countryMap) {
        Elements countryEles = document.select("span.select option[value]");
        for (Element element : countryEles) {
            String countryKey = element.attr("value");
            if (StringUtils.isNotBlank(countryKey)) {
                String countryValue = element.text()
                        .replace(countryKey + " - ", "")
                        .replaceAll("\\(\\d+?\\)", "")
                        .replaceAll("\\s+", "");
                countryMap.put(countryKey, countryValue);
            }
        }
    }

    /** Returns the next sibling element, or {@code null} when {@code element} is {@code null}. */
    private static Element next(Element element) {
        return element == null ? null : element.nextElementSibling();
    }

    /** Returns the element text, or an empty string when {@code element} is {@code null}. */
    private static String textOf(Element element) {
        return element == null ? "" : element.text();
    }

    /** Parses a port number; returns -1 when the text is missing or not a number. */
    private static int parsePort(String text) {
        if (text == null) {
            return -1;
        }
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException ignored) {
            // The caller logs and skips the row.
            return -1;
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/extract/impl/XdailiExtractor.java:
--------------------------------------------------------------------------------
package com.meow.proxy.extract.impl;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.meow.proxy.check.ProxyCheck;
import com.meow.proxy.check.ProxyIp2Addr;
import com.meow.proxy.entity.IPAddr;
import com.meow.proxy.entity.Proxy;
import com.meow.proxy.enums.CountryType;
import com.meow.proxy.enums.ProxyAnonymousType;
import com.meow.proxy.enums.ProxySite;
import com.meow.proxy.extract.Extractor;
import
org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * Extracts proxies from the xdaili JSON response ({@code RESULT.rows}).
 * <p>
 * Each row is probed with a socket check and only reachable proxies are returned.
 *
 * @author Alex
 * date:2017/12/15
 * email:jwnie@foxmail.com
 */
@Component("xdailiExtractor")
public class XdailiExtractor implements Extractor {
    private final static Logger LOG = LoggerFactory.getLogger(XdailiExtractor.class);

    /**
     * Parses one JSON response and returns the proxies that pass the socket check.
     *
     * @param htmlContent JSON text of the response
     * @return the reachable proxies found in the response, never {@code null}
     */
    @Override
    public List<Proxy> extract(String htmlContent) {
        List<Proxy> proxies = new ArrayList<>(100);
        JSONObject jsonObject = JSONObject.parseObject(htmlContent);
        if (jsonObject == null) {
            return proxies;
        }
        JSONObject result = jsonObject.getJSONObject("RESULT");
        if (result == null) {
            return proxies;
        }
        JSONArray jsonArray = result.getJSONArray("rows");
        if (CollectionUtils.isEmpty(jsonArray)) {
            return proxies;
        }
        ProxyCheck proxyCheck = ProxyCheck.getInstance();
        for (Object o : jsonArray) {
            JSONObject json = (JSONObject) o;
            long beginTime = System.currentTimeMillis();
            if (json == null) {
                LOG.error("XdailiExtractor can not extract anything..., please check.");
                continue;
            }
            String ip = json.getString("ip");
            int port = parsePort(json.getString("port"));
            if (port < 0 || StringUtils.isBlank(ip)) {
                // One bad row must not abort the rest of the response.
                LOG.warn("XdailiExtractor skips malformed row: {}", json);
                continue;
            }
            if (!proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true)) {
                continue;
            }
            long end = System.currentTimeMillis();
            String area = StringUtils.defaultString(json.getString("position"));
            String country;
            if (area.contains("中国")) {
                country = CountryType.china.getCountryName();
            } else {
                IPAddr ipAddr = ProxyIp2Addr.getInstance().getIPAddrBYTaobaoAPI(ip);
                country = ipAddr == null ? "" : ipAddr.getCountry();
            }
            String anonymous = json.getString("anony");
            String protocol = json.getString("type");
            if (protocol != null && protocol.contains("HTTP/HTTPS")) {
                protocol = "http";
            }

            Proxy proxy = new Proxy();
            // Store the resolved country; it used to be computed and then discarded in
            // favour of an unconditional China value.
            proxy.setCountry(country);
            proxy.setIp(ip);
            proxy.setPort(port);
            proxy.setArea(area);
            proxy.setCheckStatus(1);
            proxy.setAnonymousType(getAnonymousType(anonymous));
            proxy.setProtocolType(protocol);
            proxy.setSourceSite(ProxySite.xdaili.getProxySiteName());
            proxy.setCheckTime(beginTime);
            proxy.setCrawlTime(beginTime);
            proxy.setValidTime(1);
            proxy.setLastSurviveTime(-1L);
            proxy.setInvalidTime(-1L);
            proxy.setValid(true);
            proxy.setResponseTime(end - beginTime);
            LOG.info("Valid proxy:{}", proxy);
            proxies.add(proxy);
        }
        return proxies;
    }

    /**
     * Extracts proxies from several responses and concatenates the results.
     *
     * @param htmlContentList JSON responses; {@code null} or empty yields an empty list
     * @return all reachable proxies found, never {@code null}
     */
    @Override
    public List<Proxy> extract(List<String> htmlContentList) {
        List<Proxy> proxies = new ArrayList<>(20);
        if (CollectionUtils.isNotEmpty(htmlContentList)) {
            for (String htmlContent : htmlContentList) {
                proxies.addAll(extract(htmlContent));
            }
        }
        return proxies;
    }

    /**
     * Normalizes the anonymity label from the response.
     *
     * @param text label text; may be {@code null}
     * @return the matching {@link ProxyAnonymousType} value, or {@code text} unchanged when it is not recognized
     */
    private String getAnonymousType(String text) {
        if (StringUtils.isNotBlank(text)) {
            switch (text) {
                case "高匿":
                    return ProxyAnonymousType.elite.getAnonymousType();
                case "透明":
                    return ProxyAnonymousType.transparent.getAnonymousType();
                default:
                    LOG.error("Can not verify the anonymousType of proxy from Xdaili>>>:" + text);
            }
        }
        return text;
    }

    /** Parses a port number; returns -1 when the text is missing or not a number. */
    private static int parsePort(String text) {
        if (text == null) {
            return -1;
        }
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException ignored) {
            // The caller logs and skips the row.
            return -1;
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/extract/impl/XicidailiExtractor.java:
--------------------------------------------------------------------------------
package com.meow.proxy.extract.impl;

import com.meow.proxy.check.ProxyCheck;
import com.meow.proxy.entity.Proxy;
import com.meow.proxy.enums.CountryType;
import com.meow.proxy.enums.ProxyAnonymousType;
import
com.meow.proxy.enums.ProxySite;
import com.meow.proxy.extract.Extractor;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * Extracts proxies from xicidaili listing pages ({@code tr.odd} rows).
 * <p>
 * Each row is probed with a socket check and only reachable proxies are returned; the
 * country is always recorded as China.
 *
 * @author Alex
 * date:2017/12/15
 * email:jwnie@foxmail.com
 */
@Component("xicidailiExtractor")
public class XicidailiExtractor implements Extractor {
    private final static Logger LOG = LoggerFactory.getLogger(XicidailiExtractor.class);

    /**
     * Parses one listing page and returns the proxies that pass the socket check.
     *
     * @param htmlContent raw HTML of a listing page; blank input yields an empty list
     * @return the reachable proxies found on the page, never {@code null}
     */
    @Override
    public List<Proxy> extract(String htmlContent) {
        List<Proxy> proxies = new ArrayList<>(100);
        if (StringUtils.isBlank(htmlContent)) {
            return proxies;
        }
        Document document = Jsoup.parse(htmlContent);
        ProxyCheck proxyCheck = ProxyCheck.getInstance();
        for (Element element : document.select("tr.odd")) {
            long beginTime = System.currentTimeMillis();
            // The IP cell follows the "country" cell; a row without that cell used to
            // fail with a NullPointerException before the null check was reached.
            Element ipEle = next(element.getElementsByClass("country").first());
            if (ipEle == null) {
                LOG.error("XicidailiExtractor can not extract anything..., please check.");
                continue;
            }
            Element portELe = ipEle.nextElementSibling();
            String ip = ipEle.text();
            int port = parsePort(textOf(portELe));
            if (port < 0 || StringUtils.isBlank(ip)) {
                // One bad row must not abort the rest of the page.
                LOG.warn("XicidailiExtractor skips malformed row: {}", element.text());
                continue;
            }
            if (!proxyCheck.checkProxyBySocket(new HttpHost(ip, port), true)) {
                continue;
            }
            long end = System.currentTimeMillis();
            Element areaEle = next(portELe);
            Element anonymousEle = next(areaEle);
            Element protocolEle = next(anonymousEle);

            Proxy proxy = new Proxy();
            proxy.setCountry(CountryType.china.getCountryName());
            proxy.setIp(ip);
            proxy.setPort(port);
            proxy.setArea(textOf(areaEle));
            proxy.setCheckStatus(1);
            proxy.setAnonymousType(getAnonymousType(anonymousEle));
            proxy.setProtocolType(textOf(protocolEle));
            proxy.setSourceSite(ProxySite.xicidaili.getProxySiteName());
            proxy.setCheckTime(beginTime);
            proxy.setCrawlTime(beginTime);
            proxy.setValidTime(1);
            proxy.setLastSurviveTime(-1L);
            proxy.setInvalidTime(-1L);
            proxy.setValid(true);
            proxy.setResponseTime(end - beginTime);
            LOG.info("Valid proxy:{}", proxy);
            proxies.add(proxy);
        }
        return proxies;
    }

    /**
     * Extracts proxies from several pages and concatenates the results.
     *
     * @param htmlContentList raw HTML pages; {@code null} or empty yields an empty list
     * @return all reachable proxies found, never {@code null}
     */
    @Override
    public List<Proxy> extract(List<String> htmlContentList) {
        List<Proxy> proxies = new ArrayList<>(200);
        if (CollectionUtils.isNotEmpty(htmlContentList)) {
            for (String htmlContent : htmlContentList) {
                proxies.addAll(extract(htmlContent));
            }
        }
        return proxies;
    }

    /**
     * Normalizes the anonymity label shown on the page.
     *
     * @param element cell holding the label; may be {@code null}
     * @return the matching {@link ProxyAnonymousType} value, or the raw text when it is not recognized
     */
    private String getAnonymousType(Element element) {
        String text = textOf(element);
        if (StringUtils.isNotBlank(text)) {
            switch (text) {
                case "高匿":
                    return ProxyAnonymousType.elite.getAnonymousType();
                case "透明":
                    return ProxyAnonymousType.transparent.getAnonymousType();
                default:
                    LOG.error("Can not verify the anonymousType of proxy from XiciDaili>>>:" + text);
            }
        }
        return text;
    }

    /** Returns the next sibling element, or {@code null} when {@code element} is {@code null}. */
    private static Element next(Element element) {
        return element == null ? null : element.nextElementSibling();
    }

    /** Returns the element text, or an empty string when {@code element} is {@code null}. */
    private static String textOf(Element element) {
        return element == null ? "" : element.text();
    }

    /** Parses a port number; returns -1 when the text is missing or not a number. */
    private static int parsePort(String text) {
        if (text == null) {
            return -1;
        }
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException ignored) {
            // The caller logs and skips the row.
            return -1;
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/jobs/ScheduleJobs.java:
--------------------------------------------------------------------------------
package com.meow.proxy.jobs;

import com.meow.proxy.base.Const;
import com.meow.proxy.check.ProxyRecheckHandler;
import com.meow.proxy.check.ProxyRecheckSender;
import
com.meow.proxy.configure.TaskHolder;
import com.meow.proxy.crawl.ProxyCrawl;
import com.meow.proxy.entity.Proxy;
import com.meow.proxy.entity.Task;
import com.meow.proxy.service.ProxyService;
import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.List;

/**
 * Periodic jobs: crawling new proxies and re-checking the stored ones.
 * <p>
 * The {@code @Scheduled} methods now run in parallel. Created by Jwnie on 2017/12/17.
 */
@Component
public class ScheduleJobs {
    private final static Logger LOG = LoggerFactory.getLogger(ScheduleJobs.class);
    @Autowired
    ProxyCrawl proxyCrawl;
    @Autowired
    ProxyService proxyService;
    @Autowired
    ProxyRecheckSender proxyRecheckSender;

    /**
     * Crawls every task held by {@link TaskHolder} and saves the valid proxies found.
     * Does nothing when no task is configured.
     */
    @Scheduled(fixedRateString = "${com.meow.proxy.jobs.ScheduleJobs.proxyCrawl.period}")
    public void proxyCrawl() {
        List<Task> taskList = TaskHolder.getInstance().getTaskList();
        if (CollectionUtils.isEmpty(taskList)) {
            return;
        }
        LOG.info("Start to crawl valid proxy..");
        List<Proxy> proxies = proxyCrawl.crawl(taskList);
        if (CollectionUtils.isNotEmpty(proxies)) {
            proxyService.saveProxies(proxies);
            // Logged only after an actual save: the size was previously read outside this
            // guard, which failed on a null result and reported success for an empty one.
            LOG.info("Save valid proxies success, proxies size: " + proxies.size());
        } else {
            LOG.info("No valid proxies crawled, nothing to save.");
        }
    }

    /**
     * Sends every stored valid proxy to the re-check sender and logs the elapsed time.
     */
    @Scheduled(fixedRateString = "${com.meow.proxy.jobs.ScheduleJobs.proxyRecheck.period}")
    public void proxyRecheck() {
        long begin = System.currentTimeMillis();
        List<Proxy> proxyList = proxyService.queryValidProxies();
        proxyRecheckSender.sendRecheckProxies(proxyList);
        LOG.info("可用代理检测完成,用时: " + (System.currentTimeMillis() - begin) + " ms");
    }

}


--------------------------------------------------------------------------------
/src/main/java/com/meow/proxy/request/HttpClientUtil.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.request; 2 | 3 | import com.google.common.io.ByteStreams; 4 | import com.meow.proxy.base.Const; 5 | import org.apache.commons.compress.compressors.brotli.BrotliCompressorInputStream; 6 | import org.apache.commons.io.IOUtils; 7 | import org.apache.commons.lang.StringUtils; 8 | import org.apache.http.*; 9 | import org.apache.http.client.CookieStore; 10 | import org.apache.http.client.config.AuthSchemes; 11 | import org.apache.http.client.config.CookieSpecs; 12 | import org.apache.http.client.config.RequestConfig; 13 | import org.apache.http.client.entity.UrlEncodedFormEntity; 14 | import org.apache.http.client.methods.CloseableHttpResponse; 15 | import org.apache.http.client.methods.HttpGet; 16 | import org.apache.http.client.methods.HttpPost; 17 | import org.apache.http.config.RegistryBuilder; 18 | import org.apache.http.conn.socket.ConnectionSocketFactory; 19 | import org.apache.http.conn.socket.PlainConnectionSocketFactory; 20 | import org.apache.http.conn.ssl.NoopHostnameVerifier; 21 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory; 22 | import org.apache.http.impl.client.*; 23 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; 24 | import org.apache.http.impl.cookie.BasicClientCookie; 25 | import org.apache.http.message.BasicNameValuePair; 26 | import org.apache.http.protocol.HttpContext; 27 | import org.apache.http.util.EntityUtils; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | 31 | import javax.net.ssl.SSLContext; 32 | import javax.net.ssl.TrustManager; 33 | import javax.net.ssl.X509TrustManager; 34 | import java.io.IOException; 35 | import java.io.InputStream; 36 | import java.io.UnsupportedEncodingException; 37 | import java.net.URI; 38 | import java.security.KeyManagementException; 39 | import java.security.NoSuchAlgorithmException; 40 
| import java.security.cert.CertificateException; 41 | import java.security.cert.X509Certificate; 42 | import java.util.ArrayList; 43 | import java.util.Arrays; 44 | import java.util.List; 45 | import java.util.Map; 46 | import java.util.regex.Matcher; 47 | import java.util.regex.Pattern; 48 | import java.util.zip.GZIPInputStream; 49 | 50 | /** 51 | * @author Alex 52 | * date:2017/12/13 53 | * email:jwnie@foxmail.com 54 | */ 55 | public class HttpClientUtil { 56 | private final static Logger LOG = LoggerFactory.getLogger(HttpClientUtil.class); 57 | /** 58 | * 最大连接数 59 | */ 60 | public final static int MAX_TOTAL_CONNECTIONS = 400; 61 | /** 62 | * 每个路由最大连接数 63 | */ 64 | public final static int MAX_ROUTE_CONNECTIONS = 40; 65 | /** 66 | * 连接超时时间 67 | */ 68 | public final static int CONNECT_TIMEOUT = 30000; 69 | /** 70 | * 连接池 71 | */ 72 | private static PoolingHttpClientConnectionManager clientConnectionManager = null; 73 | 74 | static { 75 | initPoolingHttpClientConnectionManager(); 76 | } 77 | 78 | public static HttpClientUtil getInstance() { 79 | return HttpUtilSingleton.HTTP_UTIL; 80 | } 81 | 82 | public CloseableHttpClient createHttpClient() { 83 | return createHttpClient(CONNECT_TIMEOUT, null, null); 84 | } 85 | 86 | 87 | public CloseableHttpClient createHttpClient(int timeOut, HttpHost httpHost, BasicClientCookie basicClientCookie) { 88 | RequestConfig.Builder builder = RequestConfig.custom() 89 | .setConnectionRequestTimeout(timeOut) 90 | .setConnectTimeout(timeOut) 91 | .setSocketTimeout(timeOut) 92 | .setCookieSpec(CookieSpecs.STANDARD);//RFC6265第4节定义的更为宽松的概要,用于与不符合标准的现有服务器的互操作性表现得很好。 93 | 94 | 95 | //设置代理 96 | if (httpHost != null && StringUtils.isNotBlank(httpHost.getHostName()) && httpHost.getPort() > 0) { 97 | builder.setProxy(httpHost); 98 | } 99 | 100 | RequestConfig requestConfig = builder.build(); 101 | HttpClientBuilder httpClientBuilder = HttpClients.custom(); 102 | httpClientBuilder.setDefaultRequestConfig(requestConfig).setRetryHandler(new 
RequestRetryHandler()) 103 | .setConnectionManager(clientConnectionManager); 104 | 105 | if (basicClientCookie != null) { 106 | CookieStore cookieStore = new BasicCookieStore(); 107 | cookieStore.addCookie(basicClientCookie); 108 | httpClientBuilder.setDefaultCookieStore(cookieStore); 109 | } 110 | 111 | CloseableHttpClient closeableHttpClient = httpClientBuilder.setRedirectStrategy( 112 | new DefaultRedirectStrategy() { 113 | @Override 114 | public boolean isRedirected(org.apache.http.HttpRequest request, HttpResponse response, HttpContext context) { 115 | boolean isRedirect = false; 116 | try { 117 | isRedirect = super.isRedirected((org.apache.http.HttpRequest) request, response, context); 118 | } catch (ProtocolException e) { 119 | LOG.warn("", e); 120 | } 121 | if (!isRedirect) { 122 | int responseCode = response.getStatusLine().getStatusCode(); 123 | if (responseCode == Const.REDICT_301 || responseCode == Const.REDICT_302) { 124 | return true; 125 | } 126 | } 127 | return isRedirect; 128 | } 129 | 130 | @Override 131 | protected URI createLocationURI(String location) throws ProtocolException { 132 | location = location.replace("|", "%7C"); 133 | return super.createLocationURI(location); 134 | } 135 | } 136 | ).build(); 137 | return closeableHttpClient; 138 | } 139 | 140 | 141 | public Response getResponse(CloseableHttpClient client, String url) { 142 | return getResponse(client, null, url); 143 | } 144 | 145 | /** 146 | * Request 參數為null,則默認為httpGet請求 147 | * 148 | * @param client 149 | * @param request 150 | * @param url 151 | * @return 152 | */ 153 | public Response getResponse(CloseableHttpClient client, Request request, String url) { 154 | if (request != null) { 155 | if (request.getMethod().equals(Const.METHOD_HTTPGET)) { 156 | return httpGetResponse(client, request, url); 157 | } else if (request.getMethod().equals(Const.METHOD_HTTPPOST)) { 158 | return httpPostResponse(client, request, url); 159 | } else { 160 | LOG.warn("暂不支持的http请求类型:" + 
request.getMethod()); 161 | return null; 162 | } 163 | } else { 164 | return httpGetResponse(client, null, url); 165 | } 166 | } 167 | 168 | public Response httpGetResponse(CloseableHttpClient client, String url) { 169 | return httpGetResponse(client, null, url); 170 | } 171 | 172 | public Response httpPostResponse(CloseableHttpClient client, String url) { 173 | return httpPostResponse(client, null, url); 174 | } 175 | 176 | 177 | private Response httpGetResponse(CloseableHttpClient client, Request request, String url) { 178 | CloseableHttpResponse closeableHttpResponse = null; 179 | HttpGet httpGet = new HttpGet(urlEncode(url)); 180 | Response response = null; 181 | //请求头设置 182 | if (request != null) { 183 | Map headers = request.getHeaders(); 184 | if (headers != null && headers.size() > 0) { 185 | for (Map.Entry entry : headers.entrySet()) { 186 | httpGet.setHeader(entry.getKey(), entry.getValue()); 187 | } 188 | } 189 | } 190 | 191 | try { 192 | closeableHttpResponse = client.execute(httpGet); 193 | response = getHttpResponse(request, closeableHttpResponse); 194 | response.setUrl(url); 195 | } catch (Exception e) { 196 | LOG.warn("请求失败,url:" + url, e); 197 | } finally { 198 | //使用连接池无需关闭 199 | //closeResources(closeableHttpResponse, null); 200 | } 201 | return response; 202 | } 203 | 204 | private Response httpPostResponse(CloseableHttpClient client, Request request, String url) { 205 | Response response = null; 206 | HttpPost httpPost = new HttpPost(urlEncode(url)); 207 | if (request != null) { 208 | Map headers = request.getHeaders(); 209 | // 设置头 210 | if (headers != null && headers.size() != 0) { 211 | for (Map.Entry entry : headers.entrySet()) { 212 | httpPost.setHeader(entry.getKey(), entry.getValue()); 213 | } 214 | } 215 | 216 | Map params = request.getParams(); 217 | if (params != null && params.size() > 0) { 218 | List nvps = new ArrayList(); 219 | for (Map.Entry entry : params.entrySet()) { 220 | nvps.add(new BasicNameValuePair(entry.getKey(), 
String.valueOf(entry.getValue()))); 221 | } 222 | try { 223 | httpPost.setEntity(new UrlEncodedFormEntity(nvps, "UTF-8")); 224 | } catch (UnsupportedEncodingException e) { 225 | LOG.warn("设置post请求参数失败:", e); 226 | } 227 | } 228 | } 229 | CloseableHttpResponse closeableHttpResponse = null; 230 | try { 231 | closeableHttpResponse = client.execute(httpPost); 232 | response = getHttpResponse(request, closeableHttpResponse); 233 | response.setUrl(url); 234 | } catch (Exception e) { 235 | LOG.warn("请求失败,url:" + url, e); 236 | } finally { 237 | //使用连接池无需关闭 238 | //closeResources(closeableHttpResponse, null); 239 | } 240 | return response; 241 | } 242 | 243 | 244 | /** 245 | * 设置response 246 | * 247 | * @param request 248 | * @param httpResponse 249 | * @return 250 | * @throws UnsupportedEncodingException 251 | * @throws IOException 252 | */ 253 | private Response getHttpResponse(Request request, CloseableHttpResponse httpResponse) throws UnsupportedEncodingException, IOException { 254 | Response response = new Response(); 255 | String charSet = "utf-8"; 256 | if (request != null) { 257 | charSet = request.getCharSet(); 258 | if (StringUtils.isBlank(charSet)) { 259 | charSet = httpResponse.getEntity().getContentType() == null ? "utf-8" : StringUtils.contains(httpResponse.getEntity().getContentType().getValue(), "charset") ? 
getCharSet(httpResponse.getEntity().getContentType().getValue()) : "utf-8"; 260 | } 261 | } 262 | response.setStatusCode(httpResponse.getStatusLine().getStatusCode()); 263 | // 获取返回数据 264 | HttpEntity entity = httpResponse.getEntity(); 265 | Header header = entity.getContentEncoding(); 266 | InputStream in = entity.getContent(); 267 | try { 268 | if (header != null && Const.SYMBOL_ZIP.equals(header.getValue().toLowerCase())) { 269 | byte[] bytes = ByteStreams.toByteArray(new GZIPInputStream(in)); 270 | String content = new String(bytes, charSet); 271 | response.setContent(content); 272 | } else if (header != null && Const.SYMBOL_BROTLI.equals(header.getValue().toLowerCase())) { 273 | byte[] bytes = ByteStreams.toByteArray(new BrotliCompressorInputStream(in)); 274 | String content = new String(bytes, charSet); 275 | response.setContent(content); 276 | } else { 277 | byte[] bytes = EntityUtils.toByteArray(entity); 278 | String content = new String(bytes, charSet); 279 | response.setContent(content); 280 | } 281 | } catch (Exception e) { 282 | LOG.warn("读取响应内容异常: ", e); 283 | } finally { 284 | //关闭流的作用就是将用完的连接释放,下次请求可以复用,如不使用in.close();而仅仅使用response.close();结果就是连接会被关闭,并且不能被复用,如此失去了采用连接池的意义。 285 | IOUtils.closeQuietly(in); 286 | } 287 | return response; 288 | } 289 | 290 | /** 291 | * 将url进行encode编码,这里不能直接使用URlEncode(url,"utf-8");方法进编码, 292 | * 会报org.apache.http.client.ClientProtocolException,这里只将特殊字符转义 293 | * 如:+、空格、#、{、}、“ 等 294 | * 295 | * @param url 296 | * @return 297 | */ 298 | private String urlEncode(String url) { 299 | url = url.replaceAll("\\+", "%2b") 300 | .replaceAll(" ", "%20") 301 | .replaceAll("\\{", "%7b") 302 | .replaceAll("}", "%7d") 303 | .replaceAll("\"", "%22"); 304 | return url; 305 | } 306 | 307 | /** 308 | * 初始化连接池,支持http/https 309 | */ 310 | private static void initPoolingHttpClientConnectionManager() { 311 | SSLContext sslcontext = null; 312 | try { 313 | //TLS安全协议上下文获取 314 | sslcontext = SSLContext.getInstance("TLS"); 315 | // 
实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 316 | X509TrustManager x509m = new X509TrustManager() { 317 | @Override 318 | public X509Certificate[] getAcceptedIssuers() { 319 | return null; 320 | } 321 | 322 | @Override 323 | public void checkServerTrusted(X509Certificate[] chain, 324 | String authType) throws CertificateException { 325 | } 326 | 327 | @Override 328 | public void checkClientTrusted(X509Certificate[] chain, 329 | String authType) throws CertificateException { 330 | } 331 | }; 332 | sslcontext.init(null, new TrustManager[]{x509m}, new java.security.SecureRandom()); 333 | SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(sslcontext, NoopHostnameVerifier.INSTANCE); 334 | RequestConfig defaultConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT) 335 | .setExpectContinueEnabled(true) 336 | .setTargetPreferredAuthSchemes(Arrays.asList(AuthSchemes.NTLM, AuthSchemes.DIGEST)) 337 | .setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC)).build(); 338 | org.apache.http.config.Registry registry = RegistryBuilder.create() 339 | .register("http", PlainConnectionSocketFactory.INSTANCE) 340 | .register("https", sslConnectionSocketFactory).build(); 341 | 342 | clientConnectionManager = new PoolingHttpClientConnectionManager(registry); 343 | // 设置最大连接数 344 | clientConnectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS); 345 | // 设置每个连接的路由数 346 | clientConnectionManager.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS); 347 | } catch (NoSuchAlgorithmException e) { 348 | LOG.warn("", e); 349 | } catch (KeyManagementException e) { 350 | LOG.warn("", e); 351 | } 352 | } 353 | 354 | private static class HttpUtilSingleton { 355 | private final static HttpClientUtil HTTP_UTIL = new HttpClientUtil(); 356 | } 357 | 358 | /** 359 | * 截取编码方式 360 | * 361 | * @param str 362 | * @return 363 | */ 364 | private String getCharSet(String str) { 365 | String charSet = match(str, Const.CHARSET_PATTERN); 366 | return charSet; 367 | } 
368 | 369 | 370 | /** 371 | * 正则匹配 372 | * 373 | * @param s 374 | * @param pattern 375 | * @return 376 | */ 377 | private String match(String s, String pattern) { 378 | Pattern p = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE + Pattern.UNICODE_CASE); 379 | Matcher matcher = p.matcher(s); 380 | if (matcher.find()) { 381 | return matcher.group(1); 382 | } else { 383 | return ""; 384 | } 385 | } 386 | 387 | public void closeResources(CloseableHttpResponse closeableHttpResponse, CloseableHttpClient closeableHttpClient) { 388 | try { 389 | if (closeableHttpResponse != null) { 390 | closeableHttpResponse.close(); 391 | } 392 | if (closeableHttpClient != null) { 393 | closeableHttpClient.close(); 394 | } 395 | } catch (IOException e) { 396 | LOG.warn("关闭closeableHttpResponse失败:", e); 397 | } 398 | } 399 | } 400 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/request/Request.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.request; 2 | 3 | import org.springframework.stereotype.Component; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | /** 9 | * @author Alex 10 | * date:2017/12/14 11 | * email:jwnie@foxmail.com 12 | */ 13 | @Component 14 | public class Request { 15 | /** 16 | * 请求方法 17 | */ 18 | private String method; 19 | 20 | /** 21 | * 网页编码方式 22 | */ 23 | private String charSet; 24 | /** 25 | * 请求头信息 26 | */ 27 | private Map headers = new HashMap(); 28 | 29 | /** 30 | * post的一些参数 31 | */ 32 | Map params = new HashMap<>(); 33 | 34 | 35 | public String getMethod() { 36 | return method; 37 | } 38 | 39 | public void setMethod(String method) { 40 | this.method = method; 41 | } 42 | 43 | public String getCharSet() { 44 | return charSet; 45 | } 46 | 47 | public void setCharSet(String charSet) { 48 | this.charSet = charSet; 49 | } 50 | 51 | public Map getHeaders() { 52 | return headers; 53 | } 54 | 55 | public void 
setHeaders(Map headers) { 56 | this.headers = headers; 57 | } 58 | 59 | public Request setHeader(String name, String value) { 60 | headers.put(name, value); 61 | return this; 62 | } 63 | 64 | public Map getParams() { 65 | return params; 66 | } 67 | 68 | public void setParams(Map params) { 69 | this.params = params; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/request/RequestRetryHandler.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.request; 2 | 3 | import org.apache.http.HttpEntityEnclosingRequest; 4 | import org.apache.http.HttpRequest; 5 | import org.apache.http.NoHttpResponseException; 6 | import org.apache.http.client.HttpRequestRetryHandler; 7 | import org.apache.http.client.protocol.HttpClientContext; 8 | import org.apache.http.conn.ConnectTimeoutException; 9 | import org.apache.http.protocol.HttpContext; 10 | 11 | import javax.net.ssl.SSLException; 12 | import javax.net.ssl.SSLHandshakeException; 13 | import java.io.IOException; 14 | import java.io.InterruptedIOException; 15 | import java.net.UnknownHostException; 16 | 17 | /** 18 | * 请求重试处理 19 | * 20 | * @author Alex 21 | * date:2017/12/14 22 | * email:jwnie@foxmail.com 23 | */ 24 | public class RequestRetryHandler implements HttpRequestRetryHandler { 25 | 26 | 27 | @Override 28 | public boolean retryRequest(IOException exception, int executionCount, HttpContext httpContext) { 29 | if (executionCount >= 5) {// 如果已经重试了5次,就放弃 30 | return false; 31 | } 32 | if (exception instanceof NoHttpResponseException) {// 如果服务器丢掉了连接,那么就重试 33 | return true; 34 | } 35 | if (exception instanceof SSLHandshakeException) {// 不要重试SSL握手异常 36 | return false; 37 | } 38 | if (exception instanceof InterruptedIOException) {// 超时 39 | return false; 40 | } 41 | if (exception instanceof UnknownHostException) {// 目标服务器不可达 42 | return false; 43 | } 44 | if (exception instanceof 
ConnectTimeoutException) {// 连接被拒绝 45 | return false; 46 | } 47 | if (exception instanceof SSLException) {// SSL握手异常 48 | return false; 49 | } 50 | 51 | HttpClientContext clientContext = HttpClientContext 52 | .adapt(httpContext); 53 | HttpRequest request = clientContext.getRequest(); 54 | if (!(request instanceof HttpEntityEnclosingRequest)) {// 如果请求是幂等的,就再次尝试 55 | return true; 56 | } 57 | return false; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/request/Response.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.request; 2 | 3 | /** 4 | * @author Alex 5 | * date:2017/12/14 6 | * email:jwnie@foxmail.com 7 | */ 8 | public class Response { 9 | private String url; 10 | private int statusCode; 11 | private String content; 12 | 13 | public String getUrl() { 14 | return url; 15 | } 16 | 17 | public void setUrl(String url) { 18 | this.url = url; 19 | } 20 | 21 | public int getStatusCode() { 22 | return statusCode; 23 | } 24 | 25 | public void setStatusCode(int statusCode) { 26 | this.statusCode = statusCode; 27 | } 28 | 29 | public String getContent() { 30 | return content; 31 | } 32 | 33 | public void setContent(String content) { 34 | this.content = content; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/service/ProxyService.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.service; 2 | 3 | import com.meow.proxy.entity.Proxy; 4 | import org.apache.ibatis.annotations.Param; 5 | 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | /** 10 | * @author Alex 11 | * date:2017/12/18 12 | * email:jwnie@foxmail.com 13 | */ 14 | public interface ProxyService { 15 | void saveProxies(List proxyList); 16 | 17 | void updateProxies(List proxyList); 18 | 19 | List queryValidProxies(); 20 
| 21 | /** 22 | * 默认查询前一百条有效的代理 23 | * @return 24 | */ 25 | List queryProxy(String protocolType, String isDemostic,String anonymousType); 26 | 27 | List> proxyStatisticBySite(); 28 | 29 | int queryValidProxyCount(String protocolType, String isDemostic,String anonymousType); 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/meow/proxy/service/impl/ProxyServiceImpl.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.service.impl; 2 | 3 | import com.meow.proxy.dao.ProxyDao; 4 | import com.meow.proxy.entity.Proxy; 5 | import com.meow.proxy.service.ProxyService; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.stereotype.Service; 8 | 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | /** 13 | * @author Alex 14 | * date:2017/12/18 15 | * email:jwnie@foxmail.com 16 | */ 17 | @Service 18 | public class ProxyServiceImpl implements ProxyService { 19 | @Autowired 20 | ProxyDao proxyDao; 21 | 22 | @Override 23 | public void saveProxies(List proxyList) { 24 | proxyDao.saveProxies(proxyList); 25 | } 26 | 27 | @Override 28 | public void updateProxies(List proxyList) { 29 | proxyDao.updateProxies(proxyList); 30 | } 31 | 32 | @Override 33 | public List queryValidProxies() { 34 | return proxyDao.queryValidProxies(); 35 | } 36 | 37 | /** 38 | * 查询前一百条有效的代理 39 | * 40 | * @return 41 | */ 42 | @Override 43 | public List queryProxy(String protocolType, String isDemostic, String anonymousType) { 44 | return proxyDao.queryProxy(protocolType, isDemostic, anonymousType); 45 | } 46 | 47 | public List> proxyStatisticBySite() { 48 | return proxyDao.proxyStatisticBySite(); 49 | } 50 | 51 | public int queryValidProxyCount(String protocolType, String isDemostic, String anonymousType) { 52 | return proxyDao.queryValidProxyCount(protocolType, isDemostic, anonymousType); 53 | } 54 | 55 | } 56 | 
-------------------------------------------------------------------------------- /src/main/resources/application.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jwnie/proxyservice/0ab1009e24ceffbf5588c85cfe7556f70114c72b/src/main/resources/application.properties -------------------------------------------------------------------------------- /src/main/resources/conf/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | ${logfile.dir}/all/${logfile.name}.log 29 | 30 | 31 | ${log.pattern} 32 | UTF-8 33 | 34 | 35 | 36 | 37 | 38 | 39 | ${logfile.dir}/all/${logfile.name}.log_%d{yyyy-MM-dd}_%i.zip 40 | 41 | ${max.save.day} 42 | 43 | 44 | ${max.single.file.size} 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | ${logfile.dir}/error/${logfile.name}-error.log 53 | 54 | 55 | ${log.pattern} 56 | UTF-8 57 | 58 | 59 | 60 | ERROR 61 | 62 | 63 | 64 | 65 | ${logfile.dir}/error/${logfile.name}-error.log_%d{yyyy-MM-dd}_%i.zip 66 | 67 | ${max.save.day} 68 | 69 | 70 | ${max.single.file.size} 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | ${log.pattern} 79 | UTF-8 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /src/main/resources/mapper/ProxyMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | INSERT IGNORE INTO `Cfg_Proxy` 9 | (`ip`,`PORT`,`anonymousType`,`protocolType`,`country`,`AREA`,`valid`,`invalidTime`,`lastSurviveTime`,`checkTime`,`checkStatus`,`score`,`sourceSite`,`validTime`,`crawlTime`,`responseTime`) 10 | VALUES 11 | 12 | 
(#{proxy.ip},#{proxy.port},#{proxy.anonymousType},#{proxy.protocolType},#{proxy.country},#{proxy.area},#{proxy.valid},#{proxy.invalidTime},#{proxy.lastSurviveTime},#{proxy.checkTime},#{proxy.checkStatus},#{proxy.score},#{proxy.sourceSite},#{proxy.validTime},#{proxy.crawlTime},#{proxy.responseTime}) 13 | 14 | 15 | 16 | 17 | 18 | 19 | 22 | 23 | 24 | 42 | 43 | 61 | 62 | 63 | 66 | 67 | 68 | 69 | 70 | UPDATE Cfg_Proxy 71 | 72 | valid = #{item.valid}, 73 | invalidTime = #{item.invalidTime}, 74 | lastSurviveTime = #{item.lastSurviveTime}, 75 | checkTime = #{item.checkTime}, 76 | checkStatus = #{item.checkStatus}, 77 | score = #{item.score}, 78 | validTime = #{item.validTime}, 79 | responseTime = #{item.responseTime} 80 | 81 | where id = #{item.id} 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /src/main/resources/proxyservice.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | WORK_HOME="/home/elise/app/proxyservice" 3 | APP_NAME="com.meow.proxy.Proxyservice" 4 | APP_VERSION="proxyservice-1.0" 5 | PROGRAM="com.meow.proxy.Proxyservice" 6 | AUTHOR="alex" 7 | ABOUT="proxyservice" 8 | MAIN_CLASS="com.meow.proxy.Proxyservice" 9 | JVM_OPTION="-Xms512m -Xmx1024m -XX:+ForceTimeHighResolution" 10 | 11 | 12 | WORK_DIR=`pwd` 13 | init() 14 | { 15 | export java_home=$JAVA_HOME 16 | export PATH=$java_home/bin:$PATH 17 | export LANG=zh_CN 18 | echo "设置环境变量完成..." 19 | } 20 | 21 | start() 22 | { 23 | APP_PIDS=`ps -ef --width 4096|grep $PROGRAM |grep -v grep |awk '{print $2}'` 24 | if [ -n "$APP_PIDS" ] 25 | then 26 | echo "CrawlerApp has been started before!Can not start again." 27 | return 28 | fi 29 | 30 | cd $WORK_HOME 31 | init 32 | 33 | APPPATH=. 
drop table if exists Cfg_Proxy;

-- Proxy store. (ip, port) stays the primary key so that INSERT IGNORE skips proxies that
-- are already known. id is generated by the database: the insert statement does not
-- supply it and the batch update addresses rows by id, so it must be unique per row.
-- MySQL requires an AUTO_INCREMENT column to be indexed, hence the unique key on id.
-- NOTE(review): sourceSite is declared varbinary while every other text column is
-- varchar - looks unintended, left unchanged; confirm.
create table Cfg_Proxy
(
   id                   int not null auto_increment comment '代理自增ID',
   ip                   varchar(20) not null comment '使用ip和port作为主键避免重复插入',
   port                 int not null comment '使用ip和port作为主键避免重复插入',
   anonymousType        varchar(20) not null comment 'transparent、anonymous、distorting、elite',
   protocolType         varchar(20) not null comment 'http、https、socks4、socks5、socks',
   country              varchar(50),
   area                 varchar(50),
   valid                boolean not null,
   invalidTime          bigint comment 'ms级别时间戳',
   lastSurviveTime      bigint comment 'ms级时间',
   checkTime            bigint comment 'ms级别时间戳',
   checkStatus          int not null comment '0:未验证;1:已验证',
   score                float,
   sourceSite           varbinary(50) not null,
   validTime            int,
   crawlTime            bigint not null comment 'ms级别时间戳',
   responseTime         bigint comment 'ms级时间',
   primary key (ip, port),
   unique key uk_cfg_proxy_id (id)
);
System.out.println(proxyCheck.checkProxyBySocket(httpHost,false)); 27 | System.out.println(proxyCheck.checkProxyByRequestBaidu(httpHost,false)); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/java/com/meow/proxy/conigure/Task.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.conigure; 2 | 3 | import com.meow.proxy.download.impl.XicidailiDownLoader; 4 | import org.junit.Test; 5 | 6 | /** 7 | * Created by Jwnie on 2017/12/17. 8 | */ 9 | public class Task { 10 | 11 | @Test 12 | public void test(){ 13 | System.out.println(XicidailiDownLoader.class.getCanonicalName()); 14 | 15 | 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/test/java/com/meow/proxy/request/Request.java: -------------------------------------------------------------------------------- 1 | package com.meow.proxy.request; 2 | 3 | import com.meow.proxy.base.Const; 4 | import org.junit.Test; 5 | 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | /** 10 | * @author Alex 11 | * date:2017/12/15 12 | * email:jwnie@foxmail.com 13 | */ 14 | public class Request { 15 | 16 | @Test 17 | public void testMatch(){ 18 | System.out.println(getCharSet("charset=utf-8\n")); 19 | System.out.println(getCharSet("CHARSET = gb2312")); 20 | System.out.println(getCharSet("text/html;charset=UTF-8")); 21 | 22 | 23 | } 24 | 25 | /** 26 | * 截取编码方式 27 | * @param str 28 | * @return 29 | */ 30 | private String getCharSet(String str){ 31 | String charSet = findCharset(str, Const.CHARSET_PATTERN); 32 | return charSet; 33 | } 34 | 35 | /** 36 | * 正则匹配 37 | * @param s 38 | * @param pattern 39 | * @return 40 | */ 41 | private String findCharset(String s, String pattern) { 42 | Pattern p = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE + Pattern.UNICODE_CASE); 43 | Matcher matcher = p.matcher(s); 44 | if (matcher.find()) { 45 | 
return matcher.group(1); 46 | } else { 47 | return null; 48 | } 49 | } 50 | } 51 | --------------------------------------------------------------------------------