├── .gitignore ├── README.md ├── _config.yml ├── pom.xml ├── spider-consumer ├── README.md ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── jinshuai │ │ ├── Consumer.java │ │ ├── core │ │ ├── downloader │ │ │ ├── Downloader.java │ │ │ └── impl │ │ │ │ └── HttpClientPoolDownloader.java │ │ ├── parser │ │ │ ├── Parser.java │ │ │ └── impl │ │ │ │ └── NewsParser.java │ │ ├── saver │ │ │ ├── Saver.java │ │ │ └── impl │ │ │ │ └── TextSaver.java │ │ └── scheduler │ │ │ ├── Scheduler.java │ │ │ └── impl │ │ │ └── RedisScheduler.java │ │ ├── entity │ │ ├── Page.java │ │ └── UrlSeed.java │ │ └── util │ │ ├── JedisUtils.java │ │ ├── PropertiesUtils.java │ │ ├── hash │ │ ├── MurmurHash.java │ │ └── PageUtils.java │ │ └── http │ │ ├── HttpUtils.java │ │ ├── StatusHandler.java │ │ └── UserAgentArray.java │ └── resources │ ├── application.properties │ └── logback.xml ├── spider-core ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── jinshuai │ │ │ ├── Spider.java │ │ │ ├── core │ │ │ ├── README.md │ │ │ ├── downloader │ │ │ │ ├── Downloader.java │ │ │ │ └── impl │ │ │ │ │ └── HttpClientPoolDownloader.java │ │ │ ├── parser │ │ │ │ ├── Parser.java │ │ │ │ └── impl │ │ │ │ │ ├── BaiKeParser.java │ │ │ │ │ └── NewsParser.java │ │ │ ├── saver │ │ │ │ ├── Saver.java │ │ │ │ └── impl │ │ │ │ │ ├── DataBaseSaver.java │ │ │ │ │ └── TextSaver.java │ │ │ └── scheduler │ │ │ │ ├── Scheduler.java │ │ │ │ └── impl │ │ │ │ ├── PriorityQueueScheduler.java │ │ │ │ └── RedisScheduler.java │ │ │ ├── entity │ │ │ ├── Page.java │ │ │ └── UrlSeed.java │ │ │ └── util │ │ │ ├── ExcelUtils.java │ │ │ ├── JedisUtils.java │ │ │ ├── OfficeUtils.java │ │ │ ├── PropertiesUtils.java │ │ │ ├── hash │ │ │ ├── MurmurHash.java │ │ │ └── PageUtils.java │ │ │ └── http │ │ │ ├── HttpUtils.java │ │ │ ├── StatusHandler.java │ │ │ └── UserAgentArray.java │ └── resources │ │ ├── application.properties │ │ └── logback.xml │ └── test │ └── java │ └── com │ ├── TestGson.java │ ├── TestHttpClient.java 
│ ├── TestJDBC.java │ ├── TestJsoup.java │ ├── TestRedis.java │ └── TestReg.java └── spider-flowchart.svg /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.iml,*.idea 3 | '.idea' 4 | *.iml 5 | logs/ 6 | # Mobile Tools for Java (J2ME) 7 | .mtj.tmp/ 8 | .idea/ 9 | # Package Files # 10 | *.jar 11 | *.war 12 | .idea/* 13 | .idea* 14 | \.idea* 15 | *.ear 16 | target 17 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 18 | hs_err_pid* 19 | *.MF -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spider 2 | 3 | Spider是一个基于Java的简易多线程爬虫框架,并且提供了默认组件。用户也可以根据需要实现自己的组件 4 | - 具体流程 5 | - 首先在调度器中添加初始种子,开启线程池。 6 | - 工作线程开始从种子调度器中取URL种子 7 | - 使用下载器获取URL对应的页面内容 8 | - 使用解析器解析页面内容,将页面里的URL封装成URL种子,添加到种子调度器中。 9 | - 持久器会判断是否已经存在类似的文本内容,如果存在会做丢弃处理,否则会将页面中的内容做持久化处理。 10 | 11 | ![流程图](./spider-flowchart.svg) 12 | 13 | # 使用 14 | 15 | ## 开发环境 16 | - JDK8+ 17 | - Maven3+ 18 | - lombok 19 | 20 | ## 使用 21 | - 修改`application.properties`中存放解析内容的路径`dir` 22 | - 如果使用`Redis`作为种子调度器(默认使用优先队列),需要修改`application.properties`中配置的`redis-ip`、`redis-port`和`redis-password`。如果你的Redis不需要密码验证,就不用修改文件里的`password`属性。 23 | - 如果使用消息队列(框架使用的是[RocketMQ](https://rocketmq.apache.org/)),需要修改`application.properties`中配置的`mq-ip`、`mq-port`。不用的话可以将`mq-switch`置为0,关闭消息队列。 24 | - 默认解析器解析: 如果解析其它类型的网页,需要重写`Parser.java`接口以及提供给种子调度器的初始种子 25 | - 运行`Spider.java` 26 | ```Java 27 | Spider.build() 28 | .addUrlSeed(new UrlSeed("http://xww.hebut.edu.cn/gdyw/index.htm")) 29 | .run(); 30 | ``` 31 | 32 | # 项目结构 33 | 34 | ```Shell 35 | ├── logs // 系统日志 36 | ├── spider-consumer // 消费模块(消费4XX 3XX状态码对应URL) 37 | ├── spider-core // 爬虫模块 38 | │ ├── src // 源码 39 | │ ├── |——main 40 | │ ├── ├──|——java/com/jinshuai 41 | │ ├── ├──├──|——core // 核心组件 42 | │ ├── ├──├──|————downloader // 下载器 43 | │ ├── ├──├──|————parser // 解析器 44 | 
│ ├── ├──├──|————saver // 持久器 45 | │ ├── ├──├──|————scheduler // URL调度器 46 | │ ├── ├──├──|——entity // 实体 47 | │ ├── ├──├──|——util // 工具 48 | │ ├── ├──|——resources // 资源目录 49 | │ ├── ├──|——|——application.properties // 配置文件 50 | 51 | ``` 52 | 53 | # 进度 54 | ## Finished 55 | - [x] 配置了[Http连接池](https://hc.apache.org/httpcomponents-client-ga/),完成了Http请求和处理Http响应
56 | - [x] [解析](https://jsoup.org/)响应的内容 57 | - [x] 配置线程池,通过[Redis](https://redis.io/)缓存URL种子 58 | - [x] 持久化解析结果 59 | - [x] 添加新的种子调度器(优先队列结合布隆过滤器) 60 | - [x] 对于Redis调度器,存放url对应的hash进行判重减少空间使用 61 | - [x] 使用SimHash进行文本相似度检测 62 | - [x] 将3XX 4XX 5XX状态码对应URL放到消息队列中去消费 63 | 64 | ## TODO 65 | - [ ] 定时解析失败日志,将失败URL重新加入爬取仓库,设置失败次数限制,超过指定次数就放弃。 66 | - [ ] 分布式环境下,统一存放解析后的文本 67 | - [ ] 各个组件进行热替换 68 | - [ ] 优化解析页面代码 69 | 70 | # 参考 71 | - **代码和设计思路**参考自[https://github.com/xjtushilei/ScriptSpider](https://github.com/xjtushilei/ScriptSpider) 72 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.jinshuai 8 | spider 9 | pom 10 | 1.0-SNAPSHOT 11 | 12 | 13 | spider-core 14 | spider-consumer 15 | 16 | 17 | spider 18 | 19 | https://github.com/jinshuai86/Spider 20 | 21 | 22 | UTF-8 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /spider-consumer/README.md: -------------------------------------------------------------------------------- 1 | # 消费者 2 | 消费状态码为3XX 4XX 5XX对应的URL -------------------------------------------------------------------------------- /spider-consumer/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | spider 8 | com.jinshuai 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 4.0.0 13 | 14 | com.jinshuai.consumer 15 | spider-consumer 16 | 1.0 17 | jar 18 | 19 | spider-consumer 20 | 21 | https://github.com/jinshuai86/Spider 22 | 23 | 24 | UTF-8 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | org.apache.maven.plugins 33 | maven-compiler-plugin 34 | 35 | 1.8 36 | 1.8 37 | UTF-8 38 | 39 | 40 | 41 | 42 | org.apache.maven.plugins 43 
| maven-shade-plugin 44 | 1.2.1 45 | 46 | 47 | package 48 | 49 | shade 50 | 51 | 52 | 53 | 54 | com.jinshuai.Consumer 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | junit 68 | junit 69 | 4.12 70 | test 71 | 72 | 73 | 74 | org.projectlombok 75 | lombok 76 | 1.18.2 77 | provided 78 | 79 | 80 | 81 | ch.qos.logback 82 | logback-classic 83 | 1.2.3 84 | 85 | 86 | 87 | org.apache.httpcomponents 88 | httpclient 89 | 4.5.5 90 | 91 | 92 | 93 | org.jsoup 94 | jsoup 95 | 1.11.2 96 | 97 | 98 | 99 | redis.clients 100 | jedis 101 | 2.9.0 102 | 103 | 104 | 105 | com.google.code.gson 106 | gson 107 | 2.8.0 108 | 109 | 110 | 111 | com.google.guava 112 | guava 113 | 26.0-jre 114 | 115 | 116 | 117 | joda-time 118 | joda-time 119 | 2.9.9 120 | 121 | 122 | 123 | org.ansj 124 | ansj_seg 125 | 5.1.1 126 | 127 | 128 | 129 | org.apache.rocketmq 130 | rocketmq-client 131 | 4.4.0 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/Consumer.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai; 2 | 3 | import com.jinshuai.core.downloader.Downloader; 4 | import com.jinshuai.core.downloader.impl.HttpClientPoolDownloader; 5 | import com.jinshuai.core.parser.Parser; 6 | import com.jinshuai.core.parser.impl.NewsParser; 7 | import com.jinshuai.core.saver.Saver; 8 | import com.jinshuai.core.saver.impl.TextSaver; 9 | import com.jinshuai.core.scheduler.Scheduler; 10 | import com.jinshuai.core.scheduler.impl.RedisScheduler; 11 | import com.jinshuai.entity.Page; 12 | import com.jinshuai.entity.UrlSeed; 13 | import com.jinshuai.util.PropertiesUtils; 14 | import lombok.extern.slf4j.Slf4j; 15 | import org.apache.rocketmq.client.consumer.DefaultMQPushConsumer; 16 | import org.apache.rocketmq.client.consumer.listener.ConsumeConcurrentlyStatus; 17 | import org.apache.rocketmq.client.consumer.listener.MessageListenerConcurrently; 18 | 
import org.apache.rocketmq.client.exception.MQClientException; 19 | import org.apache.rocketmq.common.message.MessageExt; 20 | import org.apache.rocketmq.remoting.common.RemotingHelper; 21 | 22 | import java.io.UnsupportedEncodingException; 23 | import java.util.concurrent.*; 24 | 25 | 26 | /** 27 | * @author: JS 28 | * @date: 2018/06/27 29 | * @description: 消费端 30 | */ 31 | @Slf4j 32 | public class Consumer { 33 | 34 | /** 35 | * 设置爬虫组件:scheduler、downloader、parser、saver、 36 | */ 37 | private Scheduler scheduler; 38 | private Downloader downloader; 39 | private Parser parser; 40 | private Saver saver; 41 | 42 | /** 43 | * 线程池参数配置 44 | */ 45 | private ScheduledThreadPoolExecutor pool; 46 | private static final int CORE_POOL_SIZE = Runtime.getRuntime().availableProcessors() * 2; 47 | private static final int INITIAL_DELAY = 5; 48 | private static final int PERIOD = 20; 49 | 50 | private static final String CHARSET = RemotingHelper.DEFAULT_CHARSET; 51 | 52 | private Consumer setScheduler(Scheduler scheduler) { 53 | if (scheduler == null) { 54 | log.error("未设置调度器,启动失败"); 55 | System.exit(-1); 56 | } 57 | this.scheduler = scheduler; 58 | return this; 59 | } 60 | 61 | private Consumer setDownloader(Downloader downloader) { 62 | if (downloader == null) { 63 | log.error("未设置下载器,启动失败"); 64 | System.exit(-1); 65 | } 66 | this.downloader = downloader; 67 | return this; 68 | } 69 | 70 | private Consumer setParser(Parser parser) { 71 | if (parser == null) { 72 | log.error("未设置解析器,启动失败"); 73 | System.exit(-1); 74 | } 75 | this.parser = parser; 76 | return this; 77 | } 78 | 79 | private Consumer setSaver(Saver saver) { 80 | if (saver == null) { 81 | log.error("未设置保存器,启动失败"); 82 | System.exit(-1); 83 | } 84 | this.saver = saver; 85 | return this; 86 | } 87 | 88 | private Consumer setThreadPool() { 89 | pool = new ScheduledThreadPoolExecutor(CORE_POOL_SIZE); 90 | return this; 91 | } 92 | 93 | private void run() { 94 | log.info("消费者启动......"); 95 | startConsumer(); 96 | while (true) 
{ 97 | UrlSeed urlSeed = scheduler.pop(); 98 | try { 99 | if (urlSeed == null) { 100 | // log.info("队列暂无消息,等待中......"); 101 | TimeUnit.SECONDS.sleep(1); 102 | } else { 103 | log.info("准备解析URL:[{}],优先级(默认5):[{}]", urlSeed.getUrl(), urlSeed.getPriority()); 104 | pool.scheduleAtFixedRate(new ConsumerWork(urlSeed),INITIAL_DELAY, PERIOD, TimeUnit.SECONDS); 105 | } 106 | } catch (InterruptedException e) { 107 | log.error("当前线程被中断", e); 108 | } catch (RejectedExecutionException e) { 109 | log.error("拒绝此次提交的任务[{}]", urlSeed, e); 110 | } catch (Exception e) { 111 | log.error("线程池定时任务停止工作,重新启动线程池", e); 112 | pool.scheduleAtFixedRate(new ConsumerWork(urlSeed),INITIAL_DELAY, PERIOD, TimeUnit.SECONDS); 113 | } 114 | } 115 | } 116 | 117 | private class ConsumerWork implements Runnable { 118 | 119 | private UrlSeed urlSeed; 120 | 121 | ConsumerWork(UrlSeed urlSeed) { 122 | this.urlSeed = urlSeed; 123 | } 124 | 125 | public void run() { 126 | try { 127 | log.info("已完成任务数量:[{}],运行中线程数量:[{}],最大线程运行数量: [{}],工作队列任务数量:[{}]", 128 | pool.getCompletedTaskCount(), pool.getActiveCount(), pool.getMaximumPoolSize(), pool.getQueue().size()); 129 | Page page = downloader.download(urlSeed); 130 | parser.parse(page); 131 | // 将新的种子添加到调度器中 132 | page.getUrlSeeds().forEach(seed -> scheduler.push(seed)); 133 | saver.save(page); 134 | } finally { 135 | } 136 | } 137 | } 138 | 139 | private void startConsumer() { 140 | PropertiesUtils properties = PropertiesUtils.getInstance(); 141 | String ip = properties.get("mq-ip"); 142 | String port = properties.get("mq-port"); 143 | DefaultMQPushConsumer consumer = new DefaultMQPushConsumer("Consumer-Group"); 144 | try { 145 | consumer.setNamesrvAddr(ip + ":" + port); 146 | consumer.subscribe("Forbidden-Topic", "*"); 147 | consumer.subscribe("Redirect-Topic", "*"); 148 | consumer.subscribe("ServerWrong-Topic", "*"); 149 | consumer.registerMessageListener((MessageListenerConcurrently) (msgs, context) -> { 150 | for (MessageExt msg : msgs) { 151 | try { 152 | // 
log.info("consume success [{}]", msg.toString()); 153 | // 其它状态码对应的url优先级是0 154 | scheduler.push(new UrlSeed(new String(msg.getBody(), CHARSET), 0)); // TODO 155 | } catch (UnsupportedEncodingException e) { 156 | log.error("unsupported encoding[{}]", CHARSET, e); 157 | } 158 | } 159 | return ConsumeConcurrentlyStatus.CONSUME_SUCCESS; }); 160 | consumer.start(); 161 | } catch (MQClientException e) { 162 | log.error("failed to start consumer", e); 163 | System.exit(-1); 164 | } 165 | log.info("Consumer Started."); 166 | } 167 | 168 | private static Consumer build() { 169 | return new Consumer() 170 | .setDownloader(new HttpClientPoolDownloader()) 171 | .setParser(new NewsParser()) 172 | .setSaver(new TextSaver()) 173 | .setScheduler(new RedisScheduler()) 174 | .setThreadPool(); 175 | } 176 | 177 | /** 178 | * Test 179 | * */ 180 | public static void main(String[] args) { 181 | Consumer.build() 182 | .run(); 183 | } 184 | 185 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/core/downloader/Downloader.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.downloader; 2 | 3 | import com.jinshuai.entity.Page; 4 | import com.jinshuai.entity.UrlSeed; 5 | 6 | /** 7 | * 下载器接口,可以针对此接口构造多种下载器实现 8 | * @see com.jinshuai.core.downloader.impl.HttpClientPoolDownloader 9 | * */ 10 | public interface Downloader { 11 | 12 | /*** 13 | * @param urlSeed 待使用种子 14 | * @return 响应体内容封装成的Page 15 | */ 16 | Page download(UrlSeed urlSeed); 17 | 18 | } 19 | -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/core/downloader/impl/HttpClientPoolDownloader.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.downloader.impl; 2 | 3 | import com.jinshuai.core.downloader.Downloader; 4 | import com.jinshuai.entity.Page; 5 | 
import com.jinshuai.entity.UrlSeed; 6 | import com.jinshuai.util.http.HttpUtils; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.jsoup.Jsoup; 9 | import org.jsoup.nodes.Document; 10 | 11 | /** 12 | * @author: JS 13 | * @date: 2018/3/26 14 | * @description: 15 | * 通过Http连接池下载 16 | */ 17 | @Slf4j 18 | public class HttpClientPoolDownloader implements Downloader { 19 | 20 | public Page download(UrlSeed urlSeed) { 21 | Page page = null; 22 | try { 23 | String html = HttpUtils.getSingleInstance().getContent(urlSeed.getUrl()); 24 | Document document = Jsoup.parse(html, urlSeed.getUrl()); 25 | page = new Page(urlSeed, document); 26 | } catch (Exception e) { 27 | log.error("下载器下载的相应文本获取DOM树失败", e); 28 | } 29 | return page; 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/core/parser/Parser.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.parser; 2 | 3 | import com.jinshuai.entity.Page; 4 | 5 | /** 6 | * @author JS 7 | * @date 2018/03/26 8 | * @description 9 | * 解析Page 10 | * */ 11 | public interface Parser { 12 | 13 | /** 14 | * @param page 要解析的Page 15 | * @return 解析后的Page(Map、Set) 16 | * @description 解析Page中的Document的内容到Map中,URL到Set中 17 | * */ 18 | Page parse(Page page); 19 | 20 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/core/parser/impl/NewsParser.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.parser.impl; 2 | 3 | import com.jinshuai.core.downloader.impl.HttpClientPoolDownloader; 4 | import com.jinshuai.core.parser.Parser; 5 | import com.jinshuai.entity.Page; 6 | import com.jinshuai.entity.UrlSeed; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.joda.time.DateTime; 9 | import org.joda.time.DateTimeUtils; 10 | import org.jsoup.nodes.Document; 11 | 
import org.jsoup.nodes.Element; 12 | 13 | import java.util.*; 14 | 15 | /** 16 | * @author: JS 17 | * @date: 2018/3/26 18 | * @description: 19 | * 针对hebut新闻类的网页,解析相应内容。 20 | */ 21 | @Slf4j 22 | public class NewsParser implements Parser { 23 | 24 | private static volatile int firstTime = 0; 25 | 26 | // TODO: 待优化解析过程 27 | public Page parse(Page page) { 28 | // 获取DOM树 29 | Document document; 30 | try { 31 | document = page.getDocument(); 32 | long priority = timestamp2Priority(document); 33 | // 种子,并进行预处理 34 | Set urlSeeds = new HashSet<>(); 35 | Iterator seedIterator = document.getElementsByTag("a").iterator(); 36 | while (seedIterator.hasNext()) { 37 | Element element3 = (Element) seedIterator.next(); 38 | String href = element3.attr("href"); 39 | if (href.contains("http://www.hebut.edu.cn/")|| href.contains("/") || href.contains("#") || href.contains("index.htm") || href.contains("javascript:void(0);")) continue; 40 | if ("http://xww.hebut.edu.cn/".equals(page.getUrlSeed().getUrl())) continue; 41 | urlSeeds.add(new UrlSeed("http://xww.hebut.edu.cn/gdyw/" + href, priority)); 42 | } 43 | page.setUrlSeeds(urlSeeds); 44 | if ("http://xww.hebut.edu.cn/".equals(page.getUrlSeed().getUrl())) { 45 | return page; 46 | } 47 | Map items = new HashMap(3); 48 | // 标题 49 | Element titleElement = document.selectFirst("div.sub_articleTitle"); 50 | items.put("title", titleElement.getElementsByTag("h2").text()); 51 | // 时间 52 | Element dateElement = document.selectFirst("div.sub_articleAuthor"); 53 | items.put("date", dateElement.getElementsByTag("strong").eachText().get(0)); 54 | // 正文 55 | Element textElement = document.selectFirst("div.sub_articleInfo"); 56 | Iterator textIterator = textElement.getElementsByTag("span").iterator(); 57 | StringBuilder stringBuilder = new StringBuilder(); 58 | while (textIterator.hasNext()) { 59 | Element element3 = (Element) textIterator.next(); 60 | stringBuilder.append(element3.text()); 61 | } 62 | items.put("content", stringBuilder.toString()); 
63 | page.setItems(items); 64 | } catch (Exception e) { 65 | log.error("解析页面[{}]出错",page.getUrlSeed().getUrl(),e); 66 | } finally { 67 | return page; 68 | } 69 | } 70 | 71 | /** 72 | * 该Page中的url时间戳参考该Page的时间戳计算优先级 73 | * */ 74 | private long timestamp2Priority(Document document) { 75 | String date; 76 | try { 77 | date = document.selectFirst("div.sub_articleAuthor").getElementsByTag("strong").eachText().get(0); 78 | } catch (Exception e) { 79 | log.error("解析页面异常",e); 80 | return 5; 81 | } 82 | DateTime dateTime = new DateTime(date); 83 | // 获取时间戳的差值 84 | long v = DateTimeUtils.currentTimeMillis() - dateTime.getMillis(); 85 | // 换算成天数 86 | v /= 86400000; 87 | // 发布时间超过10天设置低的优先级:3,10天:5,小于10天:10 88 | return v > 10 ? 3 : v == 10 ? 5 : 10; 89 | } 90 | 91 | private Page getHyperLinkTag(Page page) { 92 | if (page == null) { 93 | throw new RuntimeException("page 为空"); 94 | } 95 | // 获取DOM树 96 | Document document = page.getDocument(); 97 | // 如果是首页 98 | if ("http://xww.hebut.edu.cn".equals(page.getUrlSeed().getUrl()) && firstTime == 0) { 99 | Set urlSeeds = new HashSet(); 100 | Iterator seedIterator = document.getElementsByTag("a").iterator(); 101 | while (seedIterator.hasNext()) { 102 | Element element3 = (Element) seedIterator.next(); 103 | String href = element3.attr("href"); 104 | if (href.contains("#") || href.contains("index.html") || href.contains("javascript:void(0);")) continue; 105 | if (href.startsWith("gdyw") || href.startsWith("zhyw")) { 106 | urlSeeds.add(new UrlSeed("http://xww.hebut.edu.cn/" + href, 107 | (int) (Math.random() * 10))); 108 | } 109 | } 110 | page.setUrlSeeds(urlSeeds); 111 | // 已经访问过首页 112 | firstTime = 1; 113 | } 114 | return page; 115 | } 116 | /** 117 | * test 118 | * */ 119 | public static void main(String[] args) { 120 | UrlSeed urlSeed = new UrlSeed("http://xww.hebut.edu.cn/gdyw/70772.htm",5); 121 | Page page = new HttpClientPoolDownloader().download(urlSeed); 122 | // Page page = new Page(new 
UrlSeed("http://xww.hebut.edu.cn/gdyw/70772.htm",5), Jsoup.parse("","http://xww.hebut.edu.cn/gdyw/index.htm")); 123 | System.out.println(new NewsParser().parse(page)); 124 | } 125 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/core/saver/Saver.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.saver; 2 | 3 | import com.jinshuai.entity.Page; 4 | 5 | /** 6 | * 数据持久化 7 | * */ 8 | public interface Saver { 9 | 10 | /** 11 | * just do it 12 | * */ 13 | void save(Page page); 14 | 15 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/core/saver/impl/TextSaver.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.saver.impl; 2 | 3 | import com.jinshuai.core.saver.Saver; 4 | import com.jinshuai.entity.Page; 5 | import com.jinshuai.util.PropertiesUtils; 6 | import com.jinshuai.util.hash.PageUtils; 7 | import lombok.extern.slf4j.Slf4j; 8 | 9 | import java.io.File; 10 | import java.io.FileWriter; 11 | import java.io.IOException; 12 | import java.util.Date; 13 | 14 | /** 15 | * @author: JS 16 | * @date: 2018/3/27 17 | * @description: 18 | * 存储到txt 19 | */ 20 | @Slf4j 21 | public class TextSaver implements Saver { 22 | 23 | private String parentDir; 24 | 25 | private PageUtils pageUtil = PageUtils.getInstance(); 26 | 27 | private PropertiesUtils propertiesUtil = PropertiesUtils.getInstance(); 28 | 29 | public TextSaver() { 30 | init(); 31 | } 32 | 33 | /** 34 | * 初始化文件要存的目录 35 | * */ 36 | private void init() { 37 | parentDir = PropertiesUtils.getInstance().get("dir"); 38 | File file = new File(parentDir); 39 | if (!file.exists()) { 40 | file.mkdirs(); 41 | } 42 | log.info("解析后的文件存放位置:[{}]",parentDir); 43 | } 44 | 45 | public void save(Page page) { 46 | if (page == null) { 47 | return; 48 | } 49 | 
// 文本相似度检测 50 | String similarCheck = propertiesUtil.get("similarCheck"); 51 | if (similarCheck != null && !similarCheck.trim().equals("") &&similarCheck.equalsIgnoreCase("true")) { 52 | String title = page.getItems().get("title"); 53 | String content = page.getItems().get("content"); 54 | if(pageUtil.exist(title, content)) { 55 | log.info("标题为 [{}] 的相似文章已经存在", title); 56 | } 57 | } 58 | File file = new File(String.format("%s%s.txt",parentDir,new Date().getTime())); 59 | try (FileWriter fw = new FileWriter(file)) { 60 | if (page.getItems() == null) { 61 | fw.flush(); 62 | return; 63 | } 64 | fw.append(String.format("[标题] %s\n",page.getItems().get("title"))); 65 | fw.append(String.format("[日期] %s\n", page.getItems().get("date"))); 66 | fw.append(String.format("[正文] %s\n",page.getItems().get("content"))); 67 | fw.append(String.format("[链接] %s\n",page.getUrlSeed().getUrl())); 68 | fw.flush(); 69 | } catch (IOException e) { 70 | log.error("存储路径无效",e); 71 | } 72 | } 73 | 74 | public static void main(String[] args) throws IOException { 75 | // String parentDir = "E:/HEBUTNews/"; 76 | // File file = new File(parentDir+ (new Date().getTime()) + ".txt"); 77 | // //file.createNewFile(); 78 | // if (!file.getParentFile().exists()) { 79 | // //file.getParentFile().mkdirs(); 80 | // 81 | // } 82 | // FileWriter fileWriter = new FileWriter(file); 83 | // fileWriter.append("fasdfs"); 84 | // fileWriter.flush(); 85 | Saver saver = new TextSaver(); 86 | 87 | // new TextSaver().save(new Page(new UrlSeed("",5), Jsoup.parse("HTML","")).setItems(null)); 88 | } 89 | 90 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/core/scheduler/Scheduler.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.scheduler; 2 | 3 | import com.jinshuai.entity.UrlSeed; 4 | 5 | /** 6 | * @author JS 7 | * @date 2018/03/26 8 | * @description: 9 | * 种子调度器: 提供种子,存放种子。 10 
| * */ 11 | public interface Scheduler { 12 | 13 | /** 14 | * 存放种子 15 | * */ 16 | void push(UrlSeed urlSeed); 17 | /** 18 | * 提供种子 19 | * */ 20 | UrlSeed pop(); 21 | 22 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/core/scheduler/impl/RedisScheduler.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.scheduler.impl; 2 | 3 | import com.google.gson.Gson; 4 | import com.jinshuai.core.scheduler.Scheduler; 5 | import com.jinshuai.entity.UrlSeed; 6 | import com.jinshuai.util.JedisUtils; 7 | import lombok.extern.slf4j.Slf4j; 8 | import redis.clients.jedis.Jedis; 9 | 10 | /** 11 | * @author: JS 12 | * @date: 2018/3/26 13 | * @description: 将种子存放到Redis 14 | */ 15 | @Slf4j 16 | public class RedisScheduler implements Scheduler { 17 | 18 | /** 19 | * 存放UrlSeed.url hash && 进行种子判重 20 | */ 21 | private final static String PREFIX_SET = "Spider.consumer.set"; 22 | 23 | /** 24 | * 根据种子的优先级先简单创建不同的几个队列 25 | */ 26 | private final static String PREFIX_QUEUE_HIGH = "Spider.queue.consumer.high"; 27 | private final static String PREFIX_QUEUE_LOW = "Spider.queue.consumer.low"; 28 | private final static String PREFIX_QUEUE_DEFAULT = "Spider.queue.consumer.default"; 29 | private final static String PREFIX_QUEUE_CONSUMER = "Spider.queue.consumer.consumer"; 30 | 31 | /** 32 | * @param urlSeed 种子 33 | * @desciption: 配置 jedisPool 34 | * 添加种子的URL到Set,种子序列话后的JSON文本到List 35 | * 添加种子之前需要判断种子是否已经存在。 36 | */ 37 | public void push(UrlSeed urlSeed) { 38 | try (Jedis jedis = JedisUtils.getSingleInstance().getJedis()) { 39 | // 种子不存在 40 | if (!jedis.sismember(PREFIX_SET, urlSeed.getUrlHash())) { 41 | // 添加种子Url对应的hash到判重Set 42 | jedis.sadd(PREFIX_SET, urlSeed.getUrlHash()); 43 | // 添加种子序列化后的JSON文本到List 44 | Gson gson = new Gson(); 45 | String urlSeedToJson = gson.toJson(urlSeed); 46 | long urlSeedPriority = urlSeed.getPriority(); 47 | if (urlSeedPriority > 5) { 
48 | jedis.lpush(PREFIX_QUEUE_HIGH, urlSeedToJson); 49 | } else if (urlSeedPriority == 5) { 50 | jedis.lpush(PREFIX_QUEUE_DEFAULT, urlSeedToJson); 51 | } else if (urlSeedPriority > 0) { 52 | jedis.lpush(PREFIX_QUEUE_LOW, urlSeedToJson); 53 | } else { 54 | jedis.lpush(PREFIX_QUEUE_CONSUMER, urlSeedToJson); 55 | } 56 | } 57 | } catch (Exception e) { 58 | log.error("JedisPushUrl[{}]出错", urlSeed.toString(), e); 59 | } 60 | } 61 | 62 | /** 63 | * @description: 消费者只从对应的消费队列中取种子 64 | */ 65 | public UrlSeed pop() { 66 | Jedis jedis = JedisUtils.getSingleInstance().getJedis(); 67 | Gson gson = new Gson(); 68 | String urlSeedToJson = null; 69 | UrlSeed urlSeed = null; 70 | try { 71 | if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_CONSUMER)) != null) { 72 | urlSeed = gson.fromJson(urlSeedToJson, UrlSeed.class); 73 | } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_HIGH)) != null) { 74 | urlSeed = gson.fromJson(urlSeedToJson, UrlSeed.class); 75 | } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_DEFAULT)) != null) { 76 | urlSeed = gson.fromJson(urlSeedToJson, UrlSeed.class); 77 | } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_LOW)) != null) { 78 | urlSeed = gson.fromJson(urlSeedToJson, UrlSeed.class); 79 | } 80 | return urlSeed; 81 | } catch (Exception e) { 82 | log.error("JedisPopUrl [{}]出错", urlSeedToJson, e); 83 | } finally { 84 | if (jedis != null && jedis.isConnected()) 85 | jedis.disconnect(); 86 | } 87 | return gson.fromJson(urlSeedToJson, UrlSeed.class); 88 | } 89 | 90 | /** 91 | * test connection 92 | */ 93 | public static void main(String[] args) { 94 | Jedis jedis = JedisUtils.getSingleInstance().getJedis(); 95 | System.out.println(jedis.ping()); 96 | UrlSeed urlSeed = new RedisScheduler().pop(); 97 | System.out.println(urlSeed); 98 | jedis.lpush(PREFIX_QUEUE_LOW, "dasdasdasdsa"); 99 | } 100 | 101 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/entity/Page.java: 
-------------------------------------------------------------------------------- 1 | package com.jinshuai.entity; 2 | 3 | import org.jsoup.nodes.Document; 4 | 5 | import java.util.Map; 6 | import java.util.Set; 7 | 8 | /** 9 | * @author: JS 10 | * @date: 2018/3/26 11 | * @description: 12 | * 每一个UrlSeed对应的页面抽象为一个Page 13 | */ 14 | public class Page { 15 | 16 | /** 17 | * Page对应的UrlSeed 18 | * */ 19 | private UrlSeed urlSeed; 20 | 21 | /** 22 | * Page对应的jsoup文档 23 | * */ 24 | private Document document; 25 | 26 | /** 27 | * Page包含的url 28 | * */ 29 | private Set urlSeeds; 30 | 31 | /** 32 | * Page所包含的有用信息 33 | * */ 34 | private Map items; 35 | 36 | public Page(UrlSeed urlSeed, Document document) { 37 | this.urlSeed = urlSeed; 38 | this.document = document; 39 | } 40 | 41 | public UrlSeed getUrlSeed() { 42 | return urlSeed; 43 | } 44 | 45 | public Page setUrlSeed(UrlSeed urlSeed) { 46 | this.urlSeed = urlSeed; 47 | return this; 48 | } 49 | 50 | public Document getDocument() { 51 | return document; 52 | } 53 | 54 | public Page setDocument(Document document) { 55 | this.document = document; 56 | return this; 57 | } 58 | 59 | public Set getUrlSeeds() { 60 | return urlSeeds; 61 | } 62 | 63 | public Page setUrlSeeds(Set urlSeeds) { 64 | this.urlSeeds = urlSeeds; 65 | return this; 66 | } 67 | 68 | public Map getItems() { 69 | return items; 70 | } 71 | 72 | public Page setItems(Map items) { 73 | this.items = items; 74 | return this; 75 | } 76 | 77 | @Override 78 | public String toString() { 79 | return "Page{" + 80 | "urlSeed=" + urlSeed + 81 | ", urlSeeds=" + urlSeeds + 82 | ", items=" + items + 83 | '}'; 84 | } 85 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/entity/UrlSeed.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.entity; 2 | 3 | import com.jinshuai.util.hash.MurmurHash; 4 | import lombok.EqualsAndHashCode; 5 | import 
lombok.ToString; 6 | 7 | /** 8 | * @author: JS 9 | * @date: 2018/3/26 10 | * @description: 11 | * 每个Url需要设置优先级,不需要对低于某个优先级的Url进行解析。 12 | */ 13 | @ToString 14 | @EqualsAndHashCode 15 | public class UrlSeed { 16 | 17 | /** 18 | * 种子对应的Url 19 | * */ 20 | private String url; 21 | 22 | /** 23 | * url hash 24 | * */ 25 | private String urlHash; 26 | 27 | /** 28 | * 种子优先级 29 | * 硬编码为5,通过时间戳设置优先级 30 | * */ 31 | private long priority = 5; 32 | 33 | public UrlSeed(String url, long priority) { 34 | this.url = url; 35 | this.priority = priority; 36 | this.urlHash = String.valueOf(MurmurHash.hash64(url)); 37 | } 38 | 39 | public UrlSeed(String url) { 40 | this.url = url; 41 | this.urlHash = String.valueOf(MurmurHash.hash64(url)); 42 | } 43 | 44 | public String getUrl() { 45 | return url; 46 | } 47 | 48 | public UrlSeed setUrl(String url) { 49 | this.url = url; 50 | return this; 51 | } 52 | 53 | public String getUrlHash() { 54 | return urlHash; 55 | } 56 | 57 | public long getPriority() { 58 | return priority; 59 | } 60 | 61 | public UrlSeed setPriority(long priority) { 62 | this.priority = priority; 63 | return this; 64 | } 65 | 66 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/util/JedisUtils.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import redis.clients.jedis.Jedis; 5 | import redis.clients.jedis.JedisPool; 6 | import redis.clients.jedis.JedisPoolConfig; 7 | 8 | import java.util.Map; 9 | import java.util.concurrent.ConcurrentHashMap; 10 | 11 | /** 12 | * @author: JS 13 | * @date: 2018/3/27 14 | * @description: 15 | * 对Jedis简单的封装 16 | */ 17 | @Slf4j 18 | public class JedisUtils { 19 | 20 | /** 21 | * JedisUtils实例 22 | * */ 23 | private static volatile JedisUtils jedisUtils; 24 | 25 | /** 26 | * 获取JedisUtils单例 27 | * */ 28 | public static JedisUtils getSingleInstance() { 29 | 
if (jedisUtils == null) { 30 | synchronized (JedisUtils.class) { 31 | jedisUtils = new JedisUtils(); 32 | } 33 | } 34 | return jedisUtils; 35 | } 36 | 37 | private JedisPool jedisPool; 38 | 39 | JedisUtils() { 40 | init(); 41 | } 42 | 43 | private void init() { 44 | configJedisPool(); 45 | } 46 | 47 | /** 48 | * 获取套接字、密码 49 | * */ 50 | private static final String IP = PropertiesUtils.getInstance().get("redis-ip"); 51 | private static final int PORT = Integer.valueOf(PropertiesUtils.getInstance().get("redis-port")); 52 | private static final String PASSWORD = PropertiesUtils.getInstance().get("redis-password"); 53 | 54 | /** 55 | * 可用连接实例的最大数目,默认值为8; 56 | * 如果赋值为-1,则表示不限制;如果pool已经分配了maxActive个jedis实例,则此时pool的状态为exhausted(耗尽)。 57 | */ 58 | private static int MAX_ACTIVE = 2048; 59 | 60 | /** 61 | * 控制一个pool最多有多少个状态为idle(空闲的)的jedis实例,默认值也是8。 62 | */ 63 | private static int MAX_IDLE = 200; 64 | 65 | /** 66 | * 等待可用连接的最大时间,单位毫秒,默认值为-1,表示永不超时。如果超过等待时间,则直接抛出JedisConnectionException; 67 | * */ 68 | private static int MAX_WAIT = 10000; 69 | 70 | /** 71 | * 超时时间 72 | * */ 73 | private static int TIMEOUT = 10000; 74 | 75 | /** 76 | * 保存若干个jedisPool 77 | * key 为IP+port 78 | * */ 79 | private static Map maps = new ConcurrentHashMap<>(); 80 | 81 | private void configJedisPool() { 82 | if (maps.get(IP) == null) { 83 | JedisPoolConfig jedisPoolConfig = new JedisPoolConfig(); 84 | jedisPoolConfig.setMaxTotal(MAX_ACTIVE); 85 | jedisPoolConfig.setMaxIdle(MAX_IDLE); 86 | jedisPoolConfig.setMaxWaitMillis(MAX_WAIT); 87 | jedisPoolConfig.setTestOnReturn(true); 88 | // 未设置密码 89 | if (PASSWORD == null || PASSWORD.length() == 0) { 90 | log.info("配置文件中未设置Redis密码,请确保Redis服务器不需要密码验证!!!"); 91 | jedisPool = new JedisPool(jedisPoolConfig, IP, PORT, TIMEOUT); 92 | } else { 93 | jedisPool = new JedisPool(jedisPoolConfig, IP, PORT, TIMEOUT, PASSWORD); 94 | } 95 | maps.put(IP,jedisPool); 96 | } else { 97 | jedisPool = maps.get(IP); 98 | } 99 | } 100 | 101 | /** 102 | * 从jedisPool中获取jedis 103 | * */ 
104 | public Jedis getJedis() { 105 | Jedis jedis = null; 106 | try { 107 | jedis = jedisPool.getResource(); 108 | } catch (Exception e) { 109 | log.error("连接Redis失败,检查IP、端口、密码", e); 110 | } 111 | return jedis; 112 | } 113 | 114 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/util/PropertiesUtils.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util; 2 | 3 | 4 | import lombok.extern.slf4j.Slf4j; 5 | 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.util.Map; 9 | import java.util.Properties; 10 | import java.util.concurrent.ConcurrentHashMap; 11 | 12 | /** 13 | * @author: JS 14 | * @date: 2018/5/4 15 | * @description: 16 | * 读取配置文件工具类 17 | */ 18 | @Slf4j 19 | public class PropertiesUtils { 20 | 21 | private Map cache = new ConcurrentHashMap<>(); 22 | 23 | private static volatile PropertiesUtils propertiesUtils; 24 | 25 | public static PropertiesUtils getInstance() { 26 | if (propertiesUtils == null) { 27 | synchronized (PropertiesUtils.class) { 28 | if (propertiesUtils == null) { 29 | propertiesUtils = new PropertiesUtils(); 30 | } 31 | } 32 | } 33 | return propertiesUtils; 34 | } 35 | 36 | public String get(String key) { 37 | if (key == null) 38 | return null; 39 | if (cache.get(key) != null) { 40 | return cache.get(key); 41 | } 42 | Properties properties = new Properties(); 43 | InputStream inputStream = PropertiesUtils.class.getResourceAsStream("/application.properties"); 44 | try { 45 | properties.load(inputStream); 46 | } catch (IOException e) { 47 | log.error("加载配置文件[application.properties]失败",e); 48 | } 49 | String value = properties.getProperty(key); 50 | cache.put(key,value); 51 | return value; 52 | } 53 | 54 | public static void main(String[] args) throws IOException { 55 | // Properties properties = new Properties(); 56 | // InputStream inputStream = 
PropertiesUtils.class.getResourceAsStream("/application.properties"); 57 | // properties.load(inputStream); 58 | // System.out.println(properties.getProperty("ip")); 59 | // System.out.println(properties.getProperty("ip")); 60 | System.out.println(PropertiesUtils.getInstance().get("ip")); 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/util/hash/MurmurHash.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util.hash; 2 | 3 | import java.math.BigInteger; 4 | import java.nio.ByteBuffer; 5 | import java.nio.ByteOrder; 6 | 7 | /** 8 | * @author: JS 9 | * @date: 2019/5/25 10 | * @description: 生成64位Hash 代码:https://www.cnkirito.moe/consistent-hash-lb/ 11 | */ 12 | public class MurmurHash { 13 | 14 | public static BigInteger hash64(String word) { 15 | ByteBuffer buf = ByteBuffer.wrap(word.getBytes()); 16 | int seed = 0x1234ABCD; 17 | 18 | ByteOrder byteOrder = buf.order(); 19 | buf.order(ByteOrder.LITTLE_ENDIAN); 20 | 21 | long m = 0xc6a4a7935bd1e995L; 22 | int r = 47; 23 | 24 | long h = seed ^ (buf.remaining() * m); 25 | 26 | long k; 27 | while (buf.remaining() >= 8) { 28 | k = buf.getLong(); 29 | 30 | k *= m; 31 | k ^= k >>> r; 32 | k *= m; 33 | 34 | h ^= k; 35 | h *= m; 36 | } 37 | 38 | if (buf.remaining() > 0) { 39 | ByteBuffer finish = ByteBuffer.allocate(8).order( 40 | ByteOrder.LITTLE_ENDIAN); 41 | // for big-endian version, do this first: 42 | // finish.position(8-buf.remaining()); 43 | finish.put(buf).rewind(); 44 | h ^= finish.getLong(); 45 | h *= m; 46 | } 47 | h ^= h >>> r; 48 | h *= m; 49 | h ^= h >>> r; 50 | 51 | buf.order(byteOrder); 52 | return new BigInteger(String.valueOf(h & 0xffffffffL)); 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/util/hash/PageUtils.java: 
-------------------------------------------------------------------------------- 1 | package com.jinshuai.util.hash; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.ansj.app.keyword.KeyWordComputer; 5 | import org.ansj.app.keyword.Keyword; 6 | 7 | import java.io.File; 8 | import java.io.IOException; 9 | import java.math.BigInteger; 10 | import java.nio.file.Files; 11 | import java.util.Collection; 12 | import java.util.HashSet; 13 | import java.util.Map; 14 | import java.util.Set; 15 | import java.util.concurrent.ConcurrentHashMap; 16 | 17 | /** 18 | * @author: JS 19 | * @date: 2019/5/25 20 | * @description: SimHash进行doc相似度检测 21 | */ 22 | @Slf4j 23 | public class PageUtils { 24 | 25 | private volatile static PageUtils instance; 26 | 27 | private Map> invertedIndex = new ConcurrentHashMap<>(); 28 | 29 | /** 30 | * 测试:跟踪文本 31 | * */ 32 | private Map fingerContent = new ConcurrentHashMap<>(); 33 | 34 | public static PageUtils getInstance() { 35 | if (instance == null) { 36 | synchronized (PageUtils.class) { 37 | if (instance == null) { 38 | instance = new PageUtils(); 39 | } 40 | } 41 | } 42 | return instance; 43 | } 44 | 45 | private static final int BIT_SIZE = 64; 46 | 47 | private static final int TABLE_SIZE = 16; 48 | 49 | private static final int HAMMING_DISTANCE = 3; 50 | 51 | private ThreadLocal simhashStrContainer = new ThreadLocal<>(); 52 | 53 | public boolean exist(String title, String content) { 54 | boolean exist = false; 55 | BigInteger fingerprint = getSimHash(title, content); 56 | // fingerContent.put(fingerprint, title + "====" + content); 57 | String hashStr = simhashStrContainer.get(); 58 | // 防止分词错误NPE 59 | if (hashStr.length() == BIT_SIZE) { 60 | // 获取每一个table对应的所有候选结果 61 | for (int start = 0; start < BIT_SIZE; start += TABLE_SIZE) { 62 | String table = hashStr.substring(start, start + TABLE_SIZE); 63 | Set fingerprints = invertedIndex.get(table); 64 | if (fingerprints != null && fingerprints.size() > 0) { 65 | for (BigInteger 
fingerprintRes : fingerprints) { 66 | // 海明距离 67 | int hammingDistance = fingerprintRes.xor(fingerprint).bitCount(); 68 | if (hammingDistance <= HAMMING_DISTANCE) { 69 | // log.error("标题 [{}] \r\n 内容[{}] \r\n 与 标题内容[{}] 相似\r\n 汉明距离:[{}]", title, content, fingerContent.get(fingerprintRes), hammingDistance); 70 | exist = true; 71 | break; 72 | } 73 | } 74 | } 75 | if (exist) 76 | break; 77 | } 78 | // 构建倒排索引(16 * 4) 79 | constructInvertedIndex(fingerprint); 80 | } 81 | return exist; 82 | } 83 | 84 | private BigInteger getSimHash(String title, String content) { 85 | double[] featureVector = new double[BIT_SIZE]; 86 | // 1. 分词,计算权重 87 | Collection result = getParticiple(title, content); 88 | // 2. hash 89 | // 3. 加权 90 | // 4. 合并 91 | featureVector = weightingAndCombine(featureVector, result); 92 | // 5. 降维 93 | // 6. SimHash 指纹 94 | return decreaseDimensionAndGetFingerprint(featureVector); 95 | } 96 | 97 | /** 98 | * 分词 99 | * */ 100 | private Collection getParticiple(String title, String content) { 101 | int keyNumber = content.length() / 2 < 5 ? 5 : content.length(); // TODO 102 | KeyWordComputer kwc = new KeyWordComputer(keyNumber); 103 | return kwc.computeArticleTfidf(title, content); 104 | } 105 | 106 | /** 107 | * 哈希 加权 合并 108 | * */ 109 | private double[] weightingAndCombine(double[] featureVector, Collection result) { 110 | for (Keyword keyword : result) { 111 | String keyStr = keyword.getName(); 112 | BigInteger keyHash = MurmurHash.hash64(keyStr); 113 | for (int i = 0; i < BIT_SIZE; i++) { 114 | final BigInteger bitMask = BigInteger.ONE.shiftLeft(BIT_SIZE - 1 - i); 115 | // 3. 加权 116 | // 4. 
合并 117 | if (keyHash.and(bitMask).signum() != 0) { 118 | featureVector[i] += keyword.getScore(); 119 | } else { 120 | featureVector[i] -= keyword.getScore(); 121 | } 122 | } 123 | } 124 | return featureVector; 125 | } 126 | 127 | /** 128 | * 降维 获取指纹 129 | * */ 130 | private BigInteger decreaseDimensionAndGetFingerprint(double[] featureVector) { 131 | BigInteger fingerprint = BigInteger.ZERO; 132 | StringBuilder simHashBuilder = new StringBuilder(); 133 | for (int i = 0; i < BIT_SIZE; i++) { 134 | BigInteger bitMask = BigInteger.ONE.shiftLeft(BIT_SIZE - 1 - i); 135 | if (featureVector[i] > 0) { 136 | fingerprint = fingerprint.or(fingerprint.xor(bitMask)); 137 | simHashBuilder.append(1); 138 | } else { 139 | simHashBuilder.append(0); 140 | } 141 | } 142 | simhashStrContainer.set(simHashBuilder.toString()); 143 | return fingerprint; 144 | } 145 | 146 | /** 147 | * 构建倒排索引 148 | * 149 | * 150 | * */ 151 | private void constructInvertedIndex(BigInteger fingerprint) { 152 | String hashStr = simhashStrContainer.get(); 153 | for (int start = 0; start < BIT_SIZE; start += TABLE_SIZE) { 154 | String table = hashStr.substring(start, start + TABLE_SIZE); 155 | Set docs = invertedIndex.get(table); 156 | if (docs == null) { 157 | docs = new HashSet<>(); 158 | } 159 | docs.add(fingerprint); 160 | invertedIndex.put(table, docs); 161 | } 162 | } 163 | 164 | public static void main(String[] args) throws IOException { 165 | PageUtils pageUtil = PageUtils.getInstance(); 166 | StringBuilder sb1 = new StringBuilder(); 167 | File file1 = new File("D:/Data/1.txt"); 168 | Files.readAllLines(file1.toPath()).forEach(line ->{ 169 | sb1.append(line); 170 | }); 171 | 172 | StringBuilder sb2 = new StringBuilder(); 173 | File file2 = new File("D:/Data/2.txt"); 174 | Files.readAllLines(file2.toPath()).forEach(line -> { 175 | sb2.append(line); 176 | }); 177 | 178 | pageUtil.exist("学校党委理论学习中心组召开扩大会议",sb1.toString()); 179 | pageUtil.exist("校党委理论学习中心组召开专题会议学习传达全国“两会”精神", sb2.toString()); 180 | 181 | 
182 | // BigInteger simhash1 = pageUtil.getSimHash("","我来自河北省,你们可以叫我金帅"); 183 | // BigInteger simhash2 = pageUtil.getSimHash("","我来自河北省,我是金帅"); 184 | // System.out.println(simhash1.xor(simhash2).bitCount()); 185 | // System.out.println(simhash1.xor(simhash2).toString(2)); 186 | // System.out.println("=========十进制========"); 187 | // System.out.println(simhash1.toString()); 188 | // System.out.println(simhash2.toString()); 189 | // System.out.println("=========二进制========"); 190 | // System.out.println(simhash1.toString(2)); 191 | // System.out.println(simhash2.toString(2)); 192 | 193 | } 194 | 195 | } 196 | -------------------------------------------------------------------------------- /spider-consumer/src/main/java/com/jinshuai/util/http/HttpUtils.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util.http; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.http.Header; 5 | import org.apache.http.HttpEntity; 6 | import org.apache.http.HttpResponse; 7 | import org.apache.http.client.config.RequestConfig; 8 | import org.apache.http.client.methods.HttpGet; 9 | import org.apache.http.config.Registry; 10 | import org.apache.http.config.RegistryBuilder; 11 | import org.apache.http.config.SocketConfig; 12 | import org.apache.http.conn.socket.ConnectionSocketFactory; 13 | import org.apache.http.conn.socket.PlainConnectionSocketFactory; 14 | import org.apache.http.conn.ssl.NoopHostnameVerifier; 15 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory; 16 | import org.apache.http.conn.ssl.TrustSelfSignedStrategy; 17 | import org.apache.http.entity.ContentType; 18 | import org.apache.http.impl.client.CloseableHttpClient; 19 | import org.apache.http.impl.client.HttpClients; 20 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; 21 | import org.apache.http.ssl.SSLContexts; 22 | import org.apache.http.util.ByteArrayBuffer; 23 | 24 | import javax.net.ssl.HostnameVerifier; 25 | 
import javax.net.ssl.SSLContext; 26 | import java.io.IOException; 27 | import java.io.InputStream; 28 | import java.net.MalformedURLException; 29 | import java.net.URI; 30 | import java.net.URISyntaxException; 31 | import java.net.URL; 32 | import java.nio.charset.Charset; 33 | import java.util.Random; 34 | import java.util.concurrent.TimeUnit; 35 | import java.util.regex.Matcher; 36 | import java.util.regex.Pattern; 37 | 38 | /** 39 | * @author: JS 40 | * @date: 2018/3/22 41 | * @description: 42 | * 创建单例HttpUtils,获取HttpClient实例执行HTTP请求根据状态码解析响应体。 43 | */ 44 | @Slf4j 45 | public class HttpUtils { 46 | 47 | private static final ThreadLocal httpGetContainer = new ThreadLocal<>(); 48 | 49 | private static final ThreadLocal httpEntityContainer = new ThreadLocal<>(); 50 | 51 | private static volatile HttpUtils HTTPUTILS; 52 | 53 | private PoolingHttpClientConnectionManager httpClientConnectionManager; 54 | 55 | private CloseableHttpClient httpClient; 56 | 57 | private static final int MAX_TOTAL_CONNECTIONS = 20; 58 | private static final int SOCKET_TIMEOUT = 5000; 59 | private static final int MAX_CONNECTIONS_PER_ROUTE = 200; 60 | private static final int CONNECTION_REQUEST_TIMEOUT = 5000; 61 | private static final int CONNECT_TIMEOUT = 5000; 62 | 63 | /** 64 | * 获取HttpUtils单例 65 | * */ 66 | public static HttpUtils getSingleInstance() { 67 | if (HTTPUTILS == null) { 68 | synchronized (HttpUtils.class) { 69 | if (HTTPUTILS == null) { 70 | HTTPUTILS = new HttpUtils(); 71 | } 72 | } 73 | } 74 | return HTTPUTILS; 75 | } 76 | 77 | HttpUtils() { 78 | init(); 79 | } 80 | 81 | private void init() { 82 | configHttpPool(); 83 | configHttpClient(); 84 | } 85 | 86 | /** 87 | * 配置HTTP连接池 88 | * 89 | * */ 90 | private void configHttpPool() { 91 | try { 92 | // 配置SSL 93 | SSLContext sslcontext = SSLContexts.custom() 94 | .loadTrustMaterial(null, new TrustSelfSignedStrategy()) 95 | .build(); 96 | 97 | // HostnameVerifier hostnameVerifier = 
SSLConnectionSocketFactory.getDefaultHostnameVerifier(); 98 | // 关闭域名证书验证 99 | HostnameVerifier hostnameVerifier = NoopHostnameVerifier.INSTANCE; 100 | 101 | SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory( 102 | sslcontext, hostnameVerifier); 103 | 104 | Registry socketFactoryRegistry = RegistryBuilder.create() 105 | .register("http", PlainConnectionSocketFactory.getSocketFactory()) 106 | .register("https", sslsf) 107 | .build(); 108 | 109 | // 将SSL集成到HttpConnectionManager 110 | httpClientConnectionManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry); 111 | // 设置HTTP连接池最大连接数 112 | httpClientConnectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS); 113 | // 每个路由最大的连接数 114 | httpClientConnectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE); 115 | // 设置socket超时时间 116 | SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(SOCKET_TIMEOUT).build(); 117 | httpClientConnectionManager.setDefaultSocketConfig(socketConfig); 118 | } catch (Exception e) { 119 | log.error("SSL配置出错",e); 120 | } 121 | } 122 | 123 | /** 124 | * 配置HttpClient 125 | * 126 | * */ 127 | private void configHttpClient() { 128 | // 请求配置 129 | RequestConfig requestConfig = RequestConfig.custom() 130 | .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT) 131 | .setConnectTimeout(CONNECT_TIMEOUT) 132 | .build(); 133 | // 将配置信息应用到HttpClient 134 | if (httpClientConnectionManager == null) { 135 | log.error("httpClientConnectionManager未被初始化"); 136 | return; 137 | } 138 | httpClient = HttpClients.custom() 139 | .setDefaultRequestConfig(requestConfig) 140 | .setConnectionManager(httpClientConnectionManager) 141 | .build(); 142 | } 143 | 144 | /** 145 | * 配置HttpGet 146 | * 147 | * */ 148 | private HttpGet getHttpGet(String urlStr) { 149 | URL url; 150 | URI uri = null; 151 | try { 152 | url = new URL(urlStr); 153 | uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null); 154 | } catch (MalformedURLException | URISyntaxException 
e) { 155 | log.error("字符串格式不正确[{}]",urlStr,e); 156 | } 157 | HttpGet httpGet = new HttpGet(uri); 158 | // 添加请求头header 159 | httpGet.addHeader("Accept", "*/*"); 160 | httpGet.addHeader("Accept-Encoding", "gzip, deflate"); 161 | httpGet.addHeader("Connection", "keep-alive"); 162 | int randomUserAgent = new Random().nextInt(UserAgentArray.USER_AGENT.length); 163 | httpGet.addHeader("User-Agent",UserAgentArray.USER_AGENT[randomUserAgent]); 164 | 165 | return httpGet; 166 | } 167 | 168 | /** 169 | * 发Get请求 170 | * 171 | * */ 172 | private void sendRequest(String urlStr) { 173 | HttpGet httpGet = httpGetContainer.get(); 174 | try { 175 | HttpResponse response = httpClient.execute(httpGet); 176 | // 根据状态码执行不同的操作 177 | int statusCode = response.getStatusLine().getStatusCode(); 178 | switch (statusCode / 100) { 179 | case 2: 180 | executeStrategy(SuccessStrategy.getInstance(), urlStr, response); 181 | break; 182 | case 3: 183 | executeStrategy(RedirectStrategy.getInstance(), urlStr, response); 184 | break; 185 | case 4: 186 | executeStrategy(ClientErrorStrategy.getInstance(), urlStr, response); 187 | break; 188 | case 5: 189 | executeStrategy(ServerErrorStrategy.getInstance(), urlStr, response); 190 | break; 191 | } 192 | } catch (IOException e) { 193 | log.error("IO出错[{}]", urlStr, e); 194 | } 195 | } 196 | 197 | /** 198 | * 获取 HttpEntity 199 | * 200 | * */ 201 | public String getContent(String urlStr) { 202 | // url为空或者不是http协议 203 | if (urlStr == null || !urlStr.startsWith("http")) { 204 | return null; 205 | } 206 | // 防止SSL过程中的握手警报 http://dovov.com/ssljava-1-7-0unrecognized_name.html 207 | if (urlStr.startsWith("https")) { 208 | System.setProperty("jsse.enableSNIExtension", "false"); 209 | } 210 | String content = null; 211 | try { 212 | httpGetContainer.set(getHttpGet(urlStr)); 213 | sendRequest(urlStr); 214 | HttpEntity httpEntity = httpEntityContainer.get(); 215 | if (httpEntity == null) { 216 | log.error("HttpEntity为空"); 217 | return null; 218 | } 219 | InputStream 
inputStream = httpEntity.getContent(); 220 | content = parseStream(inputStream, httpEntity); 221 | } catch (IOException e) { 222 | log.error("获取响应流失败", e); 223 | } catch (Exception e) { 224 | log.error("获取内容异常", e); 225 | } finally { 226 | httpGetContainer.get().releaseConnection(); 227 | httpGetContainer.remove(); 228 | } 229 | return content; 230 | } 231 | 232 | /** 233 | * 解析响应流 234 | * 235 | * */ 236 | private String parseStream(InputStream inputStream, HttpEntity httpEntity) { 237 | String pageContent = null; 238 | // 获取页面编码:1. 从响应头content-type 2. 如果没有则从返回的HTML中获取Meta标签里的编码 239 | ByteArrayBuffer byteArrayBuffer = new ByteArrayBuffer(4096); 240 | byte[] tempStore = new byte[4096]; 241 | int count; 242 | try { 243 | // read(tempStore) 会重新从零开始存->刷新字节数组 ,并返回读到的字节数量 244 | while ((count = inputStream.read(tempStore)) != -1) { 245 | byteArrayBuffer.append(tempStore, 0, count); 246 | } 247 | // TODO:下面复制粘贴的:https://github.com/xjtushilei/ScriptSpider 248 | // 根据获取的字节编码转为String类型 249 | String charset = "UTF-8"; 250 | ContentType contentType = ContentType.getOrDefault(httpEntity); 251 | Charset charsets = contentType.getCharset(); 252 | pageContent = new String(byteArrayBuffer.toByteArray()); 253 | // 如果响应头中含有content-type字段,直接读取然后设置编码即可。 254 | if (null != charsets) { 255 | charset = charsets.toString(); 256 | } else { 257 | // 发现HttpClient带的功能有问题,这里自己又写了一下。 258 | Pattern pattern = Pattern.compile("([\\s\\S]*?) 
2 | 12 | 13 | 14 | 15 | spider-consumer 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | [%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n 25 | 26 | UTF-8 27 | 28 | 29 | INFO 30 | ACCEPT 31 | DENY 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | ERROR 58 | 59 | 60 | ${LOG_HOME}/error.%d{yyyy-MM-dd}.log 61 | 30 62 | 63 | 64 | UTF-8 65 | [%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n 66 | 67 | 68 | 69 | 70 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 96 | -------------------------------------------------------------------------------- /spider-core/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | 6 | spider 7 | com.jinshuai 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 12 | com.jinshuai.core 13 | spider-core 14 | 1.0 15 | jar 16 | 17 | spider-core 18 | 19 | https://github.com/jinshuai86/Spider 20 | 21 | 22 | UTF-8 23 | 24 | 25 | 26 | 27 | 28 | 29 | org.apache.maven.plugins 30 | maven-compiler-plugin 31 | 32 | 1.8 33 | 1.8 34 | UTF-8 35 | 36 | 37 | 38 | 39 | org.apache.maven.plugins 40 | maven-shade-plugin 41 | 1.2.1 42 | 43 | 44 | package 45 | 46 | shade 47 | 48 | 49 | 50 | 51 | com.jinshuai.Spider 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | junit 65 | junit 66 | 4.12 67 | test 68 | 69 | 70 | 71 | ch.qos.logback 72 | logback-classic 73 | 1.2.3 74 | 75 | 76 | 77 | org.apache.httpcomponents 78 | httpclient 79 | 4.5.5 80 | 81 | 82 | 83 | org.jsoup 84 | jsoup 85 | 1.11.2 86 | 87 | 88 | 89 | redis.clients 90 | jedis 91 | 2.9.0 92 | 93 | 94 | 95 | com.google.code.gson 96 | gson 97 | 2.8.0 98 | 99 | 100 | 101 | mysql 102 | mysql-connector-java 103 | 8.0.11 104 | 105 | 106 | 107 | com.google.guava 108 | guava 109 | 26.0-jre 110 | 111 | 112 | 113 | org.projectlombok 114 | lombok 115 | 1.18.2 116 | provided 117 | 118 | 119 | 120 | org.apache.poi 121 | poi-ooxml 122 | 3.17 123 | 124 | 125 | 126 | 
joda-time 127 | joda-time 128 | 2.9.9 129 | 130 | 131 | 132 | org.ansj 133 | ansj_seg 134 | 5.1.1 135 | 136 | 137 | 138 | org.apache.rocketmq 139 | rocketmq-client 140 | 4.4.0 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/Spider.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai; 2 | 3 | import com.jinshuai.core.downloader.Downloader; 4 | import com.jinshuai.core.downloader.impl.HttpClientPoolDownloader; 5 | import com.jinshuai.core.parser.Parser; 6 | import com.jinshuai.core.parser.impl.NewsParser; 7 | import com.jinshuai.core.saver.Saver; 8 | import com.jinshuai.core.saver.impl.TextSaver; 9 | import com.jinshuai.core.scheduler.Scheduler; 10 | import com.jinshuai.core.scheduler.impl.PriorityQueueScheduler; 11 | import com.jinshuai.core.scheduler.impl.RedisScheduler; 12 | import com.jinshuai.entity.Page; 13 | import com.jinshuai.entity.UrlSeed; 14 | import com.jinshuai.util.PropertiesUtils; 15 | import lombok.extern.slf4j.Slf4j; 16 | 17 | import java.util.concurrent.*; 18 | 19 | 20 | /** 21 | * @author: JS 22 | * @date: 2018/3/27 23 | * @description: 程序启动入口 24 | */ 25 | @Slf4j 26 | public class Spider { 27 | 28 | /** 29 | * 设置爬虫组件:scheduler、downloader、parser、saver、 30 | */ 31 | private Scheduler scheduler; 32 | private Downloader downloader; 33 | private Parser parser; 34 | private Saver saver; 35 | 36 | /** 37 | * 初始目标任务量 38 | * */ 39 | private static long targetTaskNumbers = 800; 40 | 41 | /** 42 | * 线程池参数配置 43 | */ 44 | private ThreadPoolExecutor pool; 45 | private static final int CORE_POOL_SIZE = Runtime.getRuntime().availableProcessors() * 2; 46 | private static final int MAX_POOL_SIZE = Runtime.getRuntime().availableProcessors() * 4; 47 | private static final long KEEP_ALIVE_TIME = 1500L; 48 | private static final int MAX_QUEUE_SIZE = 100; 49 | 50 | /** 51 | * 最多只有MAX_QUEUE_SIZE + MAX_POOL_SIZE个任务并发执行 
-> 控制任务的提交速率 52 | */ 53 | private Semaphore semaphore = new Semaphore(MAX_QUEUE_SIZE + MAX_POOL_SIZE); 54 | 55 | private Spider setScheduler(Scheduler scheduler) { 56 | if (scheduler == null) { 57 | log.error("未设置调度器,启动失败"); 58 | System.exit(-1); 59 | } 60 | this.scheduler = scheduler; 61 | return this; 62 | } 63 | 64 | private Spider setDownloader(Downloader downloader) { 65 | if (downloader == null) { 66 | log.error("未设置下载器,启动失败"); 67 | System.exit(-1); 68 | } 69 | this.downloader = downloader; 70 | return this; 71 | } 72 | 73 | private Spider setParser(Parser parser) { 74 | if (parser == null) { 75 | log.error("未设置解析器,启动失败"); 76 | System.exit(-1); 77 | } 78 | this.parser = parser; 79 | return this; 80 | } 81 | 82 | private Spider setSaver(Saver saver) { 83 | if (saver == null) { 84 | log.error("未设置保存器,启动失败"); 85 | System.exit(-1); 86 | } 87 | this.saver = saver; 88 | return this; 89 | } 90 | 91 | private Spider setThreadPool() { 92 | pool = new ThreadPoolExecutor(CORE_POOL_SIZE, MAX_POOL_SIZE, KEEP_ALIVE_TIME, TimeUnit.MILLISECONDS, 93 | new LinkedBlockingQueue<>(MAX_QUEUE_SIZE)); 94 | 95 | return this; 96 | } 97 | 98 | private Spider addUrlSeed(UrlSeed urlSeed) { 99 | if (urlSeed == null) { 100 | log.error("未添加初始种子,启动失败"); 101 | System.exit(-1); 102 | } 103 | scheduler.push(urlSeed); 104 | return this; 105 | } 106 | 107 | private Spider setTargetTaskNumbers() { 108 | String configTargetNum = PropertiesUtils.getInstance().get("targetNum"); 109 | if (configTargetNum != null && !configTargetNum.trim().equals("")) { 110 | try { 111 | targetTaskNumbers = Long.valueOf(configTargetNum); 112 | if (targetTaskNumbers <= 0) { 113 | log.error("无效的目标任务数量:[{}]", targetTaskNumbers); 114 | } 115 | } catch (Exception e) { 116 | log.error("无效的目标任务数量:[{}],使用默认值", configTargetNum, e); 117 | } 118 | } 119 | return this; 120 | } 121 | 122 | private void run() { 123 | log.info("爬虫启动......"); 124 | Runtime.getRuntime().addShutdownHook(new Thread(()->{ 125 | pool.shutdown(); // clean 
resource 126 | })); 127 | UrlSeed urlSeed = null; 128 | while (true) { 129 | try { 130 | // the url_store has no url and there is no active thread 131 | if ((urlSeed = scheduler.pop()) == null && pool.getActiveCount() == 0 && pool.getQueue().size() == 0) { 132 | pool.shutdown(); 133 | log.info("解析完毕,正在停止......"); 134 | System.exit(-1); //TODO 为了停止生产者,可以改为轮询标志位 135 | break; 136 | } else if (urlSeed == null) { 137 | log.info("种子仓库已无种子,等待中......"); 138 | TimeUnit.SECONDS.sleep(1); 139 | } else { 140 | log.info("准备解析URL:[{}],优先级(默认5):[{}]", urlSeed.getUrl(), urlSeed.getPriority()); 141 | semaphore.acquire(); 142 | pool.execute(new SpiderWork(urlSeed)); 143 | } 144 | if (pool.getCompletedTaskCount() >= targetTaskNumbers && urlSeed == null && pool.getQueue().size() == 0) { 145 | pool.shutdown(); 146 | log.info("达到目标,正在停止......"); 147 | System.exit(-1); //TODO 为了停止生产者,可以改为轮询标志位 148 | } 149 | } catch (InterruptedException e) { 150 | log.error("当前线程被中断", e); //TODO 151 | } catch (RejectedExecutionException e) { 152 | log.error("拒绝此次提交的任务[{}]", urlSeed, e); 153 | semaphore.release(); 154 | } 155 | } 156 | } 157 | 158 | private class SpiderWork implements Runnable { 159 | 160 | private UrlSeed urlSeed; 161 | 162 | SpiderWork(UrlSeed urlSeed) { 163 | this.urlSeed = urlSeed; 164 | } 165 | 166 | public void run() { 167 | try { 168 | log.info("已完成任务数量:[{}],运行中线程数量:[{}],最大线程运行数量: [{}],工作队列任务数量:[{}]", 169 | pool.getCompletedTaskCount(), pool.getActiveCount(), pool.getMaximumPoolSize(), pool.getQueue().size()); 170 | Page page = downloader.download(urlSeed); 171 | parser.parse(page); 172 | // add new url to scheduler 173 | page.getUrlSeeds().forEach(seed -> scheduler.push(seed)); 174 | saver.save(page); 175 | } finally { 176 | semaphore.release(); 177 | } 178 | } 179 | } 180 | 181 | private static Spider build() { 182 | 183 | return new Spider() 184 | .setTargetTaskNumbers() 185 | .setDownloader(new HttpClientPoolDownloader()) 186 | .setParser(new NewsParser()) 187 | .setSaver(new 
TextSaver()) 188 | // .setScheduler(new RedisScheduler()) 189 | .setScheduler(new PriorityQueueScheduler(targetTaskNumbers)) 190 | .setThreadPool(); 191 | 192 | 193 | } 194 | 195 | /** 196 | * Test 197 | * 198 | * 线程池提交任务流程: 199 | * 判断当前活跃的线程数量和corePoolSize的大小关系,如果没达到corePoolSize就会开新的线程执行任务,如果达到了 200 | * 判断和工作队列的大小关系,如果工作队列还没有满,将任务放到工作队列中,如果满了 201 | * 判断和maximumPoolSize的大小关系,如果没达到maximumPoolSize,就会新开线程执行任务,如果达到了 202 | * 回调注册的拒绝策略 203 | * 204 | */ 205 | public static void main(String[] args) { 206 | Spider.build() 207 | .addUrlSeed(new UrlSeed("http://xww.hebut.edu.cn/gdyw/index.htm")) 208 | .run(); 209 | } 210 | 211 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/README.md: -------------------------------------------------------------------------------- 1 | - 下载器Downloader根据种子调度器scheduler提供的种子UrlSeed进行下载 2 | - 解析器Parser解析下载器的响应,将响应内容封装成一个Page 3 | - 持久器Saver将Page持久化 -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/downloader/Downloader.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.downloader; 2 | 3 | import com.jinshuai.entity.Page; 4 | import com.jinshuai.entity.UrlSeed; 5 | 6 | /** 7 | * 下载器接口,可以针对此接口构造多种下载器实现 8 | * @see com.jinshuai.core.downloader.impl.HttpClientPoolDownloader 9 | * */ 10 | public interface Downloader { 11 | 12 | /*** 13 | * @param urlSeed 待使用种子 14 | * @return 响应体内容封装成的Page 15 | */ 16 | Page download(UrlSeed urlSeed); 17 | 18 | } 19 | -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/downloader/impl/HttpClientPoolDownloader.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.downloader.impl; 2 | 3 | import com.jinshuai.core.downloader.Downloader; 4 | import 
com.jinshuai.entity.Page; 5 | import com.jinshuai.entity.UrlSeed; 6 | import com.jinshuai.util.http.HttpUtils; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.jsoup.Jsoup; 9 | import org.jsoup.nodes.Document; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | /** 14 | * @author: JS 15 | * @date: 2018/3/26 16 | * @description: 17 | * 通过Http连接池下载 18 | */ 19 | @Slf4j 20 | public class HttpClientPoolDownloader implements Downloader { 21 | 22 | public Page download(UrlSeed urlSeed) { 23 | Page page = null; 24 | try { 25 | String html = HttpUtils.getSingleInstance().getContent(urlSeed.getUrl()); 26 | Document document = Jsoup.parse(html, urlSeed.getUrl()); 27 | page = new Page(urlSeed, document); 28 | } catch (Exception e) { 29 | log.error("下载器下载的相应文本获取DOM树失败", e); 30 | } 31 | return page; 32 | } 33 | 34 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/parser/Parser.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.parser; 2 | 3 | import com.jinshuai.entity.Page; 4 | 5 | /** 6 | * @author JS 7 | * @date 2018/03/26 8 | * @description 9 | * 解析Page 10 | * */ 11 | public interface Parser { 12 | 13 | /** 14 | * @param page 要解析的Page 15 | * @return 解析后的Page(Map、Set) 16 | * @description 解析Page中的Document的内容到Map中,URL到Set中 17 | * */ 18 | Page parse(Page page); 19 | 20 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/parser/impl/BaiKeParser.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.parser.impl; 2 | 3 | import com.jinshuai.core.parser.Parser; 4 | import com.jinshuai.entity.Page; 5 | 6 | /** 7 | * @author: JS 8 | * @date: 2018/3/29 9 | * @description: 10 | * 百度百科解析器 //TODO:to do 11 | */ 12 | public class BaiKeParser implements Parser{ 13 | 14 | @Override 
15 | public Page parse(Page page) { 16 | return null; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/parser/impl/NewsParser.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.parser.impl; 2 | 3 | import com.jinshuai.core.downloader.impl.HttpClientPoolDownloader; 4 | import com.jinshuai.core.parser.Parser; 5 | import com.jinshuai.entity.Page; 6 | import com.jinshuai.entity.UrlSeed; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.joda.time.DateTime; 9 | import org.joda.time.DateTimeUtils; 10 | import org.jsoup.nodes.Document; 11 | import org.jsoup.nodes.Element; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import java.util.*; 16 | 17 | /** 18 | * @author: JS 19 | * @date: 2018/3/26 20 | * @description: 21 | * 针对hebut新闻类的网页,解析相应内容。 22 | */ 23 | @Slf4j 24 | public class NewsParser implements Parser { 25 | 26 | private static volatile int firstTime = 0; 27 | 28 | // TODO: 待优化解析过程 29 | public Page parse(Page page) { 30 | // 获取DOM树 31 | Document document; 32 | try { 33 | document = page.getDocument(); 34 | long priority = timestamp2Priority(document); 35 | // 种子,并进行预处理 36 | Set urlSeeds = new HashSet<>(); 37 | Iterator seedIterator = document.getElementsByTag("a").iterator(); 38 | while (seedIterator.hasNext()) { 39 | Element element3 = (Element) seedIterator.next(); 40 | String href = element3.attr("href"); 41 | if (href.contains("http://www.hebut.edu.cn/")|| href.contains("/") || href.contains("#") || href.contains("index.htm") || href.contains("javascript:void(0);")) continue; 42 | if ("http://xww.hebut.edu.cn/".equals(page.getUrlSeed().getUrl())) continue; 43 | urlSeeds.add(new UrlSeed("http://xww.hebut.edu.cn/gdyw/" + href, priority)); 44 | } 45 | page.setUrlSeeds(urlSeeds); 46 | if ("http://xww.hebut.edu.cn/".equals(page.getUrlSeed().getUrl())) { 47 | return page; 
48 | } 49 | Map items = new HashMap(3); 50 | // 标题 51 | Element titleElement = document.selectFirst("div.sub_articleTitle"); 52 | items.put("title", titleElement.getElementsByTag("h2").text()); 53 | // 时间 54 | Element dateElement = document.selectFirst("div.sub_articleAuthor"); 55 | items.put("date", dateElement.getElementsByTag("strong").eachText().get(0)); 56 | // 正文 57 | Element textElement = document.selectFirst("div.sub_articleInfo"); 58 | Iterator textIterator = textElement.getElementsByTag("span").iterator(); 59 | StringBuilder stringBuilder = new StringBuilder(); 60 | while (textIterator.hasNext()) { 61 | Element element3 = (Element) textIterator.next(); 62 | stringBuilder.append(element3.text()); 63 | } 64 | items.put("content", stringBuilder.toString()); 65 | page.setItems(items); 66 | } catch (Exception e) { 67 | log.error("解析页面[{}]出错",page.getUrlSeed().getUrl(),e); 68 | } finally { 69 | return page; 70 | } 71 | } 72 | 73 | /** 74 | * 该Page中的url时间戳参考该Page的时间戳计算优先级 75 | * */ 76 | private long timestamp2Priority(Document document) { 77 | String date; 78 | try { 79 | date = document.selectFirst("div.sub_articleAuthor").getElementsByTag("strong").eachText().get(0); 80 | } catch (Exception e) { 81 | log.error("解析页面异常",e); 82 | return 5; 83 | } 84 | DateTime dateTime = new DateTime(date); 85 | // 获取时间戳的差值 86 | long v = DateTimeUtils.currentTimeMillis() - dateTime.getMillis(); 87 | // 换算成天数 88 | v /= 86400000; 89 | // 发布时间超过10天设置低的优先级:3,10天:5,小于10天:3 90 | return v > 10 ? 3 : v == 10 ? 
5 : 10; 91 | } 92 | 93 | private Page getHyperLinkTag(Page page) { 94 | if (page == null) { 95 | throw new RuntimeException("page 为空"); 96 | } 97 | // 获取DOM树 98 | Document document = page.getDocument(); 99 | // 如果是首页 100 | if ("http://xww.hebut.edu.cn".equals(page.getUrlSeed().getUrl()) && firstTime == 0) { 101 | Set urlSeeds = new HashSet(); 102 | Iterator seedIterator = document.getElementsByTag("a").iterator(); 103 | while (seedIterator.hasNext()) { 104 | Element element3 = (Element) seedIterator.next(); 105 | String href = element3.attr("href"); 106 | if (href.contains("#") || href.contains("index.html") || href.contains("javascript:void(0);")) continue; 107 | if (href.startsWith("gdyw") || href.startsWith("zhyw")) { 108 | urlSeeds.add(new UrlSeed("http://xww.hebut.edu.cn/" + href, 109 | (int) (Math.random() * 10))); 110 | } 111 | } 112 | page.setUrlSeeds(urlSeeds); 113 | // 已经访问过首页 114 | firstTime = 1; 115 | } 116 | return page; 117 | } 118 | /** 119 | * test 120 | * */ 121 | public static void main(String[] args) { 122 | UrlSeed urlSeed = new UrlSeed("http://xww.hebut.edu.cn/gdyw/70772.htm",5); 123 | Page page = new HttpClientPoolDownloader().download(urlSeed); 124 | // Page page = new Page(new UrlSeed("http://xww.hebut.edu.cn/gdyw/70772.htm",5), Jsoup.parse("","http://xww.hebut.edu.cn/gdyw/index.htm")); 125 | System.out.println(new NewsParser().parse(page)); 126 | } 127 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/saver/Saver.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.saver; 2 | 3 | import com.jinshuai.entity.Page; 4 | 5 | /** 6 | * 数据持久化 7 | * */ 8 | public interface Saver { 9 | 10 | /** 11 | * just do it 12 | * */ 13 | void save(Page page); 14 | 15 | } -------------------------------------------------------------------------------- 
/spider-core/src/main/java/com/jinshuai/core/saver/impl/DataBaseSaver.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.saver.impl; 2 | 3 | import com.jinshuai.core.saver.Saver; 4 | import com.jinshuai.entity.Page; 5 | 6 | /** 7 | * @author: JS 8 | * @date: 2018/3/27 9 | * @description: 10 | * 存放到数据库中 11 | */ 12 | public class DataBaseSaver implements Saver { 13 | 14 | public void save(Page page) { 15 | //TODO 16 | } 17 | 18 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/saver/impl/TextSaver.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.saver.impl; 2 | 3 | import com.jinshuai.core.saver.Saver; 4 | import com.jinshuai.entity.Page; 5 | import com.jinshuai.util.PropertiesUtils; 6 | import com.jinshuai.util.hash.PageUtils; 7 | import lombok.extern.slf4j.Slf4j; 8 | 9 | import java.io.File; 10 | import java.io.FileWriter; 11 | import java.io.IOException; 12 | import java.util.Date; 13 | 14 | /** 15 | * @author: JS 16 | * @date: 2018/3/27 17 | * @description: 18 | * 存储到txt 19 | */ 20 | @Slf4j 21 | public class TextSaver implements Saver { 22 | 23 | private String parentDir; 24 | 25 | private PageUtils pageUtil = PageUtils.getInstance(); 26 | 27 | private PropertiesUtils propertiesUtil = PropertiesUtils.getInstance(); 28 | 29 | public TextSaver() { 30 | init(); 31 | } 32 | 33 | /** 34 | * 初始化文件要存的目录 35 | * */ 36 | private void init() { 37 | parentDir = PropertiesUtils.getInstance().get("dir"); 38 | File file = new File(parentDir); 39 | if (!file.exists()) { 40 | file.mkdirs(); 41 | } 42 | log.info("解析后的文件存放位置:[{}]",parentDir); 43 | } 44 | 45 | public void save(Page page) { 46 | if (page == null) { 47 | return; 48 | } 49 | // 文本相似度检测 50 | String similarCheck = propertiesUtil.get("similarCheck"); 51 | if (similarCheck != null && 
!similarCheck.trim().equals("") &&similarCheck.equalsIgnoreCase("true")) { 52 | String title = page.getItems().get("title"); 53 | String content = page.getItems().get("content"); 54 | if(pageUtil.exist(title, content)) { 55 | log.info("标题为 [{}] 的相似文章已经存在", title); 56 | } 57 | } 58 | File file = new File(String.format("%s%s.txt",parentDir,new Date().getTime())); 59 | try (FileWriter fw = new FileWriter(file)) { 60 | if (page.getItems() == null) { 61 | fw.flush(); 62 | return; 63 | } 64 | fw.append(String.format("[标题] %s\n",page.getItems().get("title"))); 65 | fw.append(String.format("[日期] %s\n", page.getItems().get("date"))); 66 | fw.append(String.format("[正文] %s\n",page.getItems().get("content"))); 67 | fw.append(String.format("[链接] %s\n",page.getUrlSeed().getUrl())); 68 | fw.flush(); 69 | } catch (IOException e) { 70 | log.error("存储路径无效",e); 71 | } 72 | } 73 | 74 | public static void main(String[] args) throws IOException { 75 | // String parentDir = "E:/HEBUTNews/"; 76 | // File file = new File(parentDir+ (new Date().getTime()) + ".txt"); 77 | // //file.createNewFile(); 78 | // if (!file.getParentFile().exists()) { 79 | // //file.getParentFile().mkdirs(); 80 | // 81 | // } 82 | // FileWriter fileWriter = new FileWriter(file); 83 | // fileWriter.append("fasdfs"); 84 | // fileWriter.flush(); 85 | Saver saver = new TextSaver(); 86 | 87 | // new TextSaver().save(new Page(new UrlSeed("",5), Jsoup.parse("HTML","")).setItems(null)); 88 | } 89 | 90 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/scheduler/Scheduler.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.scheduler; 2 | 3 | import com.jinshuai.entity.UrlSeed; 4 | 5 | /** 6 | * @author JS 7 | * @date 2018/03/26 8 | * @description: 9 | * 种子调度器: 提供种子,存放种子。 10 | * */ 11 | public interface Scheduler { 12 | 13 | /** 14 | * 存放种子 15 | * */ 16 | void push(UrlSeed urlSeed); 
17 | /** 18 | * 提供种子 19 | * */ 20 | UrlSeed pop(); 21 | 22 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/scheduler/impl/PriorityQueueScheduler.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.scheduler.impl; 2 | 3 | import com.google.common.hash.BloomFilter; 4 | import com.google.common.hash.Funnels; 5 | import com.jinshuai.core.scheduler.Scheduler; 6 | import com.jinshuai.entity.UrlSeed; 7 | import lombok.extern.slf4j.Slf4j; 8 | 9 | import java.nio.charset.Charset; 10 | import java.util.PriorityQueue; 11 | 12 | /** 13 | * @author: JS 14 | * @date: 2018/10/19 15 | * @description: Seed scheduling with a priority queue plus a Bloom filter; push/pop are synchronized because java.util.PriorityQueue is not thread-safe 16 | */ 17 | @Slf4j 18 | public class PriorityQueueScheduler implements Scheduler { 19 | 20 | /** 21 | * Priority queue holding the seeds; the comparator is reversed so the highest priority is polled first 22 | */ 23 | private final PriorityQueue urlQueue; 24 | 25 | /** 26 | * Bloom filter used to detect duplicate seeds 27 | * sized for the expected number of insertions passed to the constructor 28 | * with a false-positive probability of 0.01 29 | */ 30 | private final BloomFilter bloomFilter; 31 | 32 | public PriorityQueueScheduler(long targetNum) { 33 | bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charset.forName("UTF-8")), targetNum, 0.01); 34 | urlQueue = new PriorityQueue<>( 35 | (o1, o2) -> -Long.compare(o1.getPriority(), o2.getPriority()) 36 | ); 37 | } 38 | 39 | @Override 40 | public synchronized void push(UrlSeed urlSeed) { 41 | String url = urlSeed.getUrl(); 42 | // skip urls the filter has (probably) seen; the check and the put below must not interleave with other pushes 43 | if (bloomFilter.mightContain(url)) { 44 | // log.warn("url:[{}]已存在", urlSeed.getUrl()); 45 | return; 46 | } 47 | urlQueue.add(urlSeed); 48 | bloomFilter.put(url); 49 | } 50 | 51 | @Override 52 | public synchronized UrlSeed pop() { 53 | if (urlQueue.size() == 0) { 54 | return null; 55 | } 56 | return urlQueue.poll(); 57 | } 58 | 59 | /** 60 | * test 61 | */ 62 | public static void main(String[] args) { 63 | UrlSeed urlSeed1 = new UrlSeed("123",5); 64 | UrlSeed urlSeed2 = new UrlSeed("1234",6); 65 | UrlSeed urlSeed3 = new UrlSeed("1234",4); 
66 | PriorityQueueScheduler priorityQueueScheduler = new PriorityQueueScheduler(800); 67 | priorityQueueScheduler.push(urlSeed1); 68 | priorityQueueScheduler.push(urlSeed2); 69 | priorityQueueScheduler.push(urlSeed3); 70 | } 71 | 72 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/core/scheduler/impl/RedisScheduler.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.core.scheduler.impl; 2 | 3 | import com.google.gson.Gson; 4 | import com.jinshuai.core.scheduler.Scheduler; 5 | import com.jinshuai.entity.UrlSeed; 6 | import com.jinshuai.util.JedisUtils; 7 | import lombok.extern.slf4j.Slf4j; 8 | import redis.clients.jedis.Jedis; 9 | 10 | /** 11 | * @author: JS 12 | * @date: 2018/3/26 13 | * @description: 14 | * 将种子存放到Redis 15 | */ 16 | @Slf4j 17 | public class RedisScheduler implements Scheduler { 18 | 19 | /** 20 | * 存放UrlSeed.url hash && 进行种子判重 21 | * */ 22 | private final static String PREFIX_SET = "Spider.set"; 23 | 24 | /** 25 | * 根据种子的优先级先简单创建不同的几个队列 26 | * */ 27 | private final static String PREFIX_QUEUE_HIGH = "Spider.queue.high"; 28 | private final static String PREFIX_QUEUE_LOW = "Spider.queue.low"; 29 | private final static String PREFIX_QUEUE_DEFAULT = "Spider.queue.default"; 30 | 31 | /** 32 | * @param urlSeed 种子 33 | * @desciption: 34 | * 配置 jedisPool 35 | * 添加种子的URL到Set,种子序列话后的JSON文本到List 36 | * 添加种子之前需要判断种子是否已经存在。 37 | * */ 38 | public void push(UrlSeed urlSeed) { 39 | try (Jedis jedis = JedisUtils.getSingleInstance().getJedis()) { 40 | // 种子不存在 41 | if (!jedis.sismember(PREFIX_SET, urlSeed.getUrlHash())) { 42 | // 添加种子Url对应的hash到判重Set 43 | jedis.sadd(PREFIX_SET, urlSeed.getUrlHash()); 44 | // 添加种子序列化后的JSON文本到List 45 | Gson gson = new Gson(); 46 | String urlSeedToJson = gson.toJson(urlSeed); 47 | long urlSeedPriority = urlSeed.getPriority(); 48 | if (urlSeedPriority > 5) { 49 | jedis.lpush(PREFIX_QUEUE_HIGH, 
urlSeedToJson); 50 | } else if (urlSeedPriority == 5) { 51 | jedis.lpush(PREFIX_QUEUE_DEFAULT, urlSeedToJson); 52 | } else { 53 | jedis.lpush(PREFIX_QUEUE_LOW, urlSeedToJson); 54 | } 55 | } 56 | } catch (Exception e) { 57 | log.error("JedisPushUrl[{}]出错",urlSeed.toString(),e); 58 | } 59 | } 60 | 61 | /** 62 | * @return 从列表中获取的种子JSON反序列化为UrlSeed 63 | * @description: 64 | * 优先从高优先级别的列表里取种子 65 | * */ 66 | public UrlSeed pop() { 67 | Gson gson = new Gson(); 68 | String urlSeedToJson = null; 69 | UrlSeed urlSeed = null; 70 | try (Jedis jedis = JedisUtils.getSingleInstance().getJedis()) { 71 | if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_HIGH)) != null) { 72 | urlSeed = gson.fromJson(urlSeedToJson,UrlSeed.class); 73 | } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_DEFAULT)) != null) { 74 | urlSeed = gson.fromJson(urlSeedToJson,UrlSeed.class); 75 | } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_LOW)) != null) { 76 | urlSeed = gson.fromJson(urlSeedToJson,UrlSeed.class); 77 | } 78 | return urlSeed; 79 | } catch (Exception e) { 80 | log.error("JedisPopUrl [{}]出错", urlSeedToJson, e); 81 | } 82 | return gson.fromJson(urlSeedToJson,UrlSeed.class); 83 | } 84 | 85 | /** 86 | * test connection 87 | * */ 88 | public static void main(String[] args) { 89 | Jedis jedis = JedisUtils.getSingleInstance().getJedis(); 90 | System.out.println(jedis.ping()); 91 | UrlSeed urlSeed = new RedisScheduler().pop(); 92 | System.out.println(urlSeed); 93 | jedis.lpush(PREFIX_QUEUE_LOW, "dasdasdasdsa"); 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/entity/Page.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.entity; 2 | 3 | import org.jsoup.nodes.Document; 4 | 5 | import java.util.Map; 6 | import java.util.Set; 7 | 8 | /** 9 | * @author: JS 10 | * @date: 2018/3/26 11 | * @description: 12 | * 每一个UrlSeed对应的页面抽象为一个Page 13 | */ 14 | public 
class Page { 15 | 16 | /** 17 | * Page对应的UrlSeed 18 | * */ 19 | private UrlSeed urlSeed; 20 | 21 | /** 22 | * Page对应的jsoup文档 23 | * */ 24 | private Document document; 25 | 26 | /** 27 | * Page包含的url 28 | * */ 29 | private Set urlSeeds; 30 | 31 | /** 32 | * Page所包含的有用信息 33 | * */ 34 | private Map items; 35 | 36 | public Page(UrlSeed urlSeed, Document document) { 37 | this.urlSeed = urlSeed; 38 | this.document = document; 39 | } 40 | 41 | public UrlSeed getUrlSeed() { 42 | return urlSeed; 43 | } 44 | 45 | public Page setUrlSeed(UrlSeed urlSeed) { 46 | this.urlSeed = urlSeed; 47 | return this; 48 | } 49 | 50 | public Document getDocument() { 51 | return document; 52 | } 53 | 54 | public Page setDocument(Document document) { 55 | this.document = document; 56 | return this; 57 | } 58 | 59 | public Set getUrlSeeds() { 60 | return urlSeeds; 61 | } 62 | 63 | public Page setUrlSeeds(Set urlSeeds) { 64 | this.urlSeeds = urlSeeds; 65 | return this; 66 | } 67 | 68 | public Map getItems() { 69 | return items; 70 | } 71 | 72 | public Page setItems(Map items) { 73 | this.items = items; 74 | return this; 75 | } 76 | 77 | @Override 78 | public String toString() { 79 | return "Page{" + 80 | "urlSeed=" + urlSeed + 81 | ", urlSeeds=" + urlSeeds + 82 | ", items=" + items + 83 | '}'; 84 | } 85 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/entity/UrlSeed.java: --------------------------------------------------------------------------------
package com.jinshuai.entity;

import com.jinshuai.util.hash.MurmurHash;
import lombok.EqualsAndHashCode;
import lombok.ToString;

/**
 * A URL to crawl together with the hash of that URL and a priority.
 *
 * <p>The original note says URLs below a certain priority need not be parsed.
 *
 * @author: JS
 * @date: 2018/3/26
 */
@ToString
@EqualsAndHashCode
public class UrlSeed {

    /** Priority used when the caller does not supply one. */
    private static final long DEFAULT_PRIORITY = 5;

    /** URL this seed points at. */
    private String url;

    /** Hash of {@link #url}; kept in sync with it by the constructors and {@link #setUrl}. */
    private String urlHash;

    /**
     * Seed priority; defaults to {@value #DEFAULT_PRIORITY}.
     * (The original note mentions deriving the priority from a timestamp.)
     */
    private long priority = DEFAULT_PRIORITY;

    /**
     * @param url      URL to crawl
     * @param priority priority to assign instead of the default
     */
    public UrlSeed(String url, long priority) {
        this(url);
        this.priority = priority;
    }

    /**
     * Creates a seed with the default priority.
     *
     * @param url URL to crawl
     */
    public UrlSeed(String url) {
        this.url = url;
        this.urlHash = hashOf(url);
    }

    public String getUrl() {
        return url;
    }

    /**
     * Replaces the URL and recomputes its hash.
     *
     * <p>Previously the hash kept describing the old URL, so any de-duplication
     * based on {@link #getUrlHash()} used a stale value after this call.
     *
     * @param url new URL
     * @return this seed, for chaining
     */
    public UrlSeed setUrl(String url) {
        this.url = url;
        this.urlHash = hashOf(url);
        return this;
    }

    public String getUrlHash() {
        return urlHash;
    }

    public long getPriority() {
        return priority;
    }

    public UrlSeed setPriority(long priority) {
        this.priority = priority;
        return this;
    }

    /** Single place that defines how a URL is turned into its hash string. */
    private static String hashOf(String url) {
        return String.valueOf(MurmurHash.hash64(url));
    }

}
-------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/util/ExcelUtils.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.poi.openxml4j.exceptions.InvalidFormatException; 5 | import org.apache.poi.ss.usermodel.*; 6 | 7 | import java.io.File; 8 | import java.io.IOException; 9 | import java.util.ArrayList; 10 | import java.util.Iterator; 11 | import java.util.List; 12 | 13 | /** 14 | * @author: JS 15 | * @date: 2018/11/9 16 | * @description: handle excel 17 | */ 18 | @Slf4j 19 | public class ExcelUtils implements OfficeUtils { 20 | @Override 21 | public List read() { 22 | List result = new ArrayList<>(); 23 | try { 24 | Workbook wb = WorkbookFactory.create(new File("F:/XXX.xls")); 25 | wb.close(); 26 | Sheet sheet = wb.getSheetAt(0); 27 | for(Iterator rowIterator = sheet.rowIterator(); rowIterator.hasNext();) { 28 | Row row = (Row)rowIterator.next(); 29 | Cell cell = row.getCell(1); 30 | result.add(cell.getStringCellValue()); 31 | } 32 | } catch (IOException | InvalidFormatException e ) { 33 |
log.error("读取excel出错,检查路径是否正确、行列号是否越界",e); 34 | } 35 | return result; 36 | } 37 | 38 | @Override 39 | @Deprecated 40 | public void write() { 41 | } 42 | 43 | public static void main(String[] args) { 44 | List list = new ExcelUtils().read(); 45 | list.forEach(System.out::println); 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/util/JedisUtils.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import redis.clients.jedis.Jedis; 5 | import redis.clients.jedis.JedisPool; 6 | import redis.clients.jedis.JedisPoolConfig; 7 | 8 | import java.util.Map; 9 | import java.util.concurrent.ConcurrentHashMap; 10 | 11 | /** 12 | * @author: JS 13 | * @date: 2018/3/27 14 | * @description: 15 | * 对Jedis简单的封装 16 | */ 17 | @Slf4j 18 | public class JedisUtils { 19 | 20 | /** 21 | * JedisUtils实例 22 | * */ 23 | private static volatile JedisUtils jedisUtils; 24 | 25 | /** 26 | * 获取JedisUtils单例 27 | * */ 28 | public static JedisUtils getSingleInstance() { 29 | if (jedisUtils == null) { 30 | synchronized (JedisUtils.class) { 31 | jedisUtils = new JedisUtils(); 32 | } 33 | } 34 | return jedisUtils; 35 | } 36 | 37 | private JedisPool jedisPool; 38 | 39 | private JedisUtils() { 40 | init(); 41 | } 42 | 43 | private void init() { 44 | configJedisPool(); 45 | } 46 | 47 | /** 48 | * 获取套接字、密码 49 | * */ 50 | private static final String IP = PropertiesUtils.getInstance().get("redis-ip"); 51 | private static final int PORT = Integer.valueOf(PropertiesUtils.getInstance().get("redis-port")); 52 | private static final String PASSWORD = PropertiesUtils.getInstance().get("redis-password"); 53 | 54 | /** 55 | * 可用连接实例的最大数目,默认值为8; 56 | * 如果赋值为-1,则表示不限制;如果pool已经分配了maxActive个jedis实例,则此时pool的状态为exhausted(耗尽)。 57 | */ 58 | private static int MAX_ACTIVE = 2048; 59 | 60 | /** 61 | * 
控制一个pool最多有多少个状态为idle(空闲的)的jedis实例,默认值也是8。 62 | */ 63 | private static int MAX_IDLE = 200; 64 | 65 | /** 66 | * 等待可用连接的最大时间,单位毫秒,默认值为-1,表示永不超时。如果超过等待时间,则直接抛出JedisConnectionException; 67 | * */ 68 | private static int MAX_WAIT = 10000; 69 | 70 | /** 71 | * 超时时间 72 | * */ 73 | private static int TIMEOUT = 10000; 74 | 75 | /** 76 | * 保存若干个jedisPool 77 | * key 为IP+port 78 | * */ 79 | private static Map maps = new ConcurrentHashMap<>(); 80 | 81 | private void configJedisPool() { 82 | if (maps.get(IP) == null) { 83 | JedisPoolConfig jedisPoolConfig = new JedisPoolConfig(); 84 | jedisPoolConfig.setMaxTotal(MAX_ACTIVE); 85 | jedisPoolConfig.setMaxIdle(MAX_IDLE); 86 | jedisPoolConfig.setMaxWaitMillis(MAX_WAIT); 87 | jedisPoolConfig.setTestOnReturn(true); 88 | // 未设置密码 89 | if (PASSWORD == null || PASSWORD.length() == 0) { 90 | log.info("配置文件中未设置Redis密码,请确保Redis服务器不需要密码验证!!!"); 91 | jedisPool = new JedisPool(jedisPoolConfig, IP, PORT, TIMEOUT); 92 | } else { 93 | jedisPool = new JedisPool(jedisPoolConfig, IP, PORT, TIMEOUT, PASSWORD); 94 | } 95 | maps.put(IP,jedisPool); 96 | } else { 97 | jedisPool = maps.get(IP); 98 | } 99 | } 100 | 101 | /** 102 | * 从jedisPool中获取jedis 103 | * */ 104 | public Jedis getJedis() { 105 | Jedis jedis = null; 106 | try { 107 | jedis = jedisPool.getResource(); 108 | } catch (Exception e) { 109 | log.error("连接Redis失败,检查IP、端口、密码", e); 110 | } 111 | return jedis; 112 | } 113 | 114 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/util/OfficeUtils.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util; 2 | 3 | import java.util.List; 4 | 5 | public interface OfficeUtils { 6 | 7 | List read(); 8 | 9 | void write(); 10 | 11 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/util/PropertiesUtils.java: 
--------------------------------------------------------------------------------
package com.jinshuai.util;


import lombok.extern.slf4j.Slf4j;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Singleton reader for {@code /application.properties} on the classpath.
 *
 * @author: JS
 * @date: 2018/5/4
 */
@Slf4j
public class PropertiesUtils {

    private static final String CONFIG_RESOURCE = "/application.properties";

    /**
     * Values that have already been looked up. Absent keys are never stored,
     * because ConcurrentHashMap rejects null values.
     */
    private final Map<String, String> cache = new ConcurrentHashMap<>();

    private static volatile PropertiesUtils propertiesUtils;

    /** Returns the singleton, creating it on first use. */
    public static PropertiesUtils getInstance() {
        if (propertiesUtils == null) {
            synchronized (PropertiesUtils.class) {
                if (propertiesUtils == null) {
                    propertiesUtils = new PropertiesUtils();
                }
            }
        }
        return propertiesUtils;
    }

    private PropertiesUtils() {
    }

    /**
     * Looks up a configuration value.
     *
     * @param key property name; may be {@code null}
     * @return the value, or {@code null} when the key is {@code null}, the key is
     *         not defined, or the properties file cannot be read
     */
    public String get(String key) {
        if (key == null) {
            return null;
        }
        String cached = cache.get(key);
        if (cached != null) {
            return cached;
        }
        String value = loadProperties().getProperty(key);
        // Only cache hits: putting null into the map threw NullPointerException
        // for every key missing from the file.
        if (value != null) {
            cache.put(key, value);
        }
        return value;
    }

    /** Reads the properties file; returns an empty set of properties when it is missing or unreadable. */
    private static Properties loadProperties() {
        Properties properties = new Properties();
        // try-with-resources closes the stream, which used to be leaked on every lookup.
        try (InputStream inputStream = PropertiesUtils.class.getResourceAsStream(CONFIG_RESOURCE)) {
            if (inputStream == null) {
                log.error("[application.properties] not found on the classpath");
                return properties;
            }
            properties.load(inputStream);
        } catch (IOException e) {
            log.error("加载配置文件[application.properties]失败", e);
        }
        return properties;
    }

    /** Manual check: the second lookup is served from the cache. */
    public static void main(String[] args) throws IOException {
        System.out.println(PropertiesUtils.getInstance().get("dir"));
        System.out.println(PropertiesUtils.getInstance().get("dir"));
    }

}
-------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/util/hash/MurmurHash.java: --------------------------------------------------------------------------------
package com.jinshuai.util.hash;

import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * Produces a 64-bit hash of a string.
 * Code adapted from https://www.cnkirito.moe/consistent-hash-lb/
 *
 * @author: JS
 * @date: 2019/5/25
 */
public class MurmurHash {

    private static final int SEED = 0x1234ABCD;
    private static final long M = 0xc6a4a7935bd1e995L;
    private static final int R = 47;

    /**
     * Hashes the UTF-8 bytes of {@code word}.
     *
     * <p>NOTE: the result used to be masked with {@code 0xffffffffL}, which kept only
     * the low 32 bits, and the bytes came from the platform default charset. Values
     * therefore differ from those produced before this change; hashes persisted by
     * the old code will not match.
     *
     * @param word text to hash; must not be {@code null}
     * @return the hash as a non-negative integer of at most 64 bits
     */
    public static BigInteger hash64(String word) {
        // Explicit charset so the same text hashes identically on every platform.
        ByteBuffer buf = ByteBuffer.wrap(word.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        buf.order(ByteOrder.LITTLE_ENDIAN);

        long h = SEED ^ (buf.remaining() * M);

        // Mix the input eight bytes at a time.
        while (buf.remaining() >= 8) {
            long k = buf.getLong();

            k *= M;
            k ^= k >>> R;
            k *= M;

            h ^= k;
            h *= M;
        }

        // Mix the remaining 1-7 bytes, zero-padded to a full word.
        if (buf.remaining() > 0) {
            ByteBuffer finish = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN);
            finish.put(buf).rewind();
            h ^= finish.getLong();
            h *= M;
        }

        h ^= h >>> R;
        h *= M;
        h ^= h >>> R;

        // Read the long as unsigned so all 64 bits survive and the result is never negative.
        return new BigInteger(Long.toUnsignedString(h));
    }

}
-------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/util/hash/PageUtils.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util.hash; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.ansj.app.keyword.KeyWordComputer; 5 | import org.ansj.app.keyword.Keyword; 6 | 7 | import java.io.File; 8 | import java.io.IOException; 9 | import java.math.BigInteger; 10 | import
java.nio.file.Files; 11 | import java.util.*; 12 | import java.util.concurrent.ConcurrentHashMap; 13 |
/**
 * SimHash-based near-duplicate detection for documents.
 *
 * <p>Each document is reduced to a 64-bit fingerprint. The fingerprint is cut into
 * four 16-bit segments, and an inverted index from segment to fingerprints supplies
 * the candidates whose Hamming distance is then checked.
 *
 * @author: JS
 * @date: 2019/5/25
 */
@Slf4j
public class PageUtils {

    private volatile static PageUtils instance;

    /**
     * Inverted index: 16-character bit-string segment -> fingerprints containing it.
     * Buckets are concurrent sets because {@link #exist} may run on several threads.
     */
    private final Map<String, Set<BigInteger>> invertedIndex = new ConcurrentHashMap<>();

    /** Returns the singleton, creating it on first use. */
    public static PageUtils getInstance() {
        if (instance == null) {
            synchronized (PageUtils.class) {
                if (instance == null) {
                    instance = new PageUtils();
                }
            }
        }
        return instance;
    }

    private PageUtils() {
    }

    /** Number of bits in a fingerprint. */
    private static final int BIT_SIZE = 64;

    /** Number of bits in one index segment. */
    private static final int TABLE_SIZE = 16;

    /** Largest Hamming distance at which two fingerprints count as the same document. */
    private static final int HAMMING_DISTANCE = 3;

    /**
     * Reports whether a similar document has been seen before, then records this one.
     *
     * @param title   document title
     * @param content document body
     * @return {@code true} if a previously recorded fingerprint lies within
     *         {@value #HAMMING_DISTANCE} bits of this document's fingerprint
     */
    public boolean exist(String title, String content) {
        BigInteger fingerprint = getSimHash(title, content);
        // The bit string is derived from the fingerprint itself instead of being
        // passed through a ThreadLocal that was never cleared.
        String hashStr = toBitString(fingerprint);
        boolean exist = hasNearDuplicate(fingerprint, hashStr);
        // The document is indexed whether or not a match was found.
        constructInvertedIndex(fingerprint, hashStr);
        return exist;
    }

    /** Checks every candidate that shares a segment with {@code hashStr}. */
    private boolean hasNearDuplicate(BigInteger fingerprint, String hashStr) {
        for (int start = 0; start < BIT_SIZE; start += TABLE_SIZE) {
            Set<BigInteger> candidates = invertedIndex.get(hashStr.substring(start, start + TABLE_SIZE));
            if (candidates == null) {
                continue;
            }
            for (BigInteger candidate : candidates) {
                // Hamming distance = number of differing bits
                if (candidate.xor(fingerprint).bitCount() <= HAMMING_DISTANCE) {
                    return true;
                }
            }
        }
        return false;
    }

    /** Renders a fingerprint as exactly {@value #BIT_SIZE} binary digits, most significant bit first. */
    private static String toBitString(BigInteger fingerprint) {
        String binary = fingerprint.toString(2);
        StringBuilder padded = new StringBuilder(BIT_SIZE);
        for (int i = binary.length(); i < BIT_SIZE; i++) {
            padded.append('0');
        }
        return padded.append(binary).toString();
    }

    /** Runs the SimHash steps: tokenize and weight, hash, accumulate, reduce to a fingerprint. */
    private BigInteger getSimHash(String title, String content) {
        double[] featureVector = new double[BIT_SIZE];
        Collection<Keyword> result = getParticiple(title, content);
        featureVector = weightingAndCombine(featureVector, result);
        return decreaseDimensionAndGetFingerprint(featureVector);
    }

    /**
     * Extracts weighted keywords from the document.
     *
     * <p>NOTE(review): the keyword count is 5 for short content and the full content
     * length otherwise; the original marks this formula as TODO - confirm the intent.
     */
    private Collection<Keyword> getParticiple(String title, String content) {
        int keyNumber = content.length() / 2 < 5 ? 5 : content.length(); // TODO
        KeyWordComputer kwc = new KeyWordComputer(keyNumber);
        return kwc.computeArticleTfidf(title, content);
    }

    /**
     * Hashes every keyword and adds its score to each vector slot whose bit is set
     * in the hash, subtracting it from the others.
     */
    private double[] weightingAndCombine(double[] featureVector, Collection<Keyword> result) {
        for (Keyword keyword : result) {
            BigInteger keyHash = MurmurHash.hash64(keyword.getName());
            for (int i = 0; i < BIT_SIZE; i++) {
                // Slot i corresponds to bit (BIT_SIZE - 1 - i), i.e. most significant first.
                if (keyHash.testBit(BIT_SIZE - 1 - i)) {
                    featureVector[i] += keyword.getScore();
                } else {
                    featureVector[i] -= keyword.getScore();
                }
            }
        }
        return featureVector;
    }

    /** Turns the vector into a fingerprint: a bit is set where the slot is positive. */
    private BigInteger decreaseDimensionAndGetFingerprint(double[] featureVector) {
        BigInteger fingerprint = BigInteger.ZERO;
        for (int i = 0; i < BIT_SIZE; i++) {
            if (featureVector[i] > 0) {
                fingerprint = fingerprint.setBit(BIT_SIZE - 1 - i);
            }
        }
        return fingerprint;
    }

    /**
     * Adds the fingerprint to the bucket of each of its segments:
     * {@code < segment, {simhash1, simhash2, simhash3...} >}.
     */
    private void constructInvertedIndex(BigInteger fingerprint, String hashStr) {
        for (int start = 0; start < BIT_SIZE; start += TABLE_SIZE) {
            String table = hashStr.substring(start, start + TABLE_SIZE);
            // computeIfAbsent replaces a get-then-put that could drop a concurrent
            // insertion and shared plain HashSets between threads.
            invertedIndex.computeIfAbsent(table, key -> ConcurrentHashMap.newKeySet()).add(fingerprint);
        }
    }

    /** Manual check that feeds two local text files through {@link #exist}. */
    public static void main(String[] args) throws IOException {
        PageUtils pageUtil = PageUtils.getInstance();
        String first = String.join("", Files.readAllLines(new File("D:/Data/1.txt").toPath()));
        String second = String.join("", Files.readAllLines(new File("D:/Data/2.txt").toPath()));

        pageUtil.exist("学校党委理论学习中心组召开扩大会议", first);
        pageUtil.exist("校党委理论学习中心组召开专题会议学习传达全国“两会”精神", second);
    }

}
-------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/util/http/HttpUtils.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util.http; 2 | 3 | import com.jinshuai.util.PropertiesUtils; 4 | import lombok.extern.slf4j.Slf4j; 5 | import org.apache.http.Header; 6 | import org.apache.http.HttpEntity; 7 | import org.apache.http.HttpResponse; 8 | import org.apache.http.client.config.RequestConfig; 9 | import org.apache.http.client.methods.HttpGet; 10 | import org.apache.http.config.Registry; 11 | import org.apache.http.config.RegistryBuilder; 12 | import org.apache.http.config.SocketConfig; 13 | import org.apache.http.conn.socket.ConnectionSocketFactory; 14 | import org.apache.http.conn.socket.PlainConnectionSocketFactory; 15 | import org.apache.http.conn.ssl.NoopHostnameVerifier; 16 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory; 17 | import org.apache.http.conn.ssl.TrustSelfSignedStrategy; 18 | import org.apache.http.entity.ContentType; 19 | import org.apache.http.impl.client.CloseableHttpClient; 20 | import org.apache.http.impl.client.HttpClients; 21 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; 22 | import
org.apache.http.ssl.SSLContexts; 23 | import org.apache.http.util.ByteArrayBuffer; 24 | import org.apache.rocketmq.client.exception.MQClientException; 25 | import org.apache.rocketmq.client.producer.DefaultMQProducer; 26 | import org.apache.rocketmq.client.producer.SendCallback; 27 | import org.apache.rocketmq.client.producer.SendResult; 28 | import org.apache.rocketmq.common.message.Message; 29 | import org.apache.rocketmq.remoting.common.RemotingHelper; 30 | import org.apache.rocketmq.remoting.exception.RemotingException; 31 | 32 | import javax.net.ssl.HostnameVerifier; 33 | import javax.net.ssl.SSLContext; 34 | import javax.net.ssl.SSLPeerUnverifiedException; 35 | import javax.print.attribute.HashAttributeSet; 36 | import java.io.IOException; 37 | import java.io.InputStream; 38 | import java.io.UnsupportedEncodingException; 39 | import java.net.MalformedURLException; 40 | import java.net.URI; 41 | import java.net.URISyntaxException; 42 | import java.net.URL; 43 | import java.nio.charset.Charset; 44 | import java.util.HashMap; 45 | import java.util.Map; 46 | import java.util.Random; 47 | import java.util.concurrent.TimeUnit; 48 | import java.util.regex.Matcher; 49 | import java.util.regex.Pattern; 50 | 51 | /** 52 | * @author: JS 53 | * @date: 2018/3/22 54 | * @description: 55 | * 创建单例HttpUtils,获取HttpClient实例执行HTTP请求根据状态码解析响应体。 56 | */ 57 | @Slf4j 58 | public class HttpUtils { 59 | 60 | private static final ThreadLocal httpGetContainer = new ThreadLocal<>(); 61 | 62 | private static final ThreadLocal httpEntityContainer = new ThreadLocal<>(); 63 | 64 | private static volatile HttpUtils HTTPUTILS; 65 | 66 | private PoolingHttpClientConnectionManager httpClientConnectionManager; 67 | 68 | private CloseableHttpClient httpClient; 69 | 70 | private static final int MAX_TOTAL_CONNECTIONS = 20; 71 | private static final int SOCKET_TIMEOUT = 5000; 72 | private static final int MAX_CONNECTIONS_PER_ROUTE = 200; 73 | private static final int CONNECTION_REQUEST_TIMEOUT = 
5000; 74 | private static final int CONNECT_TIMEOUT = 5000; 75 | 76 | private static DefaultMQProducer producer; 77 | /** 78 | * 消息队列开关 79 | * 1-打开 80 | * 0-关闭 81 | * */ 82 | private static String mqSwitch; 83 | 84 | private static final String CHARSET = RemotingHelper.DEFAULT_CHARSET; 85 | 86 | 87 | /** 88 | * 获取HttpUtils单例 89 | * */ 90 | public static HttpUtils getSingleInstance() { 91 | if (HTTPUTILS == null) { 92 | synchronized (HttpUtils.class) { 93 | if (HTTPUTILS == null) { 94 | HTTPUTILS = new HttpUtils(); 95 | } 96 | } 97 | } 98 | return HTTPUTILS; 99 | } 100 | 101 | private HttpUtils() { 102 | init(); 103 | } 104 | 105 | private void init() { 106 | configHttpPool(); 107 | configHttpClient(); 108 | configMQ(); 109 | } 110 | 111 | /** 112 | * 配置消息队列 113 | * */ 114 | private void configMQ() { 115 | mqSwitch = PropertiesUtils.getInstance().get("mq-switch"); 116 | // the switch of MQ is closed 117 | if (mqSwitch == null || "0".equals(mqSwitch)) { 118 | return; 119 | } 120 | producer = new DefaultMQProducer("Producer-Group"); 121 | String ip = PropertiesUtils.getInstance().get("mq-ip"); 122 | String port = PropertiesUtils.getInstance().get("mq-port"); 123 | producer.setNamesrvAddr(ip + ":" + port); 124 | try { 125 | producer.start(); 126 | log.info("mq-producer started"); 127 | } catch (MQClientException e) { 128 | log.error("failed to start producer", e); 129 | } 130 | producer.setRetryTimesWhenSendAsyncFailed(0); 131 | } 132 | 133 | /** 134 | * 配置HTTP连接池 135 | * 136 | * */ 137 | private void configHttpPool() { 138 | try { 139 | // 配置SSL 140 | SSLContext sslcontext = SSLContexts.custom() 141 | .loadTrustMaterial(null, new TrustSelfSignedStrategy()) 142 | .build(); 143 | 144 | HostnameVerifier hostnameVerifier = SSLConnectionSocketFactory.getDefaultHostnameVerifier(); 145 | // 关闭域名证书验证 146 | // HostnameVerifier hostnameVerifier = NoopHostnameVerifier.INSTANCE; 147 | 148 | SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory( 149 | sslcontext, 
hostnameVerifier); 150 | 151 | Registry socketFactoryRegistry = RegistryBuilder.create() 152 | .register("http", PlainConnectionSocketFactory.getSocketFactory()) 153 | .register("https", sslsf) 154 | .build(); 155 | 156 | // 将SSL集成到HttpConnectionManager 157 | httpClientConnectionManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry); 158 | // 设置HTTP连接池最大连接数 159 | httpClientConnectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS); 160 | // 每个路由最大的连接数 161 | httpClientConnectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE); 162 | // 设置socket超时时间 163 | SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(SOCKET_TIMEOUT).build(); 164 | httpClientConnectionManager.setDefaultSocketConfig(socketConfig); 165 | } catch (Exception e) { 166 | log.error("SSL配置出错",e); 167 | } 168 | } 169 | 170 | /** 171 | * 配置HttpClient 172 | * 173 | * */ 174 | private void configHttpClient() { 175 | // 请求配置 176 | RequestConfig requestConfig = RequestConfig.custom() 177 | .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT) 178 | .setConnectTimeout(CONNECT_TIMEOUT) 179 | .build(); 180 | // 将配置信息应用到HttpClient 181 | if (httpClientConnectionManager == null) { 182 | log.error("httpClientConnectionManager未被初始化"); 183 | return; 184 | } 185 | httpClient = HttpClients.custom() 186 | .setDefaultRequestConfig(requestConfig) 187 | .setConnectionManager(httpClientConnectionManager) 188 | .build(); 189 | } 190 | 191 | /** 192 | * 配置HttpGet 193 | * 194 | * */ 195 | private HttpGet getHttpGet(String urlStr) { 196 | URL url; 197 | URI uri = null; 198 | try { 199 | url = new URL(urlStr); 200 | uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null); 201 | } catch (MalformedURLException | URISyntaxException e) { 202 | log.error("字符串格式不正确[{}]",urlStr,e); 203 | } 204 | HttpGet httpGet = new HttpGet(uri); 205 | // 添加请求头header 206 | httpGet.addHeader("Accept", "*/*"); 207 | httpGet.addHeader("Accept-Encoding", "gzip, deflate"); 208 | 
httpGet.addHeader("Connection", "keep-alive"); 209 | int randomUserAgent = new Random().nextInt(UserAgentArray.USER_AGENT.length); 210 | httpGet.addHeader("User-Agent",UserAgentArray.USER_AGENT[randomUserAgent]); 211 | 212 | return httpGet; 213 | } 214 | 215 | /** 216 | * 发Get请求 217 | * 218 | * */ 219 | private void sendRequest(String urlStr) { 220 | HttpGet httpGet = httpGetContainer.get(); 221 | try { 222 | HttpResponse response = httpClient.execute(httpGet); 223 | // 根据状态码执行不同的操作 224 | int statusCode = response.getStatusLine().getStatusCode(); 225 | StatusHandler strategy = StatusContext.getStrategy(statusCode); 226 | strategy.process(urlStr, response); 227 | } catch (IOException e) { 228 | log.error("IO出错[{}]", urlStr, e); 229 | } 230 | } 231 | 232 | /** 233 | * 获取 HttpEntity 234 | * 235 | * */ 236 | public String getContent(String urlStr) { 237 | // url为空或者不是http协议 238 | if (urlStr == null || !urlStr.startsWith("http")) { 239 | return null; 240 | } 241 | // 防止SSL过程中的握手警报 http://dovov.com/ssljava-1-7-0unrecognized_name.html 242 | if (urlStr.startsWith("https")) { 243 | System.setProperty("jsse.enableSNIExtension", "false"); 244 | } 245 | String content = null; 246 | try { 247 | httpGetContainer.set(getHttpGet(urlStr)); 248 | sendRequest(urlStr); 249 | HttpEntity httpEntity = httpEntityContainer.get(); 250 | if (httpEntity == null) { 251 | log.error("HttpEntity为空"); 252 | return null; 253 | } 254 | InputStream inputStream = httpEntity.getContent(); 255 | content = parseStream(inputStream, httpEntity); 256 | } catch (IOException e) { 257 | log.error("获取响应流失败", e); 258 | } catch (Exception e) { 259 | log.error("获取内容异常", e); 260 | } finally { 261 | httpGetContainer.get().releaseConnection(); 262 | httpGetContainer.remove(); 263 | } 264 | return content; 265 | } 266 | 267 | /** 268 | * 解析响应流 269 | * 270 | * */ 271 | private String parseStream(InputStream inputStream, HttpEntity httpEntity) { 272 | String pageContent = null; 273 | // 获取页面编码:1. 从响应头content-type 2. 
如果没有则从返回的HTML中获取Meta标签里的编码 274 | ByteArrayBuffer byteArrayBuffer = new ByteArrayBuffer(4096); 275 | byte[] tempStore = new byte[4096]; 276 | int count; 277 | try { 278 | // read(tempStore) 会重新从零开始存->刷新字节数组 ,并返回读到的字节数量 279 | while ((count = inputStream.read(tempStore)) != -1) { 280 | byteArrayBuffer.append(tempStore, 0, count); 281 | } 282 | // TODO:下面复制粘贴的:https://github.com/xjtushilei/ScriptSpider 283 | // 根据获取的字节编码转为String类型 284 | String charset = "UTF-8"; 285 | ContentType contentType = ContentType.getOrDefault(httpEntity); 286 | Charset charsets = contentType.getCharset(); 287 | pageContent = new String(byteArrayBuffer.toByteArray()); 288 | // 如果响应头中含有content-type字段,直接读取然后设置编码即可。 289 | if (null != charsets) { 290 | charset = charsets.toString(); 291 | } else { 292 | // 发现HttpClient带的功能有问题,这里自己又写了一下。 293 | Pattern pattern = Pattern.compile("([\\s\\S]*?) status2Handler = new HashMap<>(); 336 | 337 | static { 338 | status2Handler.put(2, SuccessStrategy.getInstance()); 339 | status2Handler.put(3, RedirectStrategy.getInstance()); 340 | status2Handler.put(4, ClientErrorStrategy.getInstance()); 341 | status2Handler.put(5, ServerErrorStrategy.getInstance()); 342 | } 343 | 344 | static StatusHandler getStrategy(int statusCode) { 345 | return status2Handler.get(statusCode / 100); 346 | } 347 | 348 | } 349 | 350 | /** 351 | * 2XX 策略 352 | * 成功获取响应时对应的执行策略 353 | * 354 | * */ 355 | public static class SuccessStrategy implements StatusHandler { 356 | 357 | private static final StatusHandler statusHandler = new SuccessStrategy(); 358 | 359 | static StatusHandler getInstance() { 360 | return statusHandler; 361 | } 362 | 363 | @Override 364 | public void process(String url, HttpResponse response) { 365 | httpEntityContainer.set(response.getEntity()); 366 | } 367 | 368 | } 369 | 370 | /** 371 | * 3XX 策略 372 | * 重定向时对应的执行策略 373 | * 374 | * */ 375 | public static class RedirectStrategy implements StatusHandler { 376 | 377 | private static final StatusHandler statusHandler = new 
RedirectStrategy(); 378 | 379 | static StatusHandler getInstance() { 380 | return statusHandler; 381 | } 382 | 383 | @Override 384 | public void process(String url, HttpResponse response) { 385 | Header location = response.getFirstHeader("Location"); 386 | // 将location对应的URL放到仓库中 387 | // scheduler.push(new UrlSeed(location.getValue(), 5)); 388 | log.error("301: 资源已被重定向[{}]", url); 389 | sendMessage(location.getValue(), "Redirect-Topic"); 390 | } 391 | 392 | } 393 | 394 | /** 395 | * 4XX 策略 396 | * 主要处理需要认证的资源401,需要授权的资源403,以及不存在的资源404 397 | * 当请求次数过多以后,就容易报403 398 | * 当 401,403时,将资源放到低优先级的队列或者消息队列中,额外处理。 TODO 399 | * */ 400 | public static class ClientErrorStrategy implements StatusHandler { 401 | 402 | private static final StatusHandler statusHandler = new ClientErrorStrategy(); 403 | 404 | static StatusHandler getInstance() { 405 | return statusHandler; 406 | } 407 | 408 | @Override 409 | public void process(String url, HttpResponse response) { 410 | int status = response.getStatusLine().getStatusCode(); 411 | if (status == 401 || status == 403) { 412 | log.warn("401: 无权访问此资源[{}]", url); 413 | // send to mq 414 | sendMessage(url, "Forbidden-Topic"); 415 | } else if (status == 404) { 416 | log.warn("404: 请求的资源不存在[{}]", url); 417 | } 418 | 419 | } 420 | 421 | } 422 | 423 | /** 424 | * 5XX 策略 425 | * 远端服务器出错,应对办法是暂时停止爬虫 TODO 426 | * */ 427 | public static class ServerErrorStrategy implements StatusHandler { 428 | 429 | private static final StatusHandler statusHandler = new ServerErrorStrategy(); 430 | 431 | static StatusHandler getInstance() { 432 | return statusHandler; 433 | } 434 | 435 | @Override 436 | public void process(String url, HttpResponse response) { 437 | log.error("500: 远端服务器出错[{}]", url); 438 | Header retryAfter = response.getFirstHeader("Retry-After"); 439 | long waitSeconds = 20; 440 | if (retryAfter != null) { 441 | waitSeconds = Long.parseLong(retryAfter.getValue()); 442 | } 443 | log.info("由于远程服务器出错,爬虫休息 [{}] 秒后,尝试继续执行任务.....", waitSeconds); 444 
| try { 445 | TimeUnit.SECONDS.sleep(waitSeconds); 446 | } catch (InterruptedException e) { 447 | log.error("sleep error", e); 448 | } 449 | sendMessage(url, "ServerWrong-Topic"); 450 | } 451 | 452 | } 453 | 454 | /** 455 | * Test HttpUtils 456 | * 457 | * 具体逻辑:HttpClient用封装好的HttpGet发送get请求,获取HttpEntity,从HttpEntity中获取响应内容以及响应头 458 | * 从响应头Content-Type中获取charset编码格式,如果响应头中没有编码格式响应头,就从响应内容中解析meta标签获取编码格式 459 | * 然后将字节数组按响应头中的编码格式创建字符串 460 | * */ 461 | public static void main(String[] args) { 462 | String url1 = "https://jinshuai86.github.io/about"; 463 | String url2 = "http://port.patentstar.cn/bns/PtDataSvc.asmx?op=GetPatentData&_strPID=CN105961023A&_PdTpe=CnDesXmlTxt"; 464 | String url3 = "https://www.toutiao.com/"; 465 | String url4 = "http://xww.hebut.edu.cn"; 466 | String url5 = "http://www.baidu.com"; 467 | String url7 = "https://www.douban.com"; 468 | String url8 = "https://baike.baidu.com/item/"; 469 | String[] arr = {"碳酸铵","硫酸铁", "醋酸钠", "碳酸钙", "氢氧化钠", "硫酸亚铁", "高锰酸钾"}; 470 | // for (int i = 0; i < 10; i++) { 471 | // HttpUtils.getSingleInstance().sendMessage("https://www.douban.com", "Forbidden-Topic"); 472 | // } 473 | for (int i = 0; i < 2; i++) { 474 | // System.out.println(i + " ============ "); 475 | // HttpUtils.getSingleInstance().getContent(url8 + arr[i % arr.length]); 476 | System.out.println(HttpUtils.getSingleInstance().getContent(url3)); 477 | } 478 | } 479 | 480 | } -------------------------------------------------------------------------------- /spider-core/src/main/java/com/jinshuai/util/http/StatusHandler.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util.http; 2 | 3 | import org.apache.http.HttpResponse; 4 | 5 | /** 6 | * @author: JS 7 | * @date: 2019/4/12 8 | * @description: 状态码处理策略 9 | */ 10 | public interface StatusHandler { 11 | 12 | void process(String URL, HttpResponse response); 13 | 14 | } -------------------------------------------------------------------------------- 
/spider-core/src/main/java/com/jinshuai/util/http/UserAgentArray.java: -------------------------------------------------------------------------------- 1 | package com.jinshuai.util.http; 2 | 3 | /** 4 | * @author: JS 5 | * @date: 2018/3/23 6 | * @description: 7 | * A hand-collected table of browser User-Agent strings, intended (per the author) to keep the target server from throttling repeated requests. 8 | * TODO: move these values into a file 9 | */ 10 | public class UserAgentArray { 11 | 12 | public static final String[] USER_AGENT = { 13 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 14 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 15 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 16 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 17 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 18 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 19 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 20 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 21 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 22 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 23 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 24 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 25 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 26 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 27 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 28 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 29 | }; 30 | 31 | } -------------------------------------------------------------------------------- /spider-core/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | # Redis has no password authentication by default; if a password is required, set redis-password=<your password> below, otherwise leave it empty 2 | redis-ip=127.0.0.1 3 | redis-port=6379 4 | redis-password= 5 | 6 | # Directory where parsed content is stored 7 | dir=D:/Data/HEBUTNews/core 8 | 9 | # Enable text similarity detection 10 | similarCheck=false 11 | 12 | # Target number of tasks 13 | targetNum=100 14 | 15 | # MQ configuration 16 | mq-ip=127.0.0.1 17 | mq-port=9876 18 | # MQ switch: 0 = off, 1 = on 19 | mq-switch=0 -------------------------------------------------------------------------------- /spider-core/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 12 | 13 | 14 | 15 | spider-core 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | [%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n 25 | 26 | UTF-8 27 | 28 | 29 | INFO 30 | ACCEPT 31 | DENY 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | ERROR 58 | 59 | 60 | ${LOG_HOME}/error.%d{yyyy-MM-dd}.log 61 | 30 62 | 63 | 64 | UTF-8 65 | [%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n 66 | 67 | 68 | 69 | 70 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 96 | -------------------------------------------------------------------------------- /spider-core/src/test/java/com/TestGson.java: -------------------------------------------------------------------------------- 1 | package com; 2 | 3 | import com.google.gson.Gson; 4 | import com.jinshuai.entity.UrlSeed; 5 | 
import org.ansj.splitWord.analysis.*; 6 | import org.junit.Assert; 7 | import org.junit.Test; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | /** 12 | * @author: JS 13 | * @date: 2018/3/27 14 | * @description: 15 | */ 16 | public class TestGson { 17 | 18 | @Test 19 | public void testSegment() { 20 | String[] arr = {"碳酸铵","硫酸铁", "醋酸钠", "碳酸钙", "氢氧化钠", "硫酸亚铁", "高锰酸钾"}; 21 | for (String str : arr) { 22 | System.out.println(BaseAnalysis.parse(str)); 23 | System.out.println(ToAnalysis.parse(str)); 24 | System.out.println(DicAnalysis.parse(str)); 25 | System.out.println(IndexAnalysis.parse(str)); 26 | System.out.println(NlpAnalysis.parse(str)); 27 | } 28 | 29 | } 30 | 31 | } -------------------------------------------------------------------------------- /spider-core/src/test/java/com/TestHttpClient.java: -------------------------------------------------------------------------------- 1 | package com; 2 | 3 | import org.apache.http.HttpEntity; 4 | import org.apache.http.HttpResponse; 5 | import org.apache.http.NameValuePair; 6 | import org.apache.http.client.ClientProtocolException; 7 | import org.apache.http.client.ResponseHandler; 8 | import org.apache.http.client.entity.UrlEncodedFormEntity; 9 | import org.apache.http.client.methods.CloseableHttpResponse; 10 | import org.apache.http.client.methods.HttpGet; 11 | import org.apache.http.client.methods.HttpPost; 12 | import org.apache.http.client.utils.URIBuilder; 13 | import org.apache.http.impl.client.AbstractHttpClient; 14 | import org.apache.http.impl.client.CloseableHttpClient; 15 | import org.apache.http.impl.client.HttpClientBuilder; 16 | import org.apache.http.impl.client.HttpClients; 17 | import org.apache.http.message.BasicNameValuePair; 18 | import org.apache.http.util.EntityUtils; 19 | 20 | import java.io.IOException; 21 | import java.net.URI; 22 | import java.net.URISyntaxException; 23 | import java.util.ArrayList; 24 | import java.util.HashMap; 25 | import java.util.List; 26 | 
import java.util.Random; 27 | 28 |
/**
 * Scratch harness for trying out Apache HttpClient POST and GET requests.
 *
 * @author: JS
 * @date: 2018/3/22
 */
public class TestHttpClient{

    /**
     * Client shared by every method in this class. The individual test methods do not close
     * it, so they can be invoked in any order and more than once.
     */
    private static final CloseableHttpClient httpClient = HttpClients.createDefault();

    /**
     * Stateless handler that returns the body of a 2xx response decoded as UTF-8 (or
     * {@code null} when there is no entity) and rejects every other status.
     * Created eagerly at class initialization instead of lazily under double-checked locking,
     * which is not safe on a non-volatile field.
     */
    private static final ResponseHandler<String> RESPONSE_HANDLER = new ResponseHandler<String>() {
        @Override
        public String handleResponse(final HttpResponse httpResponse) throws ClientProtocolException, IOException {
            int status = httpResponse.getStatusLine().getStatusCode();
            if (status >= 200 && status < 300) {
                HttpEntity entity = httpResponse.getEntity();
                return entity != null ? EntityUtils.toString(entity,"UTF-8") : null;
            } else {
                throw new ClientProtocolException("Unexpected response status: " + status);
            }
        }
    };

    public static void main(String[] args) {
        testPost();
    }

    /**
     * Posts a url-encoded login form and prints the status line followed by the response body.
     * I/O failures are printed and otherwise ignored, as in the rest of this harness.
     */
    static void testPost() {
        HttpPost httpPost = new HttpPost("http://ikc.hebut.edu.cn/view/User/Login.ashx");
        // NOTE(review): credentials are hard-coded in this scratch test.
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("userid", "js_214"));
        nvps.add(new BasicNameValuePair("userpassword", "123456"));
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(nvps));
            // try-with-resources: the response is closed on every path and is never
            // dereferenced when execute() itself failed.
            try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                System.out.println(response.getStatusLine());
                HttpEntity httpEntity = response.getEntity();
                if (httpEntity != null) {
                    // Read the body first; consuming the entity before reading it would
                    // leave nothing to read.
                    System.out.println(EntityUtils.toString(httpEntity));
                    EntityUtils.consume(httpEntity);
                }
            }
        } catch(IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Issues the same GET twice: once directly, printing the status line, and once through
     * the shared response handler, printing the body.
     */
    static void testGet() {
        HttpGet httpGet = new HttpGet(getURI("https","baike.baidu.com","/item/数据库引擎"));
        try {
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                System.out.println(response.getStatusLine());
                // Drain the body so the connection can be handed back for the second request.
                EntityUtils.consume(response.getEntity());
            }
            System.out.println("Executing request " + httpGet.getRequestLine());

            String responseBody = httpClient.execute(httpGet, getSingleResponseHandlerInstance());
            System.out.println("----------------------------------------");
            System.out.println(responseBody);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the single shared response handler.
     *
     * @return the handler described on {@link #RESPONSE_HANDLER}; never {@code null}
     */
    static ResponseHandler<String> getSingleResponseHandlerInstance() {
        return RESPONSE_HANDLER;
    }

    /**
     * Builds a URI from its parts. The query always carries the fixed parameters
     * {@code btnG}, {@code aq} and {@code oq}; entries of any supplied maps are applied
     * afterwards, replacing a fixed parameter of the same name.
     *
     * @param scheme     URI scheme, e.g. {@code "https"}
     * @param host       host name
     * @param path       path component
     * @param parameters optional extra query parameters; {@code null} maps are skipped
     * @return the assembled URI, never {@code null}
     * @throws IllegalArgumentException if the parts do not form a valid URI
     */
    @SafeVarargs
    static URI getURI(String scheme, String host, String path, HashMap<String, String>... parameters) {
        URIBuilder builder = new URIBuilder()
                .setScheme(scheme)
                .setHost(host)
                .setPath(path)
                .setParameter("btnG", "Google Search")
                .setParameter("aq", "f")
                .setParameter("oq", "");
        if (parameters != null) {
            for (HashMap<String, String> extra : parameters) {
                if (extra == null) {
                    continue;
                }
                for (String key : extra.keySet()) {
                    builder.setParameter(key, extra.get(key));
                }
            }
        }
        try {
            return builder.build();
        } catch (URISyntaxException e) {
            // Fail here with the cause attached rather than handing null to HttpGet.
            throw new IllegalArgumentException("Invalid URI parts: " + scheme + "://" + host + path, e);
        }
    }

}
-------------------------------------------------------------------------------- /spider-core/src/test/java/com/TestJDBC.java: -------------------------------------------------------------------------------- 1 | package com; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.InputStream; 6 | import java.sql.Connection; 7 | import java.sql.DriverManager; 8 | import java.sql.SQLException; 9 | 10 | /** 11 | * @author: JS 12 | * @date: 2018/4/22 13 | * @description: 14 | * conn.close() 调用关闭以后 15 | */ 16 | public class TestJDBC { 17 | 18 | 
/** Scratch experiment around closing a JDBC connection: loads the driver class, opens a connection with an empty URL, closes it, then opens a file with an empty path in try-with-resources and calls close() on the same connection again inside that block. NOTE(review): the repeated close() looks deliberate (the class comment is about what happens after conn.close()) - confirm before simplifying. */ public static void main(String[] args) throws ClassNotFoundException, SQLException { 19 | Class.forName("com.mysql.jdbc.Driver"); 20 | Connection connection = DriverManager.getConnection(""); 21 | connection.close(); 22 | try (InputStream inputStream = new FileInputStream(new File(""))) { 23 | connection.close(); 24 | } catch (Exception e) { /* any exception raised in the try block or while opening the file is silently discarded */ 25 | 26 | } finally { 27 | // ... 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spider-core/src/test/java/com/TestJsoup.java: -------------------------------------------------------------------------------- 1 | package com; 2 | 3 | import junit.framework.TestCase; 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.nodes.Document; 6 | import org.jsoup.nodes.Element; 7 | import org.jsoup.select.Elements; 8 | 9 | import java.io.IOException; 10 | import java.util.HashSet; 11 | import java.util.Iterator; 12 | import java.util.Set; 13 | 14 | /** 15 | * @author: JS 16 | * @date: 2018/3/23 17 | * @description: 18 | */ 19 | public class TestJsoup extends TestCase{ 20 | 21 | public void testJsoup() { 22 | Document document = Jsoup.parse("\n" + 23 | "\n" + 24 | "\n" + 25 | " \n" + 26 | " \n" + 27 | " \n" + 28 | " \n" + 29 | " 计算机辅助创新设计公共服务平台\n" + 30 | " \n" + 31 | " \n" + 32 | " \n" + 33 | " \n" + 34 | " \n" + 35 | " \n" + 36 | " \n" + 41 | " \n" + 42 | " \n" + 43 | " \n" + 44 | " \n" + 45 | " \n" + 46 | "\n" + 47 | "\n" + 48 | "
\n" + 49 | "
\n" + 50 | "\t
\n" + 51 | "

\n" + 52 | "\t\t\n" + 53 | " 计算机辅助创新设计公共服务平台
\n" + 54 | " \n" + 55 | "
\n" + 56 | " \n" + 57 | "

\n" + 58 | " \n" + 59 | " \n" + 60 | " \n" + 61 | " \n" + 62 | "\t\t\n" + 67 | "\t\t
\n" + 68 | "\t\t\t
\n" + 69 | "\t\t\t\t\n" + 70 | "\t\t\t\t
    \n" + 71 | "\t\t\t
    \n" + 72 | "\t\t\t\n" + 77 | "\t\t
    \n" + 78 | "\n" + 79 | "\t\tTRIZ创新辅助APP(下载量:)\n" + 80 | " \n" + 81 | "\t
    \n" + 82 | "\t\n" + 83 | "\n" + 84 | "\t\n" + 85 | "\n" + 86 | "\t\n" + 91 | "\n" + 92 | "\n" + 93 | "
    \n" + 94 | "
    \n" + 95 | "
    \n" + 96 | "\t\n" + 97 | "
    \n" + 98 | "\t
    \n" + 99 | "
    \n" + 100 | "\n" + 101 | "\t\t\t\n" + 102 | "\t\n" + 103 | "\t
    \n" + 104 | "\n" + 105 | "\t\t
    \n" + 106 | "\t\t\t
    \n" + 107 | "\t\t\t\t\t\t\t\t\t \n" + 108 | " \n" + 109 | "\t\t\t\t\t\t\t\t
    \n" + 110 | "
    \n" + 111 | "\t\t\t
    \n" + 112 | "\n" + 113 | "\t\t\t
    \n" + 114 | "\t\t\t
    \n" + 115 | "\t\t\t\t\t\t\t\t\t\n" + 116 | "\t\t\t\t\t\t\t\t
    \n" + 117 | "\t\t\t
    \n" + 118 | "\n" + 119 | "\t\t\t
    \n" + 120 | "\t\t\t
    \n" + 121 | "\t\t\t\t\t\t\t\t\t\n" + 122 | "\t\t\t\t\t\t\t\t
    \n" + 123 | "\t\t\t\t\t
    \t\t\t\t\t\t\t\t\t
    \n" + 124 | "\t\t\t
    \n" + 125 | "\n" + 126 | "\t\t\t
    \n" + 127 | "\t
    \n" + 128 | "\n" + 129 | "\t
    \n" + 130 | "\n" + 131 | "\t
    Previous
    \n" + 132 | "\t
    Next
    \n" + 133 | "\n" + 134 | "\t
    • Go to slide 1
    • Go to slide 2
    • Go to slide 3
    \n" + 135 | "\n" + 136 | "\t\n" + 137 | "\n" + 138 | "\t
    \t
    \n" + 139 | "
    \n" + 140 | "\n" + 141 | "\n" + 143 | "\n" + 144 | "\n" + 145 | "
    \n" + 146 | "\n" + 147 | "\n" + 148 | "\t
    \n" + 149 | "\t

    创新动态

    \n" + 150 | "
    \n" + 151 | "
    \n" + 152 | "\t
    \n" + 153 | "
    \n" + 154 | "\n" + 155 | "\n" + 156 | "\n" + 157 | "
    \n" + 158 | "\t
    \n" + 159 | "\t

    创意展示 - 新产品

    \n" + 160 | "
    \n" + 161 | "
    \n" + 162 | "\t
    \n" + 163 | "
    \n" + 164 | "\n" + 165 | "
    \n" + 166 | "\t
    \n" + 167 | "\t

    创意展示 - 新创意

    \n" + 168 | "
    \n" + 169 | "
    \n" + 170 | "\t
    \n" + 171 | "
    \n" + 172 | "\n" + 173 | "
    \n" + 174 | "\n" + 175 | "\t
    \n" + 176 | "\t

    知识共享

    \n" + 177 | "
    \n" + 178 | "
    \n" + 179 | "\t
    \n" + 180 | "
    \n" + 181 | "\n" + 182 | "\n" + 183 | "
    \n" + 184 | "\t
    \n" + 185 | "\t

    方案征集

    \n" + 186 | "
    \n" + 187 | "
    \n" + 188 | "\n" + 189 | "
    \n" + 190 | "\n" + 191 | "
    \n" + 192 | "\t\n" + 195 | "\t
    \n" + 196 | "
    \n" + 197 | "\n" + 198 | "\n" + 199 | "\n" + 200 | "\n" + 201 | "\n" + 202 | "\n" + 203 | "\n" + 204 | "\n" + 205 | "\n" + 206 | "
    \n" + 207 | " \n" + 208 | "
    \n" + 209 | "

    \n" + 210 | " 友情链接 \n" + 211 | "

    \n" + 212 | " \n" + 213 | " \"\" \n" + 215 | " \"\" \n" + 217 | " \"\"\n" + 218 | " \n" + 219 | " \"\" \n" + 221 | " \"\" \n" + 223 | " \"\"\n" + 224 | "
    \n" + 225 | " \n" + 237 | " \n" + 252 | " \n" + 267 | "
    \n" + 268 | "
    \n" + 269 | " \n" + 270 | "
    \n" + 271 | "
    \n" + 272 | "
    \n" + 273 | "
    \n" + 274 | "版权所有:河北工业大学CAI研究实验室 天津市北辰区西平道 5340 号, 邮编:300401\n" + 275 | "
    \n" + 276 | "
    \n" + 277 | "
    \n" + 278 | "
    \n" + 279 | "
    \n" + 280 | "
    \n" + 281 | "
    \n" + 282 | "
    \n" + 283 | "

    \n" + 285 | " 订阅计算机辅助创新设计公共服务平台

    \n" + 286 | "
    \n" + 287 | "
    \n" + 288 | "

    \n" + 289 | " 订阅地址
    \n" + 290 | "

    \n" + 291 | "

    \n" + 292 | " 订阅到
    \n" + 293 | " \n" + 294 | " QQ邮箱 \n" + 295 | " 鲜果 \n" + 296 | " 抓虾

    \n" + 297 | "
    \n" + 298 | "
    \n" + 299 | "
    \n" + 300 | " \n" + 303 | "
    \n" + 304 | " \n" + 305 | "\n" + 306 | "\n" + 344 | ""); 345 | Document document1 = null; 346 | try { 347 | document1 = Jsoup.connect("http://xww.hebut.edu.cn/gdyw/67001.htm").get(); 348 | // // 标题 349 | // Element element = document1.selectFirst("div.sub_articleTitle"); 350 | // System.out.println(element.getElementsByTag("h2").text()); 351 | // // 时间 352 | // Element element1 = document1.selectFirst("div.sub_articleAuthor"); 353 | // System.out.println(element1.getElementsByTag("strong").eachText().get(0)); 354 | // // 正文 355 | // Element element2 = document1.selectFirst("div.sub_articleInfo"); 356 | 357 | //StringBuilder stringBuilder = new StringBuilder(); 358 | Set urlSeeds = new HashSet(); 359 | Iterator iterator = document1.getElementsByTag("a").iterator(); 360 | while (iterator.hasNext()) { 361 | Element element3 = (Element) iterator.next(); 362 | String href = element3.attr("href").toString(); 363 | if (href.contains("/") || href.contains("#")) continue; 364 | urlSeeds.add("http://xww.hebut.edu.cn/gdyw/" + href); 365 | //stringBuilder.append(element3.text()); 366 | } 367 | urlSeeds.remove("index.html"); 368 | urlSeeds.remove(""); 369 | urlSeeds.remove("javascript:void(0);"); 370 | System.out.println(urlSeeds); 371 | 372 | 373 | } catch (IOException e) { 374 | e.printStackTrace(); 375 | } 376 | //System.out.println(document1.getElementsByTag("h2")); 377 | 378 | } 379 | 380 | } 381 | -------------------------------------------------------------------------------- /spider-core/src/test/java/com/TestRedis.java: -------------------------------------------------------------------------------- 1 | package com; 2 | 3 | import org.apache.commons.pool2.impl.GenericObjectPoolConfig; 4 | import redis.clients.jedis.Jedis; 5 | 6 |
/**
 * Manual connectivity check against a Redis server on the local machine.
 *
 * @author: JS
 * @date: 2018/3/27
 */
public class TestRedis {

    /**
     * Connects to Redis at 127.0.0.1:6379, prints the reply to PING and closes the connection.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Connect to the local Redis service; try-with-resources closes the connection
        // even when ping() throws.
        try (Jedis jedis = new Jedis("127.0.0.1",6379)) {
            // Check whether the service is running.
            System.out.println("服务正在运行: "+jedis.ping());
        }
    }

}
-------------------------------------------------------------------------------- /spider-core/src/test/java/com/TestReg.java: -------------------------------------------------------------------------------- 1 | package com; 2 | 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | /** 7 | * @author: JS 8 | * @date: 2018/3/23 9 | * @description: 10 | */ 11 | public class TestReg { 12 | 13 | public static void main(String[] args) { 14 | String src = "\n" + 15 | "\n" + 16 | " \n" + 17 | "\n" + 18 | "\n" + 19 | "\n" + 20 | "\n" + 21 | "\n" + 22 | "\n" + 23 | "\n" + 24 | ""; 25 | Pattern pattern = Pattern.compile("([\\s\\S]*?)