├── .gitignore
├── README.md
├── _config.yml
├── pom.xml
├── spider-consumer
    ├── README.md
    ├── pom.xml
    └── src
    │   └── main
    │       ├── java
    │           └── com
    │           │   └── jinshuai
    │           │       ├── Consumer.java
    │           │       ├── core
    │           │           ├── downloader
    │           │           │   ├── Downloader.java
    │           │           │   └── impl
    │           │           │   │   └── HttpClientPoolDownloader.java
    │           │           ├── parser
    │           │           │   ├── Parser.java
    │           │           │   └── impl
    │           │           │   │   └── NewsParser.java
    │           │           ├── saver
    │           │           │   ├── Saver.java
    │           │           │   └── impl
    │           │           │   │   └── TextSaver.java
    │           │           └── scheduler
    │           │           │   ├── Scheduler.java
    │           │           │   └── impl
    │           │           │       └── RedisScheduler.java
    │           │       ├── entity
    │           │           ├── Page.java
    │           │           └── UrlSeed.java
    │           │       └── util
    │           │           ├── JedisUtils.java
    │           │           ├── PropertiesUtils.java
    │           │           ├── hash
    │           │               ├── MurmurHash.java
    │           │               └── PageUtils.java
    │           │           └── http
    │           │               ├── HttpUtils.java
    │           │               ├── StatusHandler.java
    │           │               └── UserAgentArray.java
    │       └── resources
    │           ├── application.properties
    │           └── logback.xml
├── spider-core
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── java
    │       │   └── com
    │       │   │   └── jinshuai
    │       │   │       ├── Spider.java
    │       │   │       ├── core
    │       │   │           ├── README.md
    │       │   │           ├── downloader
    │       │   │           │   ├── Downloader.java
    │       │   │           │   └── impl
    │       │   │           │   │   └── HttpClientPoolDownloader.java
    │       │   │           ├── parser
    │       │   │           │   ├── Parser.java
    │       │   │           │   └── impl
    │       │   │           │   │   ├── BaiKeParser.java
    │       │   │           │   │   └── NewsParser.java
    │       │   │           ├── saver
    │       │   │           │   ├── Saver.java
    │       │   │           │   └── impl
    │       │   │           │   │   ├── DataBaseSaver.java
    │       │   │           │   │   └── TextSaver.java
    │       │   │           └── scheduler
    │       │   │           │   ├── Scheduler.java
    │       │   │           │   └── impl
    │       │   │           │       ├── PriorityQueueScheduler.java
    │       │   │           │       └── RedisScheduler.java
    │       │   │       ├── entity
    │       │   │           ├── Page.java
    │       │   │           └── UrlSeed.java
    │       │   │       └── util
    │       │   │           ├── ExcelUtils.java
    │       │   │           ├── JedisUtils.java
    │       │   │           ├── OfficeUtils.java
    │       │   │           ├── PropertiesUtils.java
    │       │   │           ├── hash
    │       │   │               ├── MurmurHash.java
    │       │   │               └── PageUtils.java
    │       │   │           └── http
    │       │   │               ├── HttpUtils.java
    │       │   │               ├── StatusHandler.java
    │       │   │               └── UserAgentArray.java
    │       └── resources
    │       │   ├── application.properties
    │       │   └── logback.xml
    │   └── test
    │       └── java
    │           └── com
    │               ├── TestGson.java
    │               ├── TestHttpClient.java
    │               ├── TestJDBC.java
    │               ├── TestJsoup.java
    │               ├── TestRedis.java
    │               └── TestReg.java
└── spider-flowchart.svg


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | *.iml,*.idea
 3 | '.idea'
 4 | *.iml
 5 | logs/
 6 | # Mobile Tools for Java (J2ME)
 7 | .mtj.tmp/
 8 | .idea/
 9 | # Package Files #
10 | *.jar
11 | *.war
12 | .idea/*
13 | .idea*
14 | \.idea*
15 | *.ear
16 | target
17 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
18 | hs_err_pid*
19 | *.MF


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Spider
 2 | 
 3 | Spider是一个基于Java的简易多线程爬虫框架，并且提供了默认组件。用户也可以根据需要实现自己的组件  
 4 | - 具体流程
 5 |   - 首先在调度器中添加初始种子，开启线程池。
 6 |   - 工作线程开始从种子调度器中取URL种子
 7 |   - 使用下载器获取URL对应的页面内容
 8 |   - 使用解析器解析页面内容，将页面里的URL封装成URL种子，添加到种子调度器中。
 9 |   - 持久器会判断是否已经存在类似的文本内容，如果存在会做丢弃处理，否则会将页面中的内容做持久化处理。
10 |   
11 | ![流程图](./spider-flowchart.svg)
12 | 
13 | # 使用
14 | 
15 | ## 开发环境
16 | - JDK8+
17 | - Maven3+
18 | - lombok
19 | 
20 | ## 使用步骤
21 | - 修改`application.properties`中存放解析内容的路径`dir`
22 | - 如果使用`Redis`作为种子调度器(默认使用优先队列)，需要修改`application.properties`中配置的`redis-ip`、`redis-port`和`redis-password`。如果你的Redis不需要密码验证，就不用修改文件里的`password`属性。
23 | - 如果使用消息队列(框架使用的是[RocketMQ](https://rocketmq.apache.org/))，需要修改`application.properties`中配置的`mq-ip`、`mq-port`。不用的话可以将`mq-switch`置为0，关闭消息队列。
24 | - 默认解析器(`NewsParser`)解析的是hebut新闻网(`xww.hebut.edu.cn`)的新闻页面；如果要解析其它类型的网页，需要实现`Parser`接口，并修改提供给种子调度器的初始种子
25 | - 运行`Spider.java`
26 | ```Java
27 |         Spider.build()
28 |                 .addUrlSeed(new UrlSeed("http://xww.hebut.edu.cn/gdyw/index.htm"))
29 |                 .run();
30 | ```
31 | 
32 | # 项目结构
33 | 
34 | ```Shell
35 | ├── logs                                          // 系统日志
36 | ├── spider-consumer                               // 消费模块(消费3XX 4XX 5XX状态码对应URL)
37 | ├── spider-core                                   // 爬虫模块
38 | │   ├── src                                       // 源码
39 | │   ├── |——main
40 | │   ├── ├──|——java/com/jinshuai                          
41 | │   ├── ├──├──|——core                             // 核心组件
42 | │   ├── ├──├──|————downloader                     // 下载器
43 | │   ├── ├──├──|————parser                         // 解析器
44 | │   ├── ├──├──|————saver                          // 持久器
45 | │   ├── ├──├──|————scheduler                      // URL调度器
46 | │   ├── ├──├──|——entity                           // 实体
47 | │   ├── ├──├──|——util                             // 工具
48 | │   ├── ├──|——resources                           // 资源目录
49 | │   ├── ├──|——|——application.properties           // 配置文件
50 | 
51 | ```
52 | 
53 | # 进度
54 | ## Finished
55 | - [x] 配置了[Http连接池](https://hc.apache.org/httpcomponents-client-ga/)，完成了Http请求和处理Http响应<br>
56 | - [x] [解析](https://jsoup.org/)响应的内容
57 | - [x] 配置线程池，通过[Redis](https://redis.io/)缓存URL种子
58 | - [x] 持久化解析结果
59 | - [x] 添加新的种子调度器（优先队列结合布隆过滤器）
60 | - [x] 对于Redis调度器，存放url对应的hash进行判重减少空间使用
61 | - [x] 使用SimHash进行文本相似度检测
62 | - [x] 将3XX 4XX 5XX状态码对应URL放到消息队列中去消费
63 | 
64 | ## TODO
65 | - [ ] 定时解析失败日志，将失败URL重新加入爬取仓库，设置失败次数限制，超过指定次数就放弃。
66 | - [ ] 分布式环境下，统一存放解析后的文本
67 | - [ ] 各个组件进行热替换
68 | - [ ] 优化解析页面代码
69 | 
70 | # 参考
71 | - **代码和设计思路**参考自[https://github.com/xjtushilei/ScriptSpider](https://github.com/xjtushilei/ScriptSpider)
72 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <groupId>com.jinshuai</groupId>
 8 |     <artifactId>spider</artifactId>
 9 |     <packaging>pom</packaging>
10 |     <version>1.0-SNAPSHOT</version>
11 | 
12 |     <modules>
13 |         <module>spider-core</module>
14 |         <module>spider-consumer</module>
15 |     </modules>
16 | 
17 |     <name>spider</name>
18 | 
19 |     <url>https://github.com/jinshuai86/Spider</url>
20 | 
21 |     <properties>
22 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
23 |     </properties>
24 | 
25 | </project>
26 | 


--------------------------------------------------------------------------------
/spider-consumer/README.md:
--------------------------------------------------------------------------------
1 | # 消费者
2 | 消费状态码为3XX 4XX 5XX对应的URL


--------------------------------------------------------------------------------
/spider-consumer/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | 
  3 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     
  6 |     <parent>
  7 |         <artifactId>spider</artifactId>
  8 |         <groupId>com.jinshuai</groupId>
  9 |         <version>1.0-SNAPSHOT</version>
 10 |     </parent>
 11 | 
 12 |     <modelVersion>4.0.0</modelVersion>
 13 | 
 14 |     <groupId>com.jinshuai.consumer</groupId>
 15 |     <artifactId>spider-consumer</artifactId>
 16 |     <version>1.0</version>
 17 |     <packaging>jar</packaging>
 18 | 
 19 |     <name>spider-consumer</name>
 20 | 
 21 |     <url>https://github.com/jinshuai86/Spider</url>
 22 | 
 23 |     <properties>
 24 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 25 |     </properties>
 26 | 
 27 | 
 28 |     <build>
 29 |         <plugins>
 30 |             <!--  添加编译插件支持jdk1.8 -->
 31 |             <plugin>
 32 |                 <groupId>org.apache.maven.plugins</groupId>
 33 |                 <artifactId>maven-compiler-plugin</artifactId>
 34 |                 <configuration>
 35 |                     <source>1.8</source>
 36 |                     <target>1.8</target>
 37 |                     <encoding>UTF-8</encoding>
 38 |                 </configuration>
 39 |             </plugin>
 40 |             <!-- 设置入口类 -->
 41 |             <plugin>
 42 |                 <groupId>org.apache.maven.plugins</groupId>
 43 |                 <artifactId>maven-shade-plugin</artifactId>
 44 |                 <version>1.2.1</version>
 45 |                 <executions>
 46 |                     <execution>
 47 |                         <phase>package</phase>
 48 |                         <goals>
 49 |                             <goal>shade</goal>
 50 |                         </goals>
 51 |                         <configuration>
 52 |                             <transformers>
 53 |                                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
 54 |                                     <mainClass>com.jinshuai.Consumer</mainClass>
 55 |                                 </transformer>
 56 |                             </transformers>
 57 |                         </configuration>
 58 |                     </execution>
 59 |                 </executions>
 60 |             </plugin>
 61 |         </plugins>
 62 |     </build>
 63 | 
 64 | 
 65 |     <dependencies>
 66 |         <dependency>
 67 |             <groupId>junit</groupId>
 68 |             <artifactId>junit</artifactId>
 69 |             <version>4.12</version>
 70 |             <scope>test</scope>
 71 |         </dependency>
 72 |         <!-- use its log function -->
 73 |         <dependency>
 74 |             <groupId>org.projectlombok</groupId>
 75 |             <artifactId>lombok</artifactId>
 76 |             <version>1.18.2</version>
 77 |             <scope>provided</scope>
 78 |         </dependency>
 79 |         <!--slf4j logback-->
 80 |         <dependency>
 81 |             <groupId>ch.qos.logback</groupId>
 82 |             <artifactId>logback-classic</artifactId>
 83 |             <version>1.2.3</version>
 84 |         </dependency>
 85 |         <!-- send http request-->
 86 |         <dependency>
 87 |             <groupId>org.apache.httpcomponents</groupId>
 88 |             <artifactId>httpclient</artifactId>
 89 |             <version>4.5.5</version>
 90 |         </dependency>
 91 |         <!-- parse DOM -->
 92 |         <dependency>
 93 |             <groupId>org.jsoup</groupId>
 94 |             <artifactId>jsoup</artifactId>
 95 |             <version>1.11.2</version>
 96 |         </dependency>
 97 |         <!-- store urlSeed -->
 98 |         <dependency>
 99 |             <groupId>redis.clients</groupId>
100 |             <artifactId>jedis</artifactId>
101 |             <version>2.9.0</version>
102 |         </dependency>
103 |         <!-- parse json -->
104 |         <dependency>
105 |             <groupId>com.google.code.gson</groupId>
106 |             <artifactId>gson</artifactId>
107 |             <version>2.8.0</version>
108 |         </dependency>
109 |         <!-- use its bloom filter -->
110 |         <dependency>
111 |             <groupId>com.google.guava</groupId>
112 |             <artifactId>guava</artifactId>
113 |             <version>26.0-jre</version>
114 |         </dependency>
115 |         <!-- super datetime -->
116 |         <dependency>
117 |             <groupId>joda-time</groupId>
118 |             <artifactId>joda-time</artifactId>
119 |             <version>2.9.9</version>
120 |         </dependency>
121 |         <!-- tokenizer -->
122 |         <dependency>
123 |             <groupId>org.ansj</groupId>
124 |             <artifactId>ansj_seg</artifactId>
125 |             <version>5.1.1</version>
126 |         </dependency>
127 |         <!-- RocketMQ -->
128 |         <dependency>
129 |             <groupId>org.apache.rocketmq</groupId>
130 |             <artifactId>rocketmq-client</artifactId>
131 |             <version>4.4.0</version>
132 |         </dependency>
133 |     </dependencies>
134 | 
135 | </project>
136 | 


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/Consumer.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai;
  2 | 
  3 | import com.jinshuai.core.downloader.Downloader;
  4 | import com.jinshuai.core.downloader.impl.HttpClientPoolDownloader;
  5 | import com.jinshuai.core.parser.Parser;
  6 | import com.jinshuai.core.parser.impl.NewsParser;
  7 | import com.jinshuai.core.saver.Saver;
  8 | import com.jinshuai.core.saver.impl.TextSaver;
  9 | import com.jinshuai.core.scheduler.Scheduler;
 10 | import com.jinshuai.core.scheduler.impl.RedisScheduler;
 11 | import com.jinshuai.entity.Page;
 12 | import com.jinshuai.entity.UrlSeed;
 13 | import com.jinshuai.util.PropertiesUtils;
 14 | import lombok.extern.slf4j.Slf4j;
 15 | import org.apache.rocketmq.client.consumer.DefaultMQPushConsumer;
 16 | import org.apache.rocketmq.client.consumer.listener.ConsumeConcurrentlyStatus;
 17 | import org.apache.rocketmq.client.consumer.listener.MessageListenerConcurrently;
 18 | import org.apache.rocketmq.client.exception.MQClientException;
 19 | import org.apache.rocketmq.common.message.MessageExt;
 20 | import org.apache.rocketmq.remoting.common.RemotingHelper;
 21 | 
 22 | import java.io.UnsupportedEncodingException;
 23 | import java.util.concurrent.*;
 24 | 
 25 | 
 26 | /**
 27 |  * Consumer side of the spider: URLs received from the RocketMQ topics subscribed in
 28 |  * {@code startConsumer()} are pushed to the scheduler, then downloaded, parsed and saved.
 29 |  *
 30 |  * @author: JS
 31 |  * @date: 2018/06/27
 32 |  */
 33 | @Slf4j
 34 | public class Consumer {
 35 | 
 36 |     /**
 37 |      * Spider components: scheduler, downloader, parser and saver.
 38 |      */
 39 |     private Scheduler scheduler;
 40 |     private Downloader downloader;
 41 |     private Parser parser;
 42 |     private Saver saver;
 43 | 
 44 |     /**
 45 |      * Thread pool configuration.
 46 |      */
 47 |     private ScheduledThreadPoolExecutor pool;
 48 |     private static final int CORE_POOL_SIZE = Runtime.getRuntime().availableProcessors() * 2;
 49 |     private static final int INITIAL_DELAY = 5;
 50 |     private static final int PERIOD = 20;
 51 | 
 52 |     private static final String CHARSET = RemotingHelper.DEFAULT_CHARSET;
 53 | 
 54 |     /** Sets the scheduler; logs an error and exits the JVM when it is null. */
 55 |     private Consumer setScheduler(Scheduler scheduler) {
 56 |         if (scheduler == null) {
 57 |             log.error("未设置调度器，启动失败");
 58 |             System.exit(-1);
 59 |         }
 60 |         this.scheduler = scheduler;
 61 |         return this;
 62 |     }
 63 | 
 64 |     /** Sets the downloader; logs an error and exits the JVM when it is null. */
 65 |     private Consumer setDownloader(Downloader downloader) {
 66 |         if (downloader == null) {
 67 |             log.error("未设置下载器，启动失败");
 68 |             System.exit(-1);
 69 |         }
 70 |         this.downloader = downloader;
 71 |         return this;
 72 |     }
 73 | 
 74 |     /** Sets the parser; logs an error and exits the JVM when it is null. */
 75 |     private Consumer setParser(Parser parser) {
 76 |         if (parser == null) {
 77 |             log.error("未设置解析器，启动失败");
 78 |             System.exit(-1);
 79 |         }
 80 |         this.parser = parser;
 81 |         return this;
 82 |     }
 83 | 
 84 |     /** Sets the saver; logs an error and exits the JVM when it is null. */
 85 |     private Consumer setSaver(Saver saver) {
 86 |         if (saver == null) {
 87 |             log.error("未设置保存器，启动失败");
 88 |             System.exit(-1);
 89 |         }
 90 |         this.saver = saver;
 91 |         return this;
 92 |     }
 93 | 
 94 |     /** Creates the scheduled thread pool with {@code CORE_POOL_SIZE} threads. */
 95 |     private Consumer setThreadPool() {
 96 |         pool = new ScheduledThreadPoolExecutor(CORE_POOL_SIZE);
 97 |         return this;
 98 |     }
 99 | 
100 |     /**
101 |      * Starts the MQ consumer, then loops: pops a seed from the scheduler and submits it to
102 |      * the pool, or sleeps one second when the scheduler has nothing to hand out.
103 |      * The loop ends only when the calling thread is interrupted.
104 |      */
105 |     private void run() {
106 |         log.info("消费者启动......");
107 |         startConsumer();
108 |         while (true) {
109 |             UrlSeed urlSeed = scheduler.pop();
110 |             try {
111 |                 if (urlSeed == null) {
112 |                     // Nothing queued yet; back off before polling again.
113 |                     TimeUnit.SECONDS.sleep(1);
114 |                 } else {
115 |                     log.info("准备解析URL:[{}]，优先级(默认5):[{}]", urlSeed.getUrl(), urlSeed.getPriority());
116 |                     // NOTE(review): scheduleAtFixedRate re-runs the same URL every PERIOD seconds
117 |                     // and the task is never cancelled, so the pool's queue only grows. Confirm
118 |                     // that periodic re-crawling is intended; otherwise use a one-shot schedule().
119 |                     pool.scheduleAtFixedRate(new ConsumerWork(urlSeed), INITIAL_DELAY, PERIOD, TimeUnit.SECONDS);
120 |                 }
121 |             } catch (InterruptedException e) {
122 |                 log.error("当前线程被中断", e);
123 |                 // Restore the interrupt flag and stop dispatching instead of swallowing the
124 |                 // interrupt and spinning on.
125 |                 Thread.currentThread().interrupt();
126 |                 return;
127 |             } catch (RejectedExecutionException e) {
128 |                 log.error("拒绝此次提交的任务[{}]", urlSeed, e);
129 |             } catch (Exception e) {
130 |                 log.error("线程池定时任务停止工作,重新启动线程池", e);
131 |                 pool.scheduleAtFixedRate(new ConsumerWork(urlSeed), INITIAL_DELAY, PERIOD, TimeUnit.SECONDS);
132 |             }
133 |         }
134 |     }
135 | 
136 |     /**
137 |      * One crawl pass for a single seed: download, parse, push newly found seeds, save.
138 |      */
139 |     private class ConsumerWork implements Runnable {
140 | 
141 |         private final UrlSeed urlSeed;
142 | 
143 |         ConsumerWork(UrlSeed urlSeed) {
144 |             this.urlSeed = urlSeed;
145 |         }
146 | 
147 |         @Override
148 |         public void run() {
149 |             try {
150 |                 log.info("已完成任务数量:[{}]，运行中线程数量：[{}]，最大线程运行数量: [{}]，工作队列任务数量：[{}]",
151 |                         pool.getCompletedTaskCount(), pool.getActiveCount(), pool.getMaximumPoolSize(), pool.getQueue().size());
152 |                 Page page = downloader.download(urlSeed);
153 |                 if (page == null) {
154 |                     // The downloader returns null when the download failed; skip this pass.
155 |                     log.warn("下载失败，本次跳过URL:[{}]", urlSeed.getUrl());
156 |                     return;
157 |                 }
158 |                 parser.parse(page);
159 |                 // Push the newly discovered seeds to the scheduler (absent when parsing failed early).
160 |                 if (page.getUrlSeeds() != null) {
161 |                     page.getUrlSeeds().forEach(seed -> scheduler.push(seed));
162 |                 }
163 |                 saver.save(page);
164 |             } catch (RuntimeException e) {
165 |                 // An exception escaping a periodic task is never reported and silently cancels
166 |                 // all of its later runs, so log it here and keep the task scheduled.
167 |                 log.error("处理URL[{}]失败", urlSeed.getUrl(), e);
168 |             }
169 |         }
170 |     }
171 | 
172 |     /**
173 |      * Starts the RocketMQ push consumer. Every received message body is decoded as a URL and
174 |      * pushed to the scheduler with priority 0. Exits the JVM when the consumer cannot start.
175 |      */
176 |     private void startConsumer() {
177 |         PropertiesUtils properties = PropertiesUtils.getInstance();
178 |         String ip = properties.get("mq-ip");
179 |         String port = properties.get("mq-port");
180 |         DefaultMQPushConsumer consumer = new DefaultMQPushConsumer("Consumer-Group");
181 |         try {
182 |             consumer.setNamesrvAddr(ip + ":" + port);
183 |             consumer.subscribe("Forbidden-Topic", "*");
184 |             consumer.subscribe("Redirect-Topic", "*");
185 |             consumer.subscribe("ServerWrong-Topic", "*");
186 |             consumer.registerMessageListener((MessageListenerConcurrently) (msgs, context) -> {
187 |                 for (MessageExt msg : msgs) {
188 |                     try {
189 |                         // URLs that arrive through these topics get priority 0.
190 |                         scheduler.push(new UrlSeed(new String(msg.getBody(), CHARSET), 0)); // TODO
191 |                     } catch (UnsupportedEncodingException e) {
192 |                         log.error("unsupported encoding[{}]", CHARSET, e);
193 |                     }
194 |                 }
195 |                 return ConsumeConcurrentlyStatus.CONSUME_SUCCESS; });
196 |             consumer.start();
197 |         } catch (MQClientException e) {
198 |             log.error("failed to start consumer", e);
199 |             System.exit(-1);
200 |         }
201 |         log.info("Consumer Started.");
202 |     }
203 | 
204 |     /** Wires a consumer with the default components and a fresh thread pool. */
205 |     private static Consumer build() {
206 |         return new Consumer()
207 |                 .setDownloader(new HttpClientPoolDownloader())
208 |                 .setParser(new NewsParser())
209 |                 .setSaver(new TextSaver())
210 |                 .setScheduler(new RedisScheduler())
211 |                 .setThreadPool();
212 |     }
213 | 
214 |     /**
215 |      * Entry point: builds the consumer and runs it.
216 |      * */
217 |     public static void main(String[] args) {
218 |         Consumer.build()
219 |                 .run();
220 |     }
221 | 
222 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/core/downloader/Downloader.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.downloader;
 2 | 
 3 | import com.jinshuai.entity.Page;
 4 | import com.jinshuai.entity.UrlSeed;
 5 | 
 6 | /**
 7 |  * Downloader interface; several downloader implementations can be built against it.
 8 |  * @see com.jinshuai.core.downloader.impl.HttpClientPoolDownloader
 9 |  * */
10 | public interface Downloader {
11 | 
12 |     /**
13 |      * @param urlSeed  the seed whose URL should be downloaded
14 |      * @return a Page wrapping the response body (HttpClientPoolDownloader returns null on failure)
15 |      */
16 |     Page download(UrlSeed urlSeed);
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/core/downloader/impl/HttpClientPoolDownloader.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.downloader.impl;
 2 | 
 3 | import com.jinshuai.core.downloader.Downloader;
 4 | import com.jinshuai.entity.Page;
 5 | import com.jinshuai.entity.UrlSeed;
 6 | import com.jinshuai.util.http.HttpUtils;
 7 | import lombok.extern.slf4j.Slf4j;
 8 | import org.jsoup.Jsoup;
 9 | import org.jsoup.nodes.Document;
10 | 
11 | /**
12 |  * @author: JS
13 |  * @date: 2018/3/26
14 |  * @description:
15 |  *  通过Http连接池下载
16 |  */
17 | @Slf4j
18 | public class HttpClientPoolDownloader implements Downloader {
19 | 
20 |     public Page download(UrlSeed urlSeed) {
21 |         Page page = null;
22 |         try {
23 |             String html = HttpUtils.getSingleInstance().getContent(urlSeed.getUrl());
24 |             Document document = Jsoup.parse(html, urlSeed.getUrl());
25 |             page = new Page(urlSeed, document);
26 |         } catch (Exception e) {
27 |             log.error("下载器下载的相应文本获取DOM树失败", e);
28 |         }
29 |         return page;
30 |     }
31 | 
32 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/core/parser/Parser.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.parser;
 2 | 
 3 | import com.jinshuai.entity.Page;
 4 | 
 5 | /**
 6 |  * @author JS
 7 |  * @date 2018/03/26
 8 |  * @description
 9 |  *  Parses a Page.
10 |  * */
11 | public interface Parser {
12 | 
13 |     /**
14 |      * @param page the Page to parse
15 |      * @return the parsed Page (Map, Set)
16 |      * @description parses the content of the Page's Document into a Map and its URLs into a Set
17 |      * */
18 |     Page parse(Page page);
19 | 
20 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/core/parser/impl/NewsParser.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.core.parser.impl;
  2 | 
  3 | import com.jinshuai.core.downloader.impl.HttpClientPoolDownloader;
  4 | import com.jinshuai.core.parser.Parser;
  5 | import com.jinshuai.entity.Page;
  6 | import com.jinshuai.entity.UrlSeed;
  7 | import lombok.extern.slf4j.Slf4j;
  8 | import org.joda.time.DateTime;
  9 | import org.joda.time.DateTimeUtils;
 10 | import org.jsoup.nodes.Document;
 11 | import org.jsoup.nodes.Element;
 12 | 
 13 | import java.util.*;
 14 | 
 15 | /**
 16 |  * @author: JS
 17 |  * @date: 2018/3/26
 18 |  * @description:
 19 |  *  针对hebut新闻类的网页，解析相应内容。
 20 |  */
 21 | @Slf4j
 22 | public class NewsParser implements Parser {
 23 | 
 24 |     private static volatile int firstTime = 0;
 25 | 
 26 |     // TODO: 待优化解析过程
 27 |     public Page parse(Page page) {
 28 |         // 获取DOM树
 29 |         Document document;
 30 |         try {
 31 |             document = page.getDocument();
 32 |             long priority = timestamp2Priority(document);
 33 |             // 种子,并进行预处理
 34 |             Set<UrlSeed> urlSeeds = new HashSet<>();
 35 |             Iterator seedIterator = document.getElementsByTag("a").iterator();
 36 |             while (seedIterator.hasNext()) {
 37 |                 Element element3 = (Element) seedIterator.next();
 38 |                 String href = element3.attr("href");
 39 |                 if (href.contains("http://www.hebut.edu.cn/")|| href.contains("/")  || href.contains("#") || href.contains("index.htm") || href.contains("javascript:void(0);")) continue;
 40 |                 if ("http://xww.hebut.edu.cn/".equals(page.getUrlSeed().getUrl())) continue;
 41 |                 urlSeeds.add(new UrlSeed("http://xww.hebut.edu.cn/gdyw/" + href, priority));
 42 |             }
 43 |             page.setUrlSeeds(urlSeeds);
 44 |             if ("http://xww.hebut.edu.cn/".equals(page.getUrlSeed().getUrl())) {
 45 |                 return page;
 46 |             }
 47 |             Map<String, String> items = new HashMap<String, String>(3);
 48 |             // 标题
 49 |             Element titleElement = document.selectFirst("div.sub_articleTitle");
 50 |             items.put("title", titleElement.getElementsByTag("h2").text());
 51 |             // 时间
 52 |             Element dateElement = document.selectFirst("div.sub_articleAuthor");
 53 |             items.put("date", dateElement.getElementsByTag("strong").eachText().get(0));
 54 |             // 正文
 55 |             Element textElement = document.selectFirst("div.sub_articleInfo");
 56 |             Iterator textIterator = textElement.getElementsByTag("span").iterator();
 57 |             StringBuilder stringBuilder = new StringBuilder();
 58 |             while (textIterator.hasNext()) {
 59 |                 Element element3 = (Element) textIterator.next();
 60 |                 stringBuilder.append(element3.text());
 61 |             }
 62 |             items.put("content", stringBuilder.toString());
 63 |             page.setItems(items);
 64 |         } catch (Exception e) {
 65 |             log.error("解析页面[{}]出错",page.getUrlSeed().getUrl(),e);
 66 |         } finally {
 67 |             return page;
 68 |         }
 69 |     }
 70 | 
 71 |     /**
 72 |      * 该Page中的url时间戳参考该Page的时间戳计算优先级
 73 |      * */
 74 |     private long timestamp2Priority(Document document) {
 75 |         String date;
 76 |         try {
 77 |             date = document.selectFirst("div.sub_articleAuthor").getElementsByTag("strong").eachText().get(0);
 78 |         } catch (Exception e) {
 79 |             log.error("解析页面异常",e);
 80 |             return 5;
 81 |         }
 82 |         DateTime dateTime = new DateTime(date);
 83 |         // 获取时间戳的差值
 84 |         long v = DateTimeUtils.currentTimeMillis() - dateTime.getMillis();
 85 |         // 换算成天数
 86 |         v /= 86400000;
 87 |         // 发布时间超过10天设置低的优先级：3，10天：5，小于10天：10
 88 |         return v > 10 ? 3 : v == 10 ? 5 : 10;
 89 |     }
 90 | 
 91 |     private Page getHyperLinkTag(Page page) {
 92 |         if (page == null) {
 93 |             throw new RuntimeException("page 为空");
 94 |         }
 95 |         // 获取DOM树
 96 |         Document document = page.getDocument();
 97 |         // 如果是首页
 98 |         if ("http://xww.hebut.edu.cn".equals(page.getUrlSeed().getUrl()) && firstTime == 0) {
 99 |             Set<UrlSeed> urlSeeds = new HashSet<UrlSeed>();
100 |             Iterator seedIterator = document.getElementsByTag("a").iterator();
101 |             while (seedIterator.hasNext()) {
102 |                 Element element3 = (Element) seedIterator.next();
103 |                 String href = element3.attr("href");
104 |                 if (href.contains("#") || href.contains("index.html") || href.contains("javascript:void(0);")) continue;
105 |                 if (href.startsWith("gdyw") || href.startsWith("zhyw")) {
106 |                     urlSeeds.add(new UrlSeed("http://xww.hebut.edu.cn/" + href,
107 |                             (int) (Math.random() * 10)));
108 |                 }
109 |             }
110 |             page.setUrlSeeds(urlSeeds);
111 |             // 已经访问过首页
112 |             firstTime = 1;
113 |         }
114 |         return page;
115 |     }
116 |     /**
117 |      * test
118 |      * */
119 |     public static void main(String[] args) {
120 |         UrlSeed urlSeed = new UrlSeed("http://xww.hebut.edu.cn/gdyw/70772.htm",5);
121 |         Page page = new HttpClientPoolDownloader().download(urlSeed);
122 | //        Page page = new Page(new UrlSeed("http://xww.hebut.edu.cn/gdyw/70772.htm",5), Jsoup.parse("<html></html>","http://xww.hebut.edu.cn/gdyw/index.htm"));
123 |         System.out.println(new NewsParser().parse(page));
124 |     }
125 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/core/saver/Saver.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.saver;
 2 | 
 3 | import com.jinshuai.entity.Page;
 4 | 
 5 | /**
 6 |  * Data persistence.
 7 |  * */
 8 | public interface Saver {
 9 | 
10 |     /**
11 |      * Persists the given page.
12 |      * */
13 |     void save(Page page);
14 | 
15 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/core/saver/impl/TextSaver.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.saver.impl;
 2 | 
 3 | import com.jinshuai.core.saver.Saver;
 4 | import com.jinshuai.entity.Page;
 5 | import com.jinshuai.util.PropertiesUtils;
 6 | import com.jinshuai.util.hash.PageUtils;
 7 | import lombok.extern.slf4j.Slf4j;
 8 | 
 9 | import java.io.File;
10 | import java.io.FileWriter;
11 | import java.io.IOException;
12 | import java.util.Date;
13 | 
14 | /**
15 |  * @author: JS
16 |  * @date: 2018/3/27
17 |  * @description:
18 |  *  存储到txt
19 |  */
20 | @Slf4j
21 | public class TextSaver implements Saver {
22 | 
23 |     private String parentDir;
24 | 
25 |     private PageUtils pageUtil = PageUtils.getInstance();
26 | 
27 |     private PropertiesUtils propertiesUtil = PropertiesUtils.getInstance();
28 | 
29 |     public TextSaver() {
30 |         init(); // resolve the output directory from the properties and create it if missing
31 |     }
32 | 
33 |     /**
34 |      * 初始化文件要存的目录
35 |      * */
36 |     private void init() {
37 |         parentDir = PropertiesUtils.getInstance().get("dir");
38 |         File file = new File(parentDir);
39 |         if (!file.exists()) {
40 |             file.mkdirs();
41 |         }
42 |         log.info("解析后的文件存放位置：[{}]",parentDir);
43 |     }
44 | 
45 |     public void save(Page page) {
46 |         if (page == null) {
47 |             return;
48 |         }
49 |         // 文本相似度检测
50 |         String similarCheck = propertiesUtil.get("similarCheck");
51 |         if (similarCheck != null && !similarCheck.trim().equals("") &&similarCheck.equalsIgnoreCase("true")) {
52 |             String title = page.getItems().get("title");
53 |             String content = page.getItems().get("content");
54 |             if(pageUtil.exist(title, content)) {
55 |                 log.info("标题为 [{}] 的相似文章已经存在", title);
56 |             }
57 |         }
58 |         File file = new File(String.format("%s%s.txt",parentDir,new Date().getTime()));
59 |         try (FileWriter fw = new FileWriter(file)) {
60 |             if (page.getItems() == null) {
61 |                 fw.flush();
62 |                 return;
63 |             }
64 |             fw.append(String.format("[标题] %s\n",page.getItems().get("title")));
65 |             fw.append(String.format("[日期] %s\n", page.getItems().get("date")));
66 |             fw.append(String.format("[正文] %s\n",page.getItems().get("content")));
67 |             fw.append(String.format("[链接] %s\n",page.getUrlSeed().getUrl()));
68 |             fw.flush();
69 |         } catch (IOException e) {
70 |             log.error("存储路径无效",e);
71 |         }
72 |     }
73 | 
74 |     public static void main(String[] args) throws IOException {
75 | //        String parentDir = "E:/HEBUTNews/";
76 | //        File file = new File(parentDir+ (new Date().getTime()) + ".txt");
77 | //        //file.createNewFile();
78 | //        if (!file.getParentFile().exists()) {
79 | //            //file.getParentFile().mkdirs();
80 | //
81 | //        }
82 | //        FileWriter fileWriter = new FileWriter(file);
83 | //        fileWriter.append("fasdfs");
84 | //        fileWriter.flush();
85 |         Saver saver = new TextSaver();
86 | 
87 | //        new TextSaver().save(new Page(new UrlSeed("",5), Jsoup.parse("HTML","")).setItems(null));
88 |     }
89 | 
90 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/core/scheduler/Scheduler.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.scheduler;
 2 | 
 3 | import com.jinshuai.entity.UrlSeed;
 4 | 
/**
 * @author JS
 * @date 2018/03/26
 * @description:
 *  Seed scheduler: stores URL seeds and hands them out.
 * */
public interface Scheduler {

    /**
     * Stores a seed.
     * */
    void push(UrlSeed urlSeed);
    /**
     * Provides a seed.
     * */
    UrlSeed pop();

}


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/core/scheduler/impl/RedisScheduler.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.core.scheduler.impl;
  2 | 
  3 | import com.google.gson.Gson;
  4 | import com.jinshuai.core.scheduler.Scheduler;
  5 | import com.jinshuai.entity.UrlSeed;
  6 | import com.jinshuai.util.JedisUtils;
  7 | import lombok.extern.slf4j.Slf4j;
  8 | import redis.clients.jedis.Jedis;
  9 | 
 10 | /**
 11 |  * @author: JS
 12 |  * @date: 2018/3/26
 13 |  * @description: 将种子存放到Redis
 14 |  */
 15 | @Slf4j
 16 | public class RedisScheduler implements Scheduler {
 17 | 
 18 |     /**
 19 |      * 存放UrlSeed.url hash && 进行种子判重
 20 |      */
 21 |     private final static String PREFIX_SET = "Spider.consumer.set";
 22 | 
 23 |     /**
 24 |      * 根据种子的优先级先简单创建不同的几个队列
 25 |      */
 26 |     private final static String PREFIX_QUEUE_HIGH = "Spider.queue.consumer.high";
 27 |     private final static String PREFIX_QUEUE_LOW = "Spider.queue.consumer.low";
 28 |     private final static String PREFIX_QUEUE_DEFAULT = "Spider.queue.consumer.default";
 29 |     private final static String PREFIX_QUEUE_CONSUMER = "Spider.queue.consumer.consumer";
 30 | 
 31 |     /**
 32 |      * @param urlSeed 种子
 33 |      * @desciption: 配置 jedisPool
 34 |      * 添加种子的URL到Set，种子序列话后的JSON文本到List
 35 |      * 添加种子之前需要判断种子是否已经存在。
 36 |      */
 37 |     public void push(UrlSeed urlSeed) {
 38 |         try (Jedis jedis = JedisUtils.getSingleInstance().getJedis()) {
 39 |             // 种子不存在
 40 |             if (!jedis.sismember(PREFIX_SET, urlSeed.getUrlHash())) {
 41 |                 // 添加种子Url对应的hash到判重Set
 42 |                 jedis.sadd(PREFIX_SET, urlSeed.getUrlHash());
 43 |                 // 添加种子序列化后的JSON文本到List
 44 |                 Gson gson = new Gson();
 45 |                 String urlSeedToJson = gson.toJson(urlSeed);
 46 |                 long urlSeedPriority = urlSeed.getPriority();
 47 |                 if (urlSeedPriority > 5) {
 48 |                     jedis.lpush(PREFIX_QUEUE_HIGH, urlSeedToJson);
 49 |                 } else if (urlSeedPriority == 5) {
 50 |                     jedis.lpush(PREFIX_QUEUE_DEFAULT, urlSeedToJson);
 51 |                 } else if (urlSeedPriority > 0) {
 52 |                     jedis.lpush(PREFIX_QUEUE_LOW, urlSeedToJson);
 53 |                 } else {
 54 |                     jedis.lpush(PREFIX_QUEUE_CONSUMER, urlSeedToJson);
 55 |                 }
 56 |             }
 57 |         } catch (Exception e) {
 58 |             log.error("JedisPushUrl[{}]出错", urlSeed.toString(), e);
 59 |         }
 60 |     }
 61 | 
 62 |     /**
 63 |      * @description: 消费者只从对应的消费队列中取种子
 64 |      */
 65 |     public UrlSeed pop() {
 66 |         Jedis jedis = JedisUtils.getSingleInstance().getJedis();
 67 |         Gson gson = new Gson();
 68 |         String urlSeedToJson = null;
 69 |         UrlSeed urlSeed = null;
 70 |         try {
 71 |             if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_CONSUMER)) != null) {
 72 |                 urlSeed = gson.fromJson(urlSeedToJson, UrlSeed.class);
 73 |             } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_HIGH)) != null) {
 74 |                 urlSeed = gson.fromJson(urlSeedToJson, UrlSeed.class);
 75 |             } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_DEFAULT)) != null) {
 76 |                 urlSeed = gson.fromJson(urlSeedToJson, UrlSeed.class);
 77 |             } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_LOW)) != null) {
 78 |                 urlSeed = gson.fromJson(urlSeedToJson, UrlSeed.class);
 79 |             }
 80 |             return urlSeed;
 81 |         } catch (Exception e) {
 82 |             log.error("JedisPopUrl [{}]出错", urlSeedToJson, e);
 83 |         } finally {
 84 |             if (jedis != null && jedis.isConnected())
 85 |                 jedis.disconnect();
 86 |         }
 87 |         return gson.fromJson(urlSeedToJson, UrlSeed.class);
 88 |     }
 89 | 
 90 |     /**
 91 |      * test connection
 92 |      */
 93 |     public static void main(String[] args) {
 94 |         Jedis jedis = JedisUtils.getSingleInstance().getJedis();
 95 |         System.out.println(jedis.ping());
 96 |         UrlSeed urlSeed = new RedisScheduler().pop();
 97 |         System.out.println(urlSeed);
 98 |         jedis.lpush(PREFIX_QUEUE_LOW, "dasdasdasdsa");
 99 |     }
100 | 
101 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/entity/Page.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.entity;
 2 | 
 3 | import org.jsoup.nodes.Document;
 4 | 
 5 | import java.util.Map;
 6 | import java.util.Set;
 7 | 
 8 | /**
 9 |  * @author: JS
10 |  * @date: 2018/3/26
11 |  * @description:
12 |  *  每一个UrlSeed对应的页面抽象为一个Page
13 |  */
14 | public class Page {
15 | 
16 |     /**
17 |      * Page对应的UrlSeed
18 |      * */
19 |     private UrlSeed urlSeed;
20 | 
21 |     /**
22 |      * Page对应的jsoup文档
23 |      * */
24 |     private Document document;
25 | 
26 |     /**
27 |      * Page包含的url
28 |      * */
29 |     private Set<UrlSeed> urlSeeds;
30 | 
31 |     /**
32 |      * Page所包含的有用信息
33 |      * */
34 |     private Map<String,String> items;
35 | 
36 |     public Page(UrlSeed urlSeed, Document document) {
37 |         this.urlSeed = urlSeed;
38 |         this.document = document;
39 |     }
40 | 
41 |     public UrlSeed getUrlSeed() {
42 |         return urlSeed;
43 |     }
44 | 
45 |     public Page setUrlSeed(UrlSeed urlSeed) {
46 |         this.urlSeed = urlSeed;
47 |         return this;
48 |     }
49 | 
50 |     public Document getDocument() {
51 |         return document;
52 |     }
53 | 
54 |     public Page setDocument(Document document) {
55 |         this.document = document;
56 |         return this;
57 |     }
58 | 
59 |     public Set<UrlSeed> getUrlSeeds() {
60 |         return urlSeeds;
61 |     }
62 | 
63 |     public Page setUrlSeeds(Set<UrlSeed> urlSeeds) {
64 |         this.urlSeeds = urlSeeds;
65 |         return this;
66 |     }
67 | 
68 |     public Map<String, String> getItems() {
69 |         return items;
70 |     }
71 | 
72 |     public Page setItems(Map<String, String> items) {
73 |         this.items = items;
74 |         return this;
75 |     }
76 | 
77 |     @Override
78 |     public String toString() {
79 |         return "Page{" +
80 |                 "urlSeed=" + urlSeed +
81 |                 ", urlSeeds=" + urlSeeds +
82 |                 ", items=" + items +
83 |                 '}';
84 |     }
85 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/entity/UrlSeed.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.entity;
 2 | 
 3 | import com.jinshuai.util.hash.MurmurHash;
 4 | import lombok.EqualsAndHashCode;
 5 | import lombok.ToString;
 6 | 
 7 | /**
 8 |  * @author: JS
 9 |  * @date: 2018/3/26
10 |  * @description:
11 |  *  每个Url需要设置优先级，不需要对低于某个优先级的Url进行解析。
12 |  */
13 | @ToString
14 | @EqualsAndHashCode
15 | public class UrlSeed {
16 | 
17 |     /**
18 |      * 种子对应的Url
19 |      * */
20 |     private String url;
21 | 
22 |     /**
23 |      * url hash
24 |      * */
25 |     private String urlHash;
26 | 
27 |     /**
28 |      * 种子优先级
29 |      * 硬编码为5,通过时间戳设置优先级
30 |      * */
31 |     private long priority = 5;
32 | 
33 |     public UrlSeed(String url, long priority) {
34 |         this.url = url;
35 |         this.priority = priority;
36 |         this.urlHash = String.valueOf(MurmurHash.hash64(url));
37 |     }
38 | 
39 |     public UrlSeed(String url) {
40 |         this.url = url;
41 |         this.urlHash = String.valueOf(MurmurHash.hash64(url));
42 |     }
43 | 
44 |     public String getUrl() {
45 |         return url;
46 |     }
47 | 
48 |     public UrlSeed setUrl(String url) {
49 |         this.url = url;
50 |         return this;
51 |     }
52 | 
53 |     public String getUrlHash() {
54 |         return urlHash;
55 |     }
56 | 
57 |     public long getPriority() {
58 |         return priority;
59 |     }
60 | 
61 |     public UrlSeed setPriority(long priority) {
62 |         this.priority = priority;
63 |         return this;
64 |     }
65 | 
66 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/util/JedisUtils.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.util;
  2 | 
  3 | import lombok.extern.slf4j.Slf4j;
  4 | import redis.clients.jedis.Jedis;
  5 | import redis.clients.jedis.JedisPool;
  6 | import redis.clients.jedis.JedisPoolConfig;
  7 | 
  8 | import java.util.Map;
  9 | import java.util.concurrent.ConcurrentHashMap;
 10 | 
 11 | /**
 12 |  * @author: JS
 13 |  * @date: 2018/3/27
 14 |  * @description:
 15 |  *  对Jedis简单的封装
 16 |  */
 17 | @Slf4j
 18 | public class JedisUtils {
 19 | 
 20 |     /**
 21 |      * JedisUtils实例
 22 |      * */
 23 |     private static volatile JedisUtils jedisUtils;
 24 | 
 25 |     /**
 26 |      * 获取JedisUtils单例
 27 |      * */
 28 |     public static JedisUtils getSingleInstance() {
 29 |         if (jedisUtils == null) {
 30 |             synchronized (JedisUtils.class) {
 31 |                 jedisUtils = new JedisUtils();
 32 |             }
 33 |         }
 34 |         return jedisUtils;
 35 |     }
 36 | 
 37 |     private JedisPool jedisPool;
 38 | 
 39 |     JedisUtils() {
 40 |         init();
 41 |     }
 42 | 
 43 |     private void init() {
 44 |         configJedisPool();
 45 |     }
 46 | 
 47 |     /**
 48 |      * 获取套接字、密码
 49 |      * */
 50 |     private static final String IP = PropertiesUtils.getInstance().get("redis-ip");
 51 |     private static final int PORT = Integer.valueOf(PropertiesUtils.getInstance().get("redis-port"));
 52 |     private static final String PASSWORD = PropertiesUtils.getInstance().get("redis-password");
 53 | 
 54 |     /**
 55 |      * 可用连接实例的最大数目，默认值为8；
 56 |      * 如果赋值为-1，则表示不限制；如果pool已经分配了maxActive个jedis实例，则此时pool的状态为exhausted(耗尽)。
 57 |      */
 58 |     private static int MAX_ACTIVE = 2048;
 59 | 
 60 |     /**
 61 |      * 控制一个pool最多有多少个状态为idle(空闲的)的jedis实例，默认值也是8。
 62 |      */
 63 |     private static int MAX_IDLE = 200;
 64 | 
 65 |     /**
 66 |      * 等待可用连接的最大时间，单位毫秒，默认值为-1，表示永不超时。如果超过等待时间，则直接抛出JedisConnectionException；
 67 |      * */
 68 |     private static int MAX_WAIT = 10000;
 69 | 
 70 |     /**
 71 |      * 超时时间
 72 |      * */
 73 |     private static int TIMEOUT = 10000;
 74 | 
 75 |     /**
 76 |      * 保存若干个jedisPool
 77 |      * key 为IP+port
 78 |      * */
 79 |     private static Map<String,JedisPool> maps = new ConcurrentHashMap<>();
 80 | 
 81 |     private void configJedisPool() {
 82 |         if (maps.get(IP) == null) {
 83 |             JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
 84 |             jedisPoolConfig.setMaxTotal(MAX_ACTIVE);
 85 |             jedisPoolConfig.setMaxIdle(MAX_IDLE);
 86 |             jedisPoolConfig.setMaxWaitMillis(MAX_WAIT);
 87 |             jedisPoolConfig.setTestOnReturn(true);
 88 |             // 未设置密码
 89 |             if (PASSWORD == null || PASSWORD.length() == 0) {
 90 |                 log.info("配置文件中未设置Redis密码，请确保Redis服务器不需要密码验证!!!");
 91 |                 jedisPool = new JedisPool(jedisPoolConfig, IP, PORT, TIMEOUT);
 92 |             } else {
 93 |                 jedisPool = new JedisPool(jedisPoolConfig, IP, PORT, TIMEOUT, PASSWORD);
 94 |             }
 95 |             maps.put(IP,jedisPool);
 96 |         } else {
 97 |             jedisPool = maps.get(IP);
 98 |         }
 99 |     }
100 | 
101 |     /**
102 |      * 从jedisPool中获取jedis
103 |      * */
104 |     public Jedis getJedis() {
105 |         Jedis jedis = null;
106 |         try {
107 |             jedis = jedisPool.getResource();
108 |         } catch (Exception e) {
109 |             log.error("连接Redis失败,检查IP、端口、密码", e);
110 |         }
111 |         return jedis;
112 |     }
113 | 
114 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/util/PropertiesUtils.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util;
 2 | 
 3 | 
 4 | import lombok.extern.slf4j.Slf4j;
 5 | 
 6 | import java.io.IOException;
 7 | import java.io.InputStream;
 8 | import java.util.Map;
 9 | import java.util.Properties;
10 | import java.util.concurrent.ConcurrentHashMap;
11 | 
12 | /**
13 |  * @author: JS
14 |  * @date: 2018/5/4
15 |  * @description:
16 |  *  读取配置文件工具类
17 |  */
18 | @Slf4j
19 | public class PropertiesUtils {
20 | 
21 |     private Map<String, String> cache = new ConcurrentHashMap<>();
22 | 
23 |     private static volatile PropertiesUtils propertiesUtils;
24 | 
25 |     public static PropertiesUtils getInstance() {
26 |         if (propertiesUtils == null) {
27 |             synchronized (PropertiesUtils.class) {
28 |                 if (propertiesUtils == null) {
29 |                     propertiesUtils = new PropertiesUtils();
30 |                 }
31 |             }
32 |         }
33 |         return propertiesUtils;
34 |     }
35 | 
36 |     public String get(String key) {
37 |         if (key == null)
38 |             return null;
39 |         if (cache.get(key) != null) {
40 |             return cache.get(key);
41 |         }
42 |         Properties properties = new Properties();
43 |         InputStream inputStream = PropertiesUtils.class.getResourceAsStream("/application.properties");
44 |         try {
45 |             properties.load(inputStream);
46 |         } catch (IOException e) {
47 |             log.error("加载配置文件[application.properties]失败",e);
48 |         }
49 |         String value = properties.getProperty(key);
50 |         cache.put(key,value);
51 |         return value;
52 |     }
53 | 
54 |     public static void main(String[] args) throws IOException {
55 | //        Properties properties = new Properties();
56 | //        InputStream inputStream = PropertiesUtils.class.getResourceAsStream("/application.properties");
57 | //        properties.load(inputStream);
58 | //        System.out.println(properties.getProperty("ip"));
59 | //        System.out.println(properties.getProperty("ip"));
60 |         System.out.println(PropertiesUtils.getInstance().get("ip"));
61 |     }
62 | 
63 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/util/hash/MurmurHash.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util.hash;
 2 | 
 3 | import java.math.BigInteger;
 4 | import java.nio.ByteBuffer;
 5 | import java.nio.ByteOrder;
 6 | 
 7 | /**
 8 |  * @author: JS
 9 |  * @date: 2019/5/25
10 |  * @description: 生成64位Hash 代码：https://www.cnkirito.moe/consistent-hash-lb/
11 |  */
12 | public class MurmurHash {
13 | 
14 |     public static BigInteger hash64(String word) {
15 |         ByteBuffer buf = ByteBuffer.wrap(word.getBytes());
16 |         int seed = 0x1234ABCD;
17 | 
18 |         ByteOrder byteOrder = buf.order();
19 |         buf.order(ByteOrder.LITTLE_ENDIAN);
20 | 
21 |         long m = 0xc6a4a7935bd1e995L;
22 |         int r = 47;
23 | 
24 |         long h = seed ^ (buf.remaining() * m);
25 | 
26 |         long k;
27 |         while (buf.remaining() >= 8) {
28 |             k = buf.getLong();
29 | 
30 |             k *= m;
31 |             k ^= k >>> r;
32 |             k *= m;
33 | 
34 |             h ^= k;
35 |             h *= m;
36 |         }
37 | 
38 |         if (buf.remaining() > 0) {
39 |             ByteBuffer finish = ByteBuffer.allocate(8).order(
40 |                     ByteOrder.LITTLE_ENDIAN);
41 |             // for big-endian version, do this first:
42 |             // finish.position(8-buf.remaining());
43 |             finish.put(buf).rewind();
44 |             h ^= finish.getLong();
45 |             h *= m;
46 |         }
47 |         h ^= h >>> r;
48 |         h *= m;
49 |         h ^= h >>> r;
50 | 
51 |         buf.order(byteOrder);
52 |         return new BigInteger(String.valueOf(h & 0xffffffffL));
53 |     }
54 | 
55 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/util/hash/PageUtils.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.util.hash;
  2 | 
  3 | import lombok.extern.slf4j.Slf4j;
  4 | import org.ansj.app.keyword.KeyWordComputer;
  5 | import org.ansj.app.keyword.Keyword;
  6 | 
  7 | import java.io.File;
  8 | import java.io.IOException;
  9 | import java.math.BigInteger;
 10 | import java.nio.file.Files;
 11 | import java.util.Collection;
 12 | import java.util.HashSet;
 13 | import java.util.Map;
 14 | import java.util.Set;
 15 | import java.util.concurrent.ConcurrentHashMap;
 16 | 
 17 | /**
 18 |  * @author: JS
 19 |  * @date: 2019/5/25
 20 |  * @description: SimHash进行doc相似度检测
 21 |  */
 22 | @Slf4j
 23 | public class PageUtils {
 24 | 
 25 |     private volatile static PageUtils instance;
 26 | 
 27 |     private Map<String, Set<BigInteger>> invertedIndex = new ConcurrentHashMap<>();
 28 | 
 29 |     /**
 30 |      * 测试：跟踪文本
 31 |      * */
 32 |     private Map<BigInteger, String> fingerContent = new ConcurrentHashMap<>();
 33 | 
 34 |     public static PageUtils getInstance() {
 35 |         if (instance == null) {
 36 |             synchronized (PageUtils.class) {
 37 |                 if (instance == null) {
 38 |                     instance = new PageUtils();
 39 |                 }
 40 |             }
 41 |         }
 42 |         return instance;
 43 |     }
 44 | 
 45 |     private static final int BIT_SIZE = 64;
 46 | 
 47 |     private static final int TABLE_SIZE = 16;
 48 | 
 49 |     private static final int HAMMING_DISTANCE = 3;
 50 | 
 51 |     private ThreadLocal<String> simhashStrContainer = new ThreadLocal<>();
 52 | 
 53 |     public boolean exist(String title, String content) {
 54 |         boolean exist = false;
 55 |         BigInteger fingerprint = getSimHash(title, content);
 56 | //        fingerContent.put(fingerprint, title + "====" + content);
 57 |         String hashStr = simhashStrContainer.get();
 58 |         // 防止分词错误NPE
 59 |         if (hashStr.length() == BIT_SIZE) {
 60 |             // 获取每一个table对应的所有候选结果
 61 |             for (int start = 0; start < BIT_SIZE; start += TABLE_SIZE) {
 62 |                 String table = hashStr.substring(start, start + TABLE_SIZE);
 63 |                 Set<BigInteger> fingerprints = invertedIndex.get(table);
 64 |                 if (fingerprints != null && fingerprints.size() > 0) {
 65 |                     for (BigInteger fingerprintRes : fingerprints) {
 66 |                         // 海明距离
 67 |                         int hammingDistance = fingerprintRes.xor(fingerprint).bitCount();
 68 |                         if (hammingDistance <= HAMMING_DISTANCE) {
 69 | //                            log.error("标题 [{}] \r\n 内容[{}] \r\n 与 标题内容[{}] 相似\r\n 汉明距离:[{}]", title, content, fingerContent.get(fingerprintRes), hammingDistance);
 70 |                             exist = true;
 71 |                             break;
 72 |                         }
 73 |                     }
 74 |                 }
 75 |                 if (exist)
 76 |                     break;
 77 |             }
 78 |             // 构建倒排索引(16 * 4)
 79 |             constructInvertedIndex(fingerprint);
 80 |         }
 81 |         return exist;
 82 |     }
 83 | 
 84 |     private BigInteger getSimHash(String title, String content) {
 85 |         double[] featureVector = new double[BIT_SIZE];
 86 |         // 1. 分词,计算权重
 87 |         Collection<Keyword> result = getParticiple(title, content);
 88 |         // 2. hash
 89 |         // 3. 加权
 90 |         // 4. 合并
 91 |         featureVector = weightingAndCombine(featureVector, result);
 92 |         // 5. 降维
 93 |         // 6. SimHash 指纹
 94 |         return decreaseDimensionAndGetFingerprint(featureVector);
 95 |     }
 96 | 
 97 |     /**
 98 |      * 分词
 99 |      * */
100 |     private Collection<Keyword> getParticiple(String title, String content) {
101 |         int keyNumber = content.length() / 2 < 5 ? 5 : content.length(); // TODO
102 |         KeyWordComputer kwc = new KeyWordComputer(keyNumber);
103 |         return kwc.computeArticleTfidf(title, content);
104 |     }
105 | 
106 |     /**
107 |      * 哈希 加权 合并
108 |      * */
109 |     private double[] weightingAndCombine(double[] featureVector, Collection<Keyword> result) {
110 |         for (Keyword keyword : result) {
111 |             String keyStr = keyword.getName();
112 |             BigInteger keyHash = MurmurHash.hash64(keyStr);
113 |             for (int i = 0; i < BIT_SIZE; i++) {
114 |                 final BigInteger bitMask = BigInteger.ONE.shiftLeft(BIT_SIZE - 1 - i);
115 |                 // 3. 加权
116 |                 // 4. 合并
117 |                 if (keyHash.and(bitMask).signum() != 0) {
118 |                     featureVector[i] += keyword.getScore();
119 |                 } else {
120 |                     featureVector[i] -= keyword.getScore();
121 |                 }
122 |             }
123 |         }
124 |         return featureVector;
125 |     }
126 | 
127 |     /**
128 |      * 降维 获取指纹
129 |      * */
130 |     private BigInteger decreaseDimensionAndGetFingerprint(double[] featureVector) {
131 |         BigInteger fingerprint = BigInteger.ZERO;
132 |         StringBuilder simHashBuilder = new StringBuilder();
133 |         for (int i = 0; i < BIT_SIZE; i++) {
134 |             BigInteger bitMask = BigInteger.ONE.shiftLeft(BIT_SIZE - 1 - i);
135 |             if (featureVector[i] > 0) {
136 |                 fingerprint = fingerprint.or(fingerprint.xor(bitMask));
137 |                 simHashBuilder.append(1);
138 |             } else {
139 |                 simHashBuilder.append(0);
140 |             }
141 |         }
142 |         simhashStrContainer.set(simHashBuilder.toString());
143 |         return fingerprint;
144 |     }
145 | 
146 |     /**
147 |      * 构建倒排索引
148 |      * <table, {simhash1, simhash2 simhash3...}>
149 |      *
150 |      * */
151 |     private void constructInvertedIndex(BigInteger fingerprint) {
152 |         String hashStr = simhashStrContainer.get();
153 |         for (int start = 0; start < BIT_SIZE; start += TABLE_SIZE) {
154 |             String table = hashStr.substring(start, start + TABLE_SIZE);
155 |             Set<BigInteger> docs = invertedIndex.get(table);
156 |             if (docs == null) {
157 |                 docs = new HashSet<>();
158 |             }
159 |             docs.add(fingerprint);
160 |             invertedIndex.put(table, docs);
161 |         }
162 |     }
163 | 
164 |     public static void main(String[] args) throws IOException {
165 |         PageUtils pageUtil = PageUtils.getInstance();
166 |         StringBuilder sb1 = new StringBuilder();
167 |         File file1 = new File("D:/Data/1.txt");
168 |         Files.readAllLines(file1.toPath()).forEach(line ->{
169 |             sb1.append(line);
170 |         });
171 | 
172 |         StringBuilder sb2 = new StringBuilder();
173 |         File file2 = new File("D:/Data/2.txt");
174 |         Files.readAllLines(file2.toPath()).forEach(line -> {
175 |             sb2.append(line);
176 |         });
177 | 
178 |         pageUtil.exist("学校党委理论学习中心组召开扩大会议",sb1.toString());
179 |         pageUtil.exist("校党委理论学习中心组召开专题会议学习传达全国“两会”精神", sb2.toString());
180 | 
181 | 
182 | //        BigInteger simhash1 = pageUtil.getSimHash("","我来自河北省，你们可以叫我金帅");
183 | //        BigInteger simhash2 = pageUtil.getSimHash("","我来自河北省，我是金帅");
184 | //        System.out.println(simhash1.xor(simhash2).bitCount());
185 | //        System.out.println(simhash1.xor(simhash2).toString(2));
186 | //        System.out.println("=========十进制========");
187 | //        System.out.println(simhash1.toString());
188 | //        System.out.println(simhash2.toString());
189 | //        System.out.println("=========二进制========");
190 | //        System.out.println(simhash1.toString(2));
191 | //        System.out.println(simhash2.toString(2));
192 | 
193 |     }
194 | 
195 | }
196 | 


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/util/http/HttpUtils.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.util.http;
  2 | 
  3 | import lombok.extern.slf4j.Slf4j;
  4 | import org.apache.http.Header;
  5 | import org.apache.http.HttpEntity;
  6 | import org.apache.http.HttpResponse;
  7 | import org.apache.http.client.config.RequestConfig;
  8 | import org.apache.http.client.methods.HttpGet;
  9 | import org.apache.http.config.Registry;
 10 | import org.apache.http.config.RegistryBuilder;
 11 | import org.apache.http.config.SocketConfig;
 12 | import org.apache.http.conn.socket.ConnectionSocketFactory;
 13 | import org.apache.http.conn.socket.PlainConnectionSocketFactory;
 14 | import org.apache.http.conn.ssl.NoopHostnameVerifier;
 15 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
 16 | import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
 17 | import org.apache.http.entity.ContentType;
 18 | import org.apache.http.impl.client.CloseableHttpClient;
 19 | import org.apache.http.impl.client.HttpClients;
 20 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
 21 | import org.apache.http.ssl.SSLContexts;
 22 | import org.apache.http.util.ByteArrayBuffer;
 23 | 
 24 | import javax.net.ssl.HostnameVerifier;
 25 | import javax.net.ssl.SSLContext;
 26 | import java.io.IOException;
 27 | import java.io.InputStream;
 28 | import java.net.MalformedURLException;
 29 | import java.net.URI;
 30 | import java.net.URISyntaxException;
 31 | import java.net.URL;
 32 | import java.nio.charset.Charset;
 33 | import java.util.Random;
 34 | import java.util.concurrent.TimeUnit;
 35 | import java.util.regex.Matcher;
 36 | import java.util.regex.Pattern;
 37 | 
 38 | /**
 39 |  * @author: JS
 40 |  * @date: 2018/3/22
 41 |  * @description:
 42 |  *  创建单例HttpUtils，获取HttpClient实例执行HTTP请求根据状态码解析响应体。
 43 |  */
 44 | @Slf4j
 45 | public class HttpUtils {
 46 | 
 47 |     private static final ThreadLocal<HttpGet> httpGetContainer = new ThreadLocal<>();
 48 | 
 49 |     private static final ThreadLocal<HttpEntity> httpEntityContainer = new ThreadLocal<>();
 50 | 
 51 |     private static volatile HttpUtils HTTPUTILS;
 52 | 
 53 |     private PoolingHttpClientConnectionManager httpClientConnectionManager;
 54 | 
 55 |     private CloseableHttpClient httpClient;
 56 | 
 57 |     private static final int MAX_TOTAL_CONNECTIONS = 20;
 58 |     private static final int SOCKET_TIMEOUT = 5000;
 59 |     private static final int MAX_CONNECTIONS_PER_ROUTE = 200;
 60 |     private static final int CONNECTION_REQUEST_TIMEOUT = 5000;
 61 |     private static final int CONNECT_TIMEOUT = 5000;
 62 | 
 63 |     /**
 64 |      * 获取HttpUtils单例
 65 |      * */
 66 |     public static HttpUtils getSingleInstance() {
 67 |         if (HTTPUTILS == null) {
 68 |             synchronized (HttpUtils.class) {
 69 |                 if (HTTPUTILS == null) {
 70 |                     HTTPUTILS = new HttpUtils();
 71 |                 }
 72 |             }
 73 |         }
 74 |         return HTTPUTILS;
 75 |     }
 76 | 
 77 |     HttpUtils() {
 78 |         init();
 79 |     }
 80 | 
 81 |     private void init() {
 82 |         configHttpPool();
 83 |         configHttpClient();
 84 |     }
 85 | 
 86 |     /**
 87 |      * Configures the HTTP connection pool: registers the plain and SSL socket factories,
 88 |      * then applies the pool limits and the socket timeout to the connection manager.
 89 |      * */
 90 |     private void configHttpPool() {
 91 |         try {
 92 |             // Configure SSL (self-signed certificates are trusted)
 93 |             SSLContext sslcontext = SSLContexts.custom()
 94 |                     .loadTrustMaterial(null, new TrustSelfSignedStrategy())
 95 |                     .build();
 96 | 
 97 | //            HostnameVerifier hostnameVerifier = SSLConnectionSocketFactory.getDefaultHostnameVerifier();
 98 |             // Host name verification of certificates is switched off
 99 |             HostnameVerifier hostnameVerifier = NoopHostnameVerifier.INSTANCE;
100 | 
101 |             SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(
102 |                     sslcontext, hostnameVerifier);
103 | 
104 |             Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
105 |                     .register("http", PlainConnectionSocketFactory.getSocketFactory())
106 |                     .register("https", sslsf)
107 |                     .build();
108 | 
109 |             // Plug the SSL-aware registry into the connection manager
110 |             httpClientConnectionManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
111 |             // Maximum number of connections in the whole pool
112 |             httpClientConnectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
113 |             // Maximum connections per route. NOTE(review): this constant (200) is larger than the pool total (20) - confirm intent
114 |             httpClientConnectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);
115 |             // Socket timeout
116 |             SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(SOCKET_TIMEOUT).build();
117 |             httpClientConnectionManager.setDefaultSocketConfig(socketConfig);
118 |         } catch (Exception e) { // a failure before the assignment above leaves httpClientConnectionManager null
119 |             log.error("SSL配置出错",e);
120 |         }
121 |     }
122 | 
123 |     /**
124 |      * Builds the shared HttpClient on top of the pooled connection manager.
125 |      * Logs an error and leaves httpClient unassigned when the connection manager is null.
126 |      * */
127 |     private void configHttpClient() {
128 |         // Nothing to attach the client to when the pool was never created
129 |         if (httpClientConnectionManager == null) {
130 |             log.error("httpClientConnectionManager未被初始化");
131 |             return;
132 |         }
133 |         // Default request settings: connection-request timeout and connect timeout
134 |         RequestConfig.Builder timeouts = RequestConfig.custom();
135 |         timeouts.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT);
136 |         timeouts.setConnectTimeout(CONNECT_TIMEOUT);
137 |         // Assemble the client from the pool and the request defaults
138 |         httpClient = HttpClients.custom()
139 |                 .setConnectionManager(httpClientConnectionManager)
140 |                 .setDefaultRequestConfig(timeouts.build())
141 |                 .build();
142 |     }
143 | 
144 |     /**
145 |      * Builds the GET request for {@code urlStr}: re-assembles the URL as a URI, then adds the
146 |      * Accept, Accept-Encoding and Connection headers plus a randomly chosen User-Agent.
147 |      * */
148 |     private HttpGet getHttpGet(String urlStr) {
149 |         URI uri = null;
150 |         try {
151 |             URL url = new URL(urlStr);
152 |             // Pass the full authority (user-info, host, port); passing only the host dropped an explicit port
153 |             uri = new URI(url.getProtocol(), url.getAuthority(), url.getPath(), url.getQuery(), null);
154 |         } catch (MalformedURLException | URISyntaxException e) {
155 |             log.error("字符串格式不正确[{}]",urlStr,e);
156 |         }
157 |         HttpGet httpGet = new HttpGet(uri); // NOTE(review): uri is still null here when urlStr could not be parsed
158 |         // Request headers
159 |         httpGet.addHeader("Accept", "*/*");
160 |         httpGet.addHeader("Accept-Encoding", "gzip, deflate");
161 |         httpGet.addHeader("Connection", "keep-alive");
162 |         int randomUserAgent = new Random().nextInt(UserAgentArray.USER_AGENT.length);
163 |         httpGet.addHeader("User-Agent",UserAgentArray.USER_AGENT[randomUserAgent]);
164 | 
165 |         return httpGet;
166 |     }
167 | 
168 |     /**
169 |      * Executes the GET request held in httpGetContainer and passes the response to the
170 |      * strategy that matches its status class (2xx, 3xx, 4xx or 5xx).
171 |      * */
172 |     private void sendRequest(String urlStr) {
173 |         HttpGet httpGet = httpGetContainer.get();
174 |         try {
175 |             HttpResponse response = httpClient.execute(httpGet);
176 |             // Choose the action from the status code
177 |             int statusCode = response.getStatusLine().getStatusCode();
178 |             switch (statusCode / 100) {
179 |                 case 2:
180 |                     executeStrategy(SuccessStrategy.getInstance(), urlStr, response);
181 |                     break;
182 |                 case 3:
183 |                     executeStrategy(RedirectStrategy.getInstance(), urlStr, response);
184 |                     break;
185 |                 case 4:
186 |                     executeStrategy(ClientErrorStrategy.getInstance(), urlStr, response);
187 |                     break;
188 |                 case 5:
189 |                     executeStrategy(ServerErrorStrategy.getInstance(), urlStr, response);
190 |                     break;
191 |             } // any other status class runs no strategy
192 |         } catch (IOException e) {
193 |             log.error("IO出错[{}]", urlStr, e);
194 |         }
195 |     }
196 | 
197 |     /**
198 |      * Fetches {@code urlStr} and returns its decoded body. Returns null for a null or non-http URL,
199 |      * and when no response entity was stored for this request (SuccessStrategy stores it on 2xx).
200 |      * */
201 |     public String getContent(String urlStr) {
202 |         if (urlStr == null || !urlStr.startsWith("http")) { // only non-null http(s) URLs are fetched
203 |             return null;
204 |         }
205 |         // Avoids a handshake alert during SSL: http://dovov.com/ssljava-1-7-0unrecognized_name.html
206 |         if (urlStr.startsWith("https")) {
207 |             System.setProperty("jsse.enableSNIExtension", "false");
208 |         }
209 |         String content = null;
210 |         try {
211 |             // Forget the entity of an earlier request, so a non-2xx or failed request cannot hand back that stale page
212 |             httpEntityContainer.set(null);
213 |             httpGetContainer.set(getHttpGet(urlStr));
214 |             sendRequest(urlStr);
215 |             HttpEntity httpEntity = httpEntityContainer.get();
216 |             if (httpEntity == null) {
217 |                 log.error("HttpEntity为空");
218 |                 return null;
219 |             }
220 |             content = parseStream(httpEntity.getContent(), httpEntity);
221 |         } catch (IOException e) {
222 |             log.error("获取响应流失败", e);
223 |         } catch (Exception e) {
224 |             log.error("获取内容异常", e);
225 |         } finally {
226 |             httpGetContainer.get().releaseConnection();
227 |             httpGetContainer.remove();
228 |         }
229 |         return content;
230 |     }
231 | 
232 |     /**
233 |      * Reads the whole response stream and decodes it to a String: the charset of the Content-Type
234 |      * header wins, otherwise one sniffed from a {@code <meta>} tag inside {@code <head>}, otherwise UTF-8.
235 |      * */
236 |     private String parseStream(InputStream inputStream, HttpEntity httpEntity) {
237 |         String pageContent = null;
238 |         // Charset lookup order: 1. the Content-Type response header 2. the meta tag of the returned HTML
239 |         ByteArrayBuffer byteArrayBuffer = new ByteArrayBuffer(4096);
240 |         byte[] tempStore = new byte[4096];
241 |         int count;
242 |         try {
243 |             // read(tempStore) fills the array from index 0 again and returns the number of bytes read
244 |             while ((count = inputStream.read(tempStore)) != -1) {
245 |                 byteArrayBuffer.append(tempStore, 0, count);
246 |             }
247 |             // TODO: the code below was copied from https://github.com/xjtushilei/ScriptSpider
248 |             // Turn the collected bytes into a String using the detected charset
249 |             String charset = "UTF-8";
250 |             ContentType contentType = ContentType.getOrDefault(httpEntity);
251 |             Charset charsets = contentType.getCharset();
252 |             pageContent = new String(byteArrayBuffer.toByteArray(), charset); // UTF-8 draft for sniffing and fallback, not the platform default
253 |             // The Content-Type header carries a charset: use it directly
254 |             if (null != charsets) {
255 |                 charset = charsets.toString();
256 |             } else {
257 |                 // HttpClient's own detection proved unreliable, so the meta tag is parsed by hand
258 |                 Pattern pattern = Pattern.compile("<head>([\\s\\S]*?)<meta([\\s\\S]*?)charset\\s*=(\")?(.*?)\"");
259 |                 Matcher matcher = pattern.matcher(pageContent.toLowerCase(java.util.Locale.ROOT)); // locale-independent, keeps names like "ISO-8859-1" valid
260 |                 if (matcher.find()) {
261 |                     charset = matcher.group(4);
262 |                 }
263 |             }
264 |             pageContent = new String(byteArrayBuffer.toByteArray(),charset);
265 |         } catch (IOException e) {
266 |             log.error("处理流失败", e); // an unsupported charset name also lands here; the UTF-8 draft is returned
267 |         }
268 |         return pageContent;
269 |     }
270 | 
271 |     /**
272 |      * Runs the given status strategy for this URL and response.
273 |      *
274 |      * */
275 |     private void executeStrategy(StatusHandler statusHandler, String url, HttpResponse response) {
276 |         statusHandler.process(url, response);
277 |     }
278 | 
279 |     /**
280 |      * 2XX strategy.
281 |      * Runs when a response was obtained successfully: stores its entity in httpEntityContainer.
282 |      *
283 |      * */
284 |     public static class SuccessStrategy implements StatusHandler {
285 | 
286 |         private static final StatusHandler statusHandler = new SuccessStrategy();
287 | 
288 |         static StatusHandler getInstance() {
289 |             return statusHandler;
290 |         }
291 | 
292 |         @Override
293 |         public void process(String url, HttpResponse response) {
294 |             httpEntityContainer.set(response.getEntity());
295 |         }
296 | 
297 |     }
298 | 
299 |     /**
300 |      * 3XX 策略
301 |      * 重定向时对应的执行策略
302 |      *
303 |      * */
304 |     public static class RedirectStrategy implements StatusHandler {
305 | 
306 |         private static final StatusHandler statusHandler = new RedirectStrategy();
307 | 
308 |         static StatusHandler getInstance() {
309 |             return statusHandler;
310 |         }
311 | 
312 |         @Override
313 |         public void process(String url, HttpResponse response) {
314 |             Header location = response.getFirstHeader("Location");
315 |             // 将location对应的URL放到仓库中
316 |             log.error("301: 资源[{}]已被重定向[{}]", url, location.getValue());
317 |         }
318 | 
319 |     }
320 | 
321 |     /**
322 |      * 4XX strategy.
323 |      * Mainly covers 401 (authentication required), 403 (access forbidden) and 404 (resource not found).
324 |      * A 403 tends to show up after too many requests.
325 |      * TODO: on 401/403 move the resource to a low-priority queue or a message queue for separate handling.
326 |      * */
327 |     public static class ClientErrorStrategy implements StatusHandler {
328 | 
329 |         private static final StatusHandler statusHandler = new ClientErrorStrategy();
330 | 
331 |         static StatusHandler getInstance() {
332 |             return statusHandler;
333 |         }
334 | 
335 |         @Override
336 |         public void process(String url, HttpResponse response) {
337 |             int status = response.getStatusLine().getStatusCode();
338 |             // Log the code that was actually received; a 403 used to be reported as "401"
339 |             if (status == 401 || status == 403) {
340 |                 log.warn("{}: 无权访问此资源[{}]", status, url);
341 |             } else if (status == 404) {
342 |                 log.warn("404: 请求的资源不存在[{}]", url);
343 |             }
344 |         }
345 | 
346 |     }
347 | 
348 |     /**
349 |      * 5XX strategy.
350 |      * The remote server failed: the calling thread sleeps for Retry-After seconds (20 when absent or not a number). TODO
351 |      * */
352 |     public static class ServerErrorStrategy implements StatusHandler {
353 | 
354 |         private static final StatusHandler statusHandler = new ServerErrorStrategy();
355 | 
356 |         static StatusHandler getInstance() {
357 |             return statusHandler;
358 |         }
359 | 
360 |         @Override
361 |         public void process(String url, HttpResponse response) {
362 |             log.error("500: 远端服务器出错[{}]", url);
363 |             Header retryAfter = response.getFirstHeader("Retry-After");
364 |             String retryValue = retryAfter == null ? null : retryAfter.getValue();
365 |             // Retry-After may be an HTTP-date instead of seconds; only a short digit run is parsed, anything else keeps 20s
366 |             long waitSeconds = retryValue != null && retryValue.trim().matches("\\d{1,9}") ? Long.parseLong(retryValue.trim()) : 20;
367 |             log.info("由于远程服务器出错，爬虫休息 [{}] 秒后，尝试继续执行任务.....", waitSeconds);
368 |             try {
369 |                 TimeUnit.SECONDS.sleep(waitSeconds);
370 |             } catch (InterruptedException e) {
371 |                 log.error("sleep error", e);
372 |                 Thread.currentThread().interrupt(); // keep the interrupt visible to the caller
373 |             }
374 |         }
375 | 
376 |     }
377 | 
378 |     /**
379 |      * Test HttpUtils (the body is currently empty).
380 |      *
381 |      *  Flow: HttpClient sends a GET with the prepared HttpGet and obtains an HttpEntity, which yields the response content and headers.
382 |      *  The charset is taken from the Content-Type response header; if the header has none, it is parsed from a meta tag in the content.
383 |      *  The byte array is then turned into a String using that charset.
384 |      * */
385 |     public static void main(String[] args) {
386 |     }
387 | 
388 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/util/http/StatusHandler.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util.http;
 2 | 
 3 | import org.apache.http.HttpResponse;
 4 | 
 5 | /**
 6 |  * @author: JS
 7 |  * @date: 2019/4/12
 8 |  * @description: strategy for handling a response according to its status code
 9 |  */
10 | public interface StatusHandler {
11 | 
12 |     void process(String URL, HttpResponse response); // URL: the address that was requested; response: the reply received for it
13 | 
14 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/java/com/jinshuai/util/http/UserAgentArray.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util.http;
 2 | 
 3 | /**
 4 |  * @author: JS
 5 |  * @date: 2018/3/23
 6 |  * @description:
 7 |  *  A collection of user-agent strings, meant to keep the remote side from throttling repeated requests.
 8 |  *  TODO: move them into a file
 9 |  */
10 | public class UserAgentArray {
11 | 
12 |     public static final String[] USER_AGENT = {
13 |             "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
14 |             "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
15 |             "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
16 |             "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
17 |             "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
18 |             "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
19 |             "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
20 |             "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
21 |             "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
22 |             "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
23 |             "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
24 |             "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
25 |             "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
26 |             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
27 |             "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
28 |             "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
29 |     };
30 | 
31 | }


--------------------------------------------------------------------------------
/spider-consumer/src/main/resources/application.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinshuai86/Spider/5c88c1f9aecc5624d548c18733a36533bd4d81bf/spider-consumer/src/main/resources/application.properties


--------------------------------------------------------------------------------
/spider-consumer/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!--scan:
 3 |             当此属性设置为true时，配置文件如果发生改变，将会被重新加载，默认值为true。
 4 | scanPeriod:
 5 |             设置监测配置文件是否有修改的时间间隔，如果没有给出时间单位，默认单位是毫秒。当scan为true时，此属性生效。默认的时间间隔为1分钟。
 6 | debug:
 7 |             当此属性设置为true时，将打印出logback内部日志信息，实时查看logback运行状态。默认值为false。
 8 | 
 9 | configuration 子节点为 appender、logger、root
10 | 
11 |             -->
12 | <configuration scan="true" scanPeriod="60 seconds" debug="false">
13 | 
14 |     <!--用于区分不同应用程序的记录-->
15 |     <contextName>spider-consumer</contextName>
16 | 
17 |     <!--日志文件所在目录，如果是tomcat，如下写法日志文件会在${TOMCAT_HOME}/bin/logs/目录下-->
18 |     <property name="LOG_HOME" value="logs/consumer"/>
19 | 
20 |     <!--控制台-->
21 |     <appender name="stdout" class="ch.qos.logback.core.ConsoleAppender">
22 |         <encoder>
23 |             <!--格式化输出：%d表示日期，%thread表示线程名，%-5level：级别从左显示5个字符宽度 %logger输出日志的logger名 %msg：日志消息，%n是换行符 -->
24 |             <pattern>[%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n</pattern>
25 |             <!--解决乱码问题-->
26 |             <charset>UTF-8</charset>
27 |         </encoder>
28 |         <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
29 |             <level>INFO</level>
30 |             <onMatch>ACCEPT</onMatch>
31 |             <onMismatch>DENY</onMismatch>
32 |         </filter>
33 |     </appender>
34 | 
35 |     <!--滚动文件 info以上的信息-->
36 |     <!---->
37 |     <!--<appender name="infoFile" class="ch.qos.logback.core.rolling.RollingFileAppender">-->
38 |         <!--&lt;!&ndash; ThresholdFilter:临界值过滤器，过滤掉 TRACE 和 DEBUG 级别的日志 &ndash;&gt;-->
39 |         <!--<filter class="ch.qos.logback.classic.filter.ThresholdFilter">-->
40 |             <!--<level>INFO</level>-->
41 |             <!--<onMatch>ACCEPT</onMatch>-->
42 |             <!--<onMismatch>DENY</onMismatch>-->
43 |         <!--</filter>-->
44 |         <!--<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">-->
45 |             <!--<fileNamePattern>${LOG_HOME}/log.%d{yyyy-MM-dd}.log</fileNamePattern>-->
46 |             <!--<maxHistory>30</maxHistory>&lt;!&ndash;保存最近30天的日志&ndash;&gt;-->
47 |         <!--</rollingPolicy>-->
48 |         <!--<encoder>-->
49 |             <!--<charset>UTF-8</charset>-->
50 |             <!--<pattern>[%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n</pattern>-->
51 |         <!--</encoder>-->
52 |     <!--</appender>-->
53 | 
54 |     <!--滚动文件 异常日志-->
55 |     <appender name="errorFile" class="ch.qos.logback.core.rolling.RollingFileAppender">
56 |         <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
57 |             <level>ERROR</level>
58 |         </filter>
59 |         <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
60 |             <fileNamePattern>${LOG_HOME}/error.%d{yyyy-MM-dd}.log</fileNamePattern>
61 |             <maxHistory>30</maxHistory>
62 |         </rollingPolicy>
63 |         <encoder>
64 |             <charset>UTF-8</charset>
65 |             <pattern>[%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n</pattern>
66 |         </encoder>
67 |     </appender>
68 | 
69 |     <!--这里如果是info，spring、mybatis等框架则不会输出：TRACE < DEBUG < INFO <  WARN < ERROR-->
70 |     <!--root是所有logger的祖先，均继承root，如果某一个自定义的logger没有指定level，就会寻找
71 |     父logger看有没有指定级别，直到找到root。-->
72 |     <root level="debug">
73 |         <appender-ref ref="stdout"/>
74 |         <!--<appender-ref ref="infoFile"/>-->
75 |         <appender-ref ref="errorFile"/>
76 |         <!--<appender-ref ref="logstash"/>-->
77 |     </root>
78 | 
79 |     <!--为某个包单独配置logger
80 | 
81 |     比如定时任务，写代码的包名为：com.seentao.task
82 |     步骤如下：
83 | 
84 |     1、定义一个appender，取名为task（随意，只要下面logger引用就行了）
85 |     appender的配置按照需要即可
86 | 
87 |     2、定义一个logger:
88 |     <logger name="com.seentao.task" level="DEBUG" additivity="false">
89 |       <appender-ref ref="task" />
90 |     </logger>
91 |     注意：additivity必须设置为false，这样只会交给task这个appender，否则其他appender也会打印com.seentao.task里的log信息。
92 | 
93 |     3、这样，在com.seentao.task的logger就会是上面定义的logger了。
94 |     private static Logger logger = LoggerFactory.getLogger(Class1.class);
95 |     -->
96 | </configuration>


--------------------------------------------------------------------------------
/spider-core/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 | 
  5 |     <parent>
  6 |         <artifactId>spider</artifactId>
  7 |         <groupId>com.jinshuai</groupId>
  8 |         <version>1.0-SNAPSHOT</version>
  9 |     </parent>
 10 | 
 11 | 
 12 |   <groupId>com.jinshuai.core</groupId>
 13 |   <artifactId>spider-core</artifactId>
 14 |   <version>1.0</version>
 15 |   <packaging>jar</packaging>
 16 | 
 17 |   <name>spider-core</name>
 18 | 
 19 |   <url>https://github.com/jinshuai86/Spider</url>
 20 | 
 21 |   <properties>
 22 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 23 |   </properties>
 24 | 
 25 |   <build>
 26 |   <plugins>
 27 |     <!--  添加编译插件支持jdk1.8 -->
 28 |     <plugin>
 29 |       <groupId>org.apache.maven.plugins</groupId>
 30 |       <artifactId>maven-compiler-plugin</artifactId>
 31 |       <configuration>
 32 |         <source>1.8</source>
 33 |         <target>1.8</target>
 34 |         <encoding>UTF-8</encoding>
 35 |       </configuration>
 36 |     </plugin>
 37 |     <!-- 设置入口类 -->
 38 |     <plugin>
 39 |       <groupId>org.apache.maven.plugins</groupId>
 40 |       <artifactId>maven-shade-plugin</artifactId>
 41 |       <version>1.2.1</version>
 42 |       <executions>
 43 |         <execution>
 44 |           <phase>package</phase>
 45 |           <goals>
 46 |             <goal>shade</goal>
 47 |           </goals>
 48 |           <configuration>
 49 |             <transformers>
 50 |               <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
 51 |                 <mainClass>com.jinshuai.Spider</mainClass>
 52 |               </transformer>
 53 |             </transformers>
 54 |           </configuration>
 55 |         </execution>
 56 |       </executions>
 57 |     </plugin>
 58 |   </plugins>
 59 |   </build>
 60 | 
 61 |   <dependencies>
 62 |     <!-- test case -->
 63 |     <dependency>
 64 |       <groupId>junit</groupId>
 65 |       <artifactId>junit</artifactId>
 66 |       <version>4.12</version>
 67 |       <scope>test</scope>
 68 |     </dependency>
 69 |     <!--slf4j logback-->
 70 |     <dependency>
 71 |       <groupId>ch.qos.logback</groupId>
 72 |       <artifactId>logback-classic</artifactId>
 73 |       <version>1.2.3</version>
 74 |     </dependency>
 75 |     <!-- send http request-->
 76 |     <dependency>
 77 |       <groupId>org.apache.httpcomponents</groupId>
 78 |       <artifactId>httpclient</artifactId>
 79 |       <version>4.5.5</version>
 80 |     </dependency>
 81 |     <!-- parse DOM -->
 82 |     <dependency>
 83 |       <groupId>org.jsoup</groupId>
 84 |       <artifactId>jsoup</artifactId>
 85 |       <version>1.11.2</version>
 86 |     </dependency>
 87 |     <!-- store urlSeed -->
 88 |     <dependency>
 89 |       <groupId>redis.clients</groupId>
 90 |       <artifactId>jedis</artifactId>
 91 |       <version>2.9.0</version>
 92 |     </dependency>
 93 |     <!-- parse json -->
 94 |     <dependency>
 95 |       <groupId>com.google.code.gson</groupId>
 96 |       <artifactId>gson</artifactId>
 97 |       <version>2.8.0</version>
 98 |     </dependency>
 99 |     <!-- MySQL -->
100 |     <dependency>
101 |       <groupId>mysql</groupId>
102 |       <artifactId>mysql-connector-java</artifactId>
103 |       <version>8.0.11</version>
104 |     </dependency>
105 |     <!-- use its bloom filter -->
106 |     <dependency>
107 |       <groupId>com.google.guava</groupId>
108 |       <artifactId>guava</artifactId>
109 |       <version>26.0-jre</version>
110 |     </dependency>
111 |     <!-- use its log function -->
112 |     <dependency>
113 |       <groupId>org.projectlombok</groupId>
114 |       <artifactId>lombok</artifactId>
115 |       <version>1.18.2</version>
116 |       <scope>provided</scope>
117 |     </dependency>
118 |     <!-- handle excel -->
119 |     <dependency>
120 |       <groupId>org.apache.poi</groupId>
121 |       <artifactId>poi-ooxml</artifactId>
122 |       <version>3.17</version>
123 |     </dependency>
124 |     <!-- super datetime -->
125 |     <dependency>
126 |       <groupId>joda-time</groupId>
127 |       <artifactId>joda-time</artifactId>
128 |       <version>2.9.9</version>
129 |     </dependency>
130 |     <!-- tokenizer -->
131 |     <dependency>
132 |       <groupId>org.ansj</groupId>
133 |       <artifactId>ansj_seg</artifactId>
134 |       <version>5.1.1</version>
135 |     </dependency>
136 |     <!-- RocketMQ -->
137 |     <dependency>
138 |       <groupId>org.apache.rocketmq</groupId>
139 |       <artifactId>rocketmq-client</artifactId>
140 |       <version>4.4.0</version>
141 |     </dependency>
142 |   </dependencies>
143 | 
144 | </project>


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/Spider.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai;
  2 | 
  3 | import com.jinshuai.core.downloader.Downloader;
  4 | import com.jinshuai.core.downloader.impl.HttpClientPoolDownloader;
  5 | import com.jinshuai.core.parser.Parser;
  6 | import com.jinshuai.core.parser.impl.NewsParser;
  7 | import com.jinshuai.core.saver.Saver;
  8 | import com.jinshuai.core.saver.impl.TextSaver;
  9 | import com.jinshuai.core.scheduler.Scheduler;
 10 | import com.jinshuai.core.scheduler.impl.PriorityQueueScheduler;
 11 | import com.jinshuai.core.scheduler.impl.RedisScheduler;
 12 | import com.jinshuai.entity.Page;
 13 | import com.jinshuai.entity.UrlSeed;
 14 | import com.jinshuai.util.PropertiesUtils;
 15 | import lombok.extern.slf4j.Slf4j;
 16 | 
 17 | import java.util.concurrent.*;
 18 | 
 19 | 
 20 | /**
 21 |  * @author: JS
 22 |  * @date: 2018/3/27
 23 |  * @description: 程序启动入口
 24 |  */
 25 | @Slf4j
 26 | public class Spider {
 27 |     
 28 |     /**
 29 |      * 设置爬虫组件：scheduler、downloader、parser、saver、
 30 |      */
 31 |     private Scheduler scheduler;
 32 |     private Downloader downloader;
 33 |     private Parser parser;
 34 |     private Saver saver;
 35 | 
 36 |     /**
 37 |      * 初始目标任务量
 38 |      * */
 39 |     private static long targetTaskNumbers = 800;
 40 | 
 41 |     /**
 42 |      * 线程池参数配置
 43 |      */
 44 |     private ThreadPoolExecutor pool;
 45 |     private static final int CORE_POOL_SIZE = Runtime.getRuntime().availableProcessors() * 2;
 46 |     private static final int MAX_POOL_SIZE = Runtime.getRuntime().availableProcessors() * 4;
 47 |     private static final long KEEP_ALIVE_TIME = 1500L;
 48 |     private static final int MAX_QUEUE_SIZE = 100;
 49 | 
 50 |     /**
 51 |      * 最多只有MAX_QUEUE_SIZE + MAX_POOL_SIZE个任务并发执行 -> 控制任务的提交速率
 52 |      */
 53 |     private Semaphore semaphore = new Semaphore(MAX_QUEUE_SIZE + MAX_POOL_SIZE);
 54 | 
 55 |     private Spider setScheduler(Scheduler scheduler) {
 56 |         if (scheduler == null) {
 57 |             log.error("未设置调度器，启动失败");
 58 |             System.exit(-1);
 59 |         }
 60 |         this.scheduler = scheduler;
 61 |         return this;
 62 |     }
 63 | 
 64 |     private Spider setDownloader(Downloader downloader) {
 65 |         if (downloader == null) {
 66 |             log.error("未设置下载器，启动失败");
 67 |             System.exit(-1);
 68 |         }
 69 |         this.downloader = downloader;
 70 |         return this;
 71 |     }
 72 | 
 73 |     private Spider setParser(Parser parser) {
 74 |         if (parser == null) {
 75 |             log.error("未设置解析器，启动失败");
 76 |             System.exit(-1);
 77 |         }
 78 |         this.parser = parser;
 79 |         return this;
 80 |     }
 81 | 
 82 |     private Spider setSaver(Saver saver) {
 83 |         if (saver == null) {
 84 |             log.error("未设置保存器，启动失败");
 85 |             System.exit(-1);
 86 |         }
 87 |         this.saver = saver;
 88 |         return this;
 89 |     }
 90 | 
 91 |     private Spider setThreadPool() {
 92 |         pool = new ThreadPoolExecutor(CORE_POOL_SIZE, MAX_POOL_SIZE, KEEP_ALIVE_TIME, TimeUnit.MILLISECONDS,
 93 |                 new LinkedBlockingQueue<>(MAX_QUEUE_SIZE));
 94 | 
 95 |         return this;
 96 |     }
 97 | 
 98 |     private Spider addUrlSeed(UrlSeed urlSeed) {
 99 |         if (urlSeed == null) {
100 |             log.error("未添加初始种子，启动失败");
101 |             System.exit(-1);
102 |         }
103 |         scheduler.push(urlSeed);
104 |         return this;
105 |     }
106 | 
107 |     private Spider setTargetTaskNumbers() {
108 |         String configTargetNum = PropertiesUtils.getInstance().get("targetNum");
109 |         if (configTargetNum != null && !configTargetNum.trim().equals("")) {
110 |             try {
111 |                 targetTaskNumbers = Long.valueOf(configTargetNum);
112 |                 if (targetTaskNumbers <= 0) {
113 |                     log.error("无效的目标任务数量:[{}]", targetTaskNumbers);
114 |                 }
115 |             } catch (Exception e) {
116 |                 log.error("无效的目标任务数量:[{}]，使用默认值", configTargetNum, e);
117 |             }
118 |         }
119 |         return this;
120 |     }
121 | 
122 |     private void run() {
123 |         log.info("爬虫启动......");
124 |         Runtime.getRuntime().addShutdownHook(new Thread(()->{
125 |             pool.shutdown(); // clean resource
126 |         }));
127 |         UrlSeed urlSeed = null;
128 |         while (true) {
129 |             try {
130 |                 // the url_store has no url and there is no active thread
131 |                 if ((urlSeed = scheduler.pop()) == null && pool.getActiveCount() == 0 && pool.getQueue().size() == 0) {
132 |                     pool.shutdown();
133 |                     log.info("解析完毕，正在停止......");
134 |                     System.exit(-1); //TODO 为了停止生产者，可以改为轮询标志位
135 |                     break;
136 |                 } else if (urlSeed == null) {
137 |                     log.info("种子仓库已无种子，等待中......");
138 |                     TimeUnit.SECONDS.sleep(1);
139 |                 } else {
140 |                     log.info("准备解析URL:[{}]，优先级(默认5):[{}]", urlSeed.getUrl(), urlSeed.getPriority());
141 |                     semaphore.acquire();
142 |                     pool.execute(new SpiderWork(urlSeed));
143 |                 }
144 |                 if (pool.getCompletedTaskCount() >= targetTaskNumbers && urlSeed == null && pool.getQueue().size() == 0) {
145 |                     pool.shutdown();
146 |                     log.info("达到目标，正在停止......");
147 |                     System.exit(-1); //TODO 为了停止生产者，可以改为轮询标志位
148 |                 }
149 |             } catch (InterruptedException e) {
150 |                 log.error("当前线程被中断", e); //TODO
151 |             } catch (RejectedExecutionException e) {
152 |                 log.error("拒绝此次提交的任务[{}]", urlSeed, e);
153 |                 semaphore.release();
154 |             }
155 |         }
156 |     }
157 | 
158 |     private class SpiderWork implements Runnable {
159 | 
160 |         private UrlSeed urlSeed;
161 | 
162 |         SpiderWork(UrlSeed urlSeed) {
163 |             this.urlSeed = urlSeed;
164 |         }
165 | 
166 |         public void run() {
167 |             try {
168 |                 log.info("已完成任务数量:[{}]，运行中线程数量：[{}]，最大线程运行数量: [{}]，工作队列任务数量：[{}]",
169 |                         pool.getCompletedTaskCount(), pool.getActiveCount(), pool.getMaximumPoolSize(), pool.getQueue().size());
170 |                 Page page = downloader.download(urlSeed);
171 |                 parser.parse(page);
172 |                 // add new url to scheduler
173 |                 page.getUrlSeeds().forEach(seed -> scheduler.push(seed));
174 |                 saver.save(page);
175 |             } finally {
176 |                 semaphore.release();
177 |             }
178 |         }
179 |     }
180 | 
181 |     private static Spider build() {
182 | 
183 |         return new Spider()
184 |                 .setTargetTaskNumbers()
185 |                 .setDownloader(new HttpClientPoolDownloader())
186 |                 .setParser(new NewsParser())
187 |                 .setSaver(new TextSaver())
188 | //                .setScheduler(new RedisScheduler())
189 |                 .setScheduler(new PriorityQueueScheduler(targetTaskNumbers))
190 |                 .setThreadPool();
191 | 
192 | 
193 |     }
194 | 
195 |     /**
196 |      * Test
197 |      *
198 |      * How a thread pool handles a submitted task:
199 |      * if fewer than corePoolSize threads are active, a new thread is started for the task; otherwise
200 |      * if the work queue is not full, the task is put into the work queue; if the queue is full and
201 |      * fewer than maximumPoolSize threads are running, a new thread is started; otherwise
202 |      * the registered rejection policy is invoked.
203 |      *
204 |      */
205 |     public static void main(String[] args) {
206 |         Spider.build()
207 |                 .addUrlSeed(new UrlSeed("http://xww.hebut.edu.cn/gdyw/index.htm"))
208 |                 .run();
209 |     }
210 | 
211 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/README.md:
--------------------------------------------------------------------------------
1 | - 下载器Downloader根据种子调度器Scheduler提供的种子UrlSeed进行下载，并将响应内容封装成一个Page
2 | - 解析器Parser解析下载器返回的Page，提取页面内容和新的种子UrlSeed
3 | - 持久器Saver将Page持久化


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/downloader/Downloader.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.downloader;
 2 | 
 3 | import com.jinshuai.entity.Page;
 4 | import com.jinshuai.entity.UrlSeed;
 5 | 
 6 | /**
 7 |  * Downloader contract; different download strategies can be built against this interface.
 8 |  * @see com.jinshuai.core.downloader.impl.HttpClientPoolDownloader
 9 |  */
10 | public interface Downloader {
11 | 
12 |     /**
13 |      * @param urlSeed  seed whose URL is to be fetched
14 |      * @return the response body wrapped in a Page; HttpClientPoolDownloader returns null when the download fails
15 |      */
16 |     Page download(UrlSeed urlSeed);
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/downloader/impl/HttpClientPoolDownloader.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.downloader.impl;
 2 | 
 3 | import com.jinshuai.core.downloader.Downloader;
 4 | import com.jinshuai.entity.Page;
 5 | import com.jinshuai.entity.UrlSeed;
 6 | import com.jinshuai.util.http.HttpUtils;
 7 | import lombok.extern.slf4j.Slf4j;
 8 | import org.jsoup.Jsoup;
 9 | import org.jsoup.nodes.Document;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 | 
13 | /**
14 |  * @author: JS
15 |  * @date: 2018/3/26
16 |  * @description:
17 |  *  通过Http连接池下载
18 |  */
19 | @Slf4j
20 | public class HttpClientPoolDownloader implements Downloader {
21 | 
22 |     public Page download(UrlSeed urlSeed) {
23 |         Page page = null;
24 |         try {
25 |             String html = HttpUtils.getSingleInstance().getContent(urlSeed.getUrl());
26 |             Document document = Jsoup.parse(html, urlSeed.getUrl());
27 |             page = new Page(urlSeed, document);
28 |         } catch (Exception e) {
29 |             log.error("下载器下载的相应文本获取DOM树失败", e);
30 |         }
31 |         return page;
32 |     }
33 | 
34 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/parser/Parser.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.parser;
 2 | 
 3 | import com.jinshuai.entity.Page;
 4 | 
 5 | /**
 6 |  * @author JS
 7 |  * @date 2018/03/26
 8 |  * @description
 9 |  *  Parses a Page.
10 |  * */
11 | public interface Parser {
12 | 
13 |     /**
14 |      * @param page the Page to parse
15 |      * @return the parsed Page (carrying a Map of items and a Set of seeds)
16 |      * @description extracts the content of the Page's Document into a Map and the URLs it contains into a Set
17 |      * */
18 |     Page parse(Page page);
19 | 
20 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/parser/impl/BaiKeParser.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.parser.impl;
 2 | 
 3 | import com.jinshuai.core.parser.Parser;
 4 | import com.jinshuai.entity.Page;
 5 | 
 6 | /**
 7 |  * @author: JS
 8 |  * @date: 2018/3/29
 9 |  * @description:
10 |  *  Parser for Baidu Baike pages. TODO: not implemented yet.
11 |  */
12 | public class BaiKeParser implements Parser {
13 | 
14 |     /**
15 |      * Placeholder implementation: hands the page back untouched instead of null,
16 |      * so callers that use the parsed page do not hit a NullPointerException.
17 |      *
18 |      * @param page page to parse
19 |      * @return the same page, unmodified
20 |      */
21 |     @Override
22 |     public Page parse(Page page) {
23 |         return page;
24 |     }
25 | 
26 | }
20 | 


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/parser/impl/NewsParser.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.core.parser.impl;
  2 | 
  3 | import com.jinshuai.core.downloader.impl.HttpClientPoolDownloader;
  4 | import com.jinshuai.core.parser.Parser;
  5 | import com.jinshuai.entity.Page;
  6 | import com.jinshuai.entity.UrlSeed;
  7 | import lombok.extern.slf4j.Slf4j;
  8 | import org.joda.time.DateTime;
  9 | import org.joda.time.DateTimeUtils;
 10 | import org.jsoup.nodes.Document;
 11 | import org.jsoup.nodes.Element;
 12 | import org.slf4j.Logger;
 13 | import org.slf4j.LoggerFactory;
 14 | 
 15 | import java.util.*;
 16 | 
 17 | /**
 18 |  * @author: JS
 19 |  * @date: 2018/3/26
 20 |  * @description:
 21 |  *  针对hebut新闻类的网页，解析相应内容。
 22 |  */
 23 | @Slf4j
 24 | public class NewsParser implements Parser {
 25 | 
 26 |     private static volatile int firstTime = 0;
 27 | 
 28 |     // TODO: 待优化解析过程
 29 |     public Page parse(Page page) {
 30 |         // 获取DOM树
 31 |         Document document;
 32 |         try {
 33 |             document = page.getDocument();
 34 |             long priority = timestamp2Priority(document);
 35 |             // 种子,并进行预处理
 36 |             Set<UrlSeed> urlSeeds = new HashSet<>();
 37 |             Iterator seedIterator = document.getElementsByTag("a").iterator();
 38 |             while (seedIterator.hasNext()) {
 39 |                 Element element3 = (Element) seedIterator.next();
 40 |                 String href = element3.attr("href");
 41 |                 if (href.contains("http://www.hebut.edu.cn/")|| href.contains("/")  || href.contains("#") || href.contains("index.htm") || href.contains("javascript:void(0);")) continue;
 42 |                 if ("http://xww.hebut.edu.cn/".equals(page.getUrlSeed().getUrl())) continue;
 43 |                 urlSeeds.add(new UrlSeed("http://xww.hebut.edu.cn/gdyw/" + href, priority));
 44 |             }
 45 |             page.setUrlSeeds(urlSeeds);
 46 |             if ("http://xww.hebut.edu.cn/".equals(page.getUrlSeed().getUrl())) {
 47 |                 return page;
 48 |             }
 49 |             Map<String, String> items = new HashMap<String, String>(3);
 50 |             // 标题
 51 |             Element titleElement = document.selectFirst("div.sub_articleTitle");
 52 |             items.put("title", titleElement.getElementsByTag("h2").text());
 53 |             // 时间
 54 |             Element dateElement = document.selectFirst("div.sub_articleAuthor");
 55 |             items.put("date", dateElement.getElementsByTag("strong").eachText().get(0));
 56 |             // 正文
 57 |             Element textElement = document.selectFirst("div.sub_articleInfo");
 58 |             Iterator textIterator = textElement.getElementsByTag("span").iterator();
 59 |             StringBuilder stringBuilder = new StringBuilder();
 60 |             while (textIterator.hasNext()) {
 61 |                 Element element3 = (Element) textIterator.next();
 62 |                 stringBuilder.append(element3.text());
 63 |             }
 64 |             items.put("content", stringBuilder.toString());
 65 |             page.setItems(items);
 66 |         } catch (Exception e) {
 67 |             log.error("解析页面[{}]出错",page.getUrlSeed().getUrl(),e);
 68 |         } finally {
 69 |             return page;
 70 |         }
 71 |     }
 72 | 
 73 |     /**
 74 |      * 该Page中的url时间戳参考该Page的时间戳计算优先级
 75 |      * */
 76 |     private long timestamp2Priority(Document document) {
 77 |         String date;
 78 |         try {
 79 |             date = document.selectFirst("div.sub_articleAuthor").getElementsByTag("strong").eachText().get(0);
 80 |         } catch (Exception e) {
 81 |             log.error("解析页面异常",e);
 82 |             return 5;
 83 |         }
 84 |         DateTime dateTime = new DateTime(date);
 85 |         // 获取时间戳的差值
 86 |         long v = DateTimeUtils.currentTimeMillis() - dateTime.getMillis();
 87 |         // 换算成天数
 88 |         v /= 86400000;
 89 |         // 发布时间超过10天设置低的优先级：3，10天：5，小于10天：3
 90 |         return v > 10 ? 3 : v == 10 ? 5 : 10;
 91 |     }
 92 | 
 93 |     private Page getHyperLinkTag(Page page) {
 94 |         if (page == null) {
 95 |             throw new RuntimeException("page 为空");
 96 |         }
 97 |         // 获取DOM树
 98 |         Document document = page.getDocument();
 99 |         // 如果是首页
100 |         if ("http://xww.hebut.edu.cn".equals(page.getUrlSeed().getUrl()) && firstTime == 0) {
101 |             Set<UrlSeed> urlSeeds = new HashSet<UrlSeed>();
102 |             Iterator seedIterator = document.getElementsByTag("a").iterator();
103 |             while (seedIterator.hasNext()) {
104 |                 Element element3 = (Element) seedIterator.next();
105 |                 String href = element3.attr("href");
106 |                 if (href.contains("#") || href.contains("index.html") || href.contains("javascript:void(0);")) continue;
107 |                 if (href.startsWith("gdyw") || href.startsWith("zhyw")) {
108 |                     urlSeeds.add(new UrlSeed("http://xww.hebut.edu.cn/" + href,
109 |                             (int) (Math.random() * 10)));
110 |                 }
111 |             }
112 |             page.setUrlSeeds(urlSeeds);
113 |             // 已经访问过首页
114 |             firstTime = 1;
115 |         }
116 |         return page;
117 |     }
118 |     /**
119 |      * test
120 |      * */
121 |     public static void main(String[] args) {
122 |         UrlSeed urlSeed = new UrlSeed("http://xww.hebut.edu.cn/gdyw/70772.htm",5);
123 |         Page page = new HttpClientPoolDownloader().download(urlSeed);
124 | //        Page page = new Page(new UrlSeed("http://xww.hebut.edu.cn/gdyw/70772.htm",5), Jsoup.parse("<html></html>","http://xww.hebut.edu.cn/gdyw/index.htm"));
125 |         System.out.println(new NewsParser().parse(page));
126 |     }
127 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/saver/Saver.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.saver;
 2 | 
 3 | import com.jinshuai.entity.Page;
 4 | 
 5 | /**
 6 |  * Persists the data extracted from a Page.
 7 |  * */
 8 | public interface Saver {
 9 | 
10 |     /**
11 |      * Persists the given page; how and where is up to the implementation.
12 |      * */
13 |     void save(Page page);
14 | 
15 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/saver/impl/DataBaseSaver.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.saver.impl;
 2 | 
 3 | import com.jinshuai.core.saver.Saver;
 4 | import com.jinshuai.entity.Page;
 5 | 
 6 | /**
 7 |  * @author: JS
 8 |  * @date: 2018/3/27
 9 |  * @description:
10 |  *  Saver meant to store pages in a database.
11 |  */
12 | public class DataBaseSaver implements Saver {
13 |     // Placeholder: nothing is persisted yet.
14 |     public void save(Page page) {
15 |         //TODO: not implemented; the page is currently ignored
16 |     }
17 | 
18 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/saver/impl/TextSaver.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.saver.impl;
 2 | 
 3 | import com.jinshuai.core.saver.Saver;
 4 | import com.jinshuai.entity.Page;
 5 | import com.jinshuai.util.PropertiesUtils;
 6 | import com.jinshuai.util.hash.PageUtils;
 7 | import lombok.extern.slf4j.Slf4j;
 8 | 
 9 | import java.io.File;
10 | import java.io.FileWriter;
11 | import java.io.IOException;
12 | import java.util.Date;
13 | 
14 | /**
15 |  * @author: JS
16 |  * @date: 2018/3/27
17 |  * @description:
18 |  *  存储到txt
19 |  */
20 | @Slf4j
21 | public class TextSaver implements Saver {
22 | 
23 |     private String parentDir;
24 | 
25 |     private PageUtils pageUtil = PageUtils.getInstance();
26 | 
27 |     private PropertiesUtils propertiesUtil = PropertiesUtils.getInstance();
28 | 
29 |     public TextSaver() {
30 |         init();
31 |     }
32 | 
33 |     /**
34 |      * 初始化文件要存的目录
35 |      * */
36 |     private void init() {
37 |         parentDir = PropertiesUtils.getInstance().get("dir");
38 |         File file = new File(parentDir);
39 |         if (!file.exists()) {
40 |             file.mkdirs();
41 |         }
42 |         log.info("解析后的文件存放位置：[{}]",parentDir);
43 |     }
44 | 
45 |     public void save(Page page) {
46 |         if (page == null) {
47 |             return;
48 |         }
49 |         // 文本相似度检测
50 |         String similarCheck = propertiesUtil.get("similarCheck");
51 |         if (similarCheck != null && !similarCheck.trim().equals("") &&similarCheck.equalsIgnoreCase("true")) {
52 |             String title = page.getItems().get("title");
53 |             String content = page.getItems().get("content");
54 |             if(pageUtil.exist(title, content)) {
55 |                 log.info("标题为 [{}] 的相似文章已经存在", title);
56 |             }
57 |         }
58 |         File file = new File(String.format("%s%s.txt",parentDir,new Date().getTime()));
59 |         try (FileWriter fw = new FileWriter(file)) {
60 |             if (page.getItems() == null) {
61 |                 fw.flush();
62 |                 return;
63 |             }
64 |             fw.append(String.format("[标题] %s\n",page.getItems().get("title")));
65 |             fw.append(String.format("[日期] %s\n", page.getItems().get("date")));
66 |             fw.append(String.format("[正文] %s\n",page.getItems().get("content")));
67 |             fw.append(String.format("[链接] %s\n",page.getUrlSeed().getUrl()));
68 |             fw.flush();
69 |         } catch (IOException e) {
70 |             log.error("存储路径无效",e);
71 |         }
72 |     }
73 | 
74 |     public static void main(String[] args) throws IOException {
75 | //        String parentDir = "E:/HEBUTNews/";
76 | //        File file = new File(parentDir+ (new Date().getTime()) + ".txt");
77 | //        //file.createNewFile();
78 | //        if (!file.getParentFile().exists()) {
79 | //            //file.getParentFile().mkdirs();
80 | //
81 | //        }
82 | //        FileWriter fileWriter = new FileWriter(file);
83 | //        fileWriter.append("fasdfs");
84 | //        fileWriter.flush();
85 |         Saver saver = new TextSaver();
86 | 
87 | //        new TextSaver().save(new Page(new UrlSeed("",5), Jsoup.parse("HTML","")).setItems(null));
88 |     }
89 | 
90 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/scheduler/Scheduler.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.scheduler;
 2 | 
 3 | import com.jinshuai.entity.UrlSeed;
 4 | 
 5 | /**
 6 |  * @author JS
 7 |  * @date 2018/03/26
 8 |  * @description:
 9 |  *  Seed scheduler: stores seeds and hands them out.
10 |  * */
11 | public interface Scheduler {
12 | 
13 |     /**
14 |      * Stores a seed.
15 |      * */
16 |     void push(UrlSeed urlSeed);
17 |     /**
18 |      * Hands out the next seed; PriorityQueueScheduler and RedisScheduler return null when none is available.
19 |      * */
20 |     UrlSeed pop();
21 | 
22 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/scheduler/impl/PriorityQueueScheduler.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.scheduler.impl;
 2 | 
 3 | import com.google.common.hash.BloomFilter;
 4 | import com.google.common.hash.Funnels;
 5 | import com.jinshuai.core.scheduler.Scheduler;
 6 | import com.jinshuai.entity.UrlSeed;
 7 | import lombok.extern.slf4j.Slf4j;
 8 | 
 9 | import java.nio.charset.Charset;
10 | import java.util.PriorityQueue;
11 | 
12 | /**
13 |  * @author: JS
14 |  * @date: 2018/10/19
15 |  * @description: Seed scheduler that combines a priority queue with a Bloom filter.
16 |  *  push and pop are synchronized: seeds are pushed from the crawler's worker threads,
17 |  *  and java.util.PriorityQueue is not safe for concurrent use.
18 |  */
19 | @Slf4j
20 | public class PriorityQueueScheduler implements Scheduler {
21 | 
22 |     /**
23 |      * Queue of pending seeds, ordered so that the highest priority is polled first (max-heap).
24 |      */
25 |     private final PriorityQueue<UrlSeed> urlQueue;
26 | 
27 |     /**
28 |      * Bloom filter used to detect URLs that were pushed before.
29 |      * Sized for the number of tasks passed to the constructor with a false-positive
30 |      * rate of 0.01, so a small share of new URLs may be dropped as "already seen".
31 |      */
32 |     private final BloomFilter<String> bloomFilter;
33 | 
34 |     /**
35 |      * @param targetNum expected number of distinct URLs, used to size the Bloom filter
36 |      */
37 |     public PriorityQueueScheduler(long targetNum) {
38 |         bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charset.forName("UTF-8")), targetNum, 0.01);
39 |         // Reversed argument order gives descending priority without negating the result.
40 |         urlQueue = new PriorityQueue<>(
41 |                 (o1, o2) -> Long.compare(o2.getPriority(), o1.getPriority())
42 |         );
43 |     }
44 | 
45 |     /**
46 |      * Queues the seed unless its URL was (probably) pushed before.
47 |      */
48 |     @Override
49 |     public synchronized void push(UrlSeed urlSeed) {
50 |         String url = urlSeed.getUrl();
51 |         // Check and insertion happen under one lock, so a URL cannot be queued twice by racing threads.
52 |         if (bloomFilter.mightContain(url)) {
53 |             return;
54 |         }
55 |         urlQueue.add(urlSeed);
56 |         bloomFilter.put(url);
57 |     }
58 | 
59 |     /**
60 |      * @return the seed with the highest priority, or null when the queue is empty
61 |      */
62 |     @Override
63 |     public synchronized UrlSeed pop() {
64 |         // poll() already returns null for an empty queue
65 |         return urlQueue.poll();
66 |     }
67 | 
68 |     /**
69 |      * test
70 |      */
71 |     public static void main(String[] args) {
72 |         UrlSeed urlSeed1 = new UrlSeed("123",5);
73 |         UrlSeed urlSeed2 = new UrlSeed("1234",6);
74 |         UrlSeed urlSeed3 = new UrlSeed("1234",4);
75 |         PriorityQueueScheduler priorityQueueScheduler = new PriorityQueueScheduler(800);
76 |         priorityQueueScheduler.push(urlSeed1);
77 |         priorityQueueScheduler.push(urlSeed2);
78 |         priorityQueueScheduler.push(urlSeed3);
79 |     }
80 | 
81 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/core/scheduler/impl/RedisScheduler.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.core.scheduler.impl;
 2 | 
 3 | import com.google.gson.Gson;
 4 | import com.jinshuai.core.scheduler.Scheduler;
 5 | import com.jinshuai.entity.UrlSeed;
 6 | import com.jinshuai.util.JedisUtils;
 7 | import lombok.extern.slf4j.Slf4j;
 8 | import redis.clients.jedis.Jedis;
 9 | 
10 | /**
11 |  * @author: JS
12 |  * @date: 2018/3/26
13 |  * @description:
14 |  *  将种子存放到Redis
15 |  */
16 | @Slf4j
17 | public class RedisScheduler implements Scheduler {
18 | 
19 |     /**
20 |      * 存放UrlSeed.url hash && 进行种子判重
21 |      * */
22 |     private final static String PREFIX_SET = "Spider.set";
23 | 
24 |     /**
25 |      * 根据种子的优先级先简单创建不同的几个队列
26 |      * */
27 |     private final static String PREFIX_QUEUE_HIGH = "Spider.queue.high";
28 |     private final static String PREFIX_QUEUE_LOW = "Spider.queue.low";
29 |     private final static String PREFIX_QUEUE_DEFAULT = "Spider.queue.default";
30 | 
31 |     /**
32 |      * @param urlSeed 种子
33 |      * @desciption:
34 |      *  配置 jedisPool
35 |      *  添加种子的URL到Set，种子序列话后的JSON文本到List
36 |      *  添加种子之前需要判断种子是否已经存在。
37 |      * */
38 |     public void push(UrlSeed urlSeed) {
39 |         try (Jedis jedis = JedisUtils.getSingleInstance().getJedis()) {
40 |             // 种子不存在
41 |             if (!jedis.sismember(PREFIX_SET, urlSeed.getUrlHash())) {
42 |                 // 添加种子Url对应的hash到判重Set
43 |                 jedis.sadd(PREFIX_SET, urlSeed.getUrlHash());
44 |                 // 添加种子序列化后的JSON文本到List
45 |                 Gson gson = new Gson();
46 |                 String urlSeedToJson = gson.toJson(urlSeed);
47 |                 long urlSeedPriority = urlSeed.getPriority();
48 |                 if (urlSeedPriority > 5) {
49 |                     jedis.lpush(PREFIX_QUEUE_HIGH, urlSeedToJson);
50 |                 } else if (urlSeedPriority == 5) {
51 |                     jedis.lpush(PREFIX_QUEUE_DEFAULT, urlSeedToJson);
52 |                 } else {
53 |                     jedis.lpush(PREFIX_QUEUE_LOW, urlSeedToJson);
54 |                 }
55 |             }
56 |         } catch (Exception e) {
57 |             log.error("JedisPushUrl[{}]出错",urlSeed.toString(),e);
58 |         }
59 |     }
60 | 
61 |     /**
62 |      * @return 从列表中获取的种子JSON反序列化为UrlSeed
63 |      * @description:
64 |      *  优先从高优先级别的列表里取种子
65 |      * */
66 |     public UrlSeed pop() {
67 |         Gson gson = new Gson();
68 |         String urlSeedToJson = null;
69 |         UrlSeed urlSeed = null;
70 |         try (Jedis jedis = JedisUtils.getSingleInstance().getJedis()) {
71 |             if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_HIGH)) != null) {
72 |                 urlSeed = gson.fromJson(urlSeedToJson,UrlSeed.class);
73 |             } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_DEFAULT)) != null) {
74 |                 urlSeed = gson.fromJson(urlSeedToJson,UrlSeed.class);
75 |             } else if ((urlSeedToJson = jedis.lpop(PREFIX_QUEUE_LOW)) != null) {
76 |                 urlSeed = gson.fromJson(urlSeedToJson,UrlSeed.class);
77 |             }
78 |             return urlSeed;
79 |         } catch (Exception e) {
80 |             log.error("JedisPopUrl [{}]出错", urlSeedToJson, e);
81 |         }
82 |         return gson.fromJson(urlSeedToJson,UrlSeed.class);
83 |     }
84 | 
85 |     /**
86 |      * test connection
87 |      * */
88 |     public static void main(String[] args) {
89 |         Jedis jedis = JedisUtils.getSingleInstance().getJedis();
90 |         System.out.println(jedis.ping());
91 |         UrlSeed urlSeed = new RedisScheduler().pop();
92 |         System.out.println(urlSeed);
93 |         jedis.lpush(PREFIX_QUEUE_LOW, "dasdasdasdsa");
94 |     }
95 | 
96 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/entity/Page.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.entity;
 2 | 
 3 | import org.jsoup.nodes.Document;
 4 | 
 5 | import java.util.Map;
 6 | import java.util.Set;
 7 | 
 8 | /**
 9 |  * @author: JS
10 |  * @date: 2018/3/26
11 |  * @description:
12 |  *  每一个UrlSeed对应的页面抽象为一个Page
13 |  */
14 | public class Page {
15 | 
16 |     /**
17 |      * Page对应的UrlSeed
18 |      * */
19 |     private UrlSeed urlSeed;
20 | 
21 |     /**
22 |      * Page对应的jsoup文档
23 |      * */
24 |     private Document document;
25 | 
26 |     /**
27 |      * Page包含的url
28 |      * */
29 |     private Set<UrlSeed> urlSeeds;
30 | 
31 |     /**
32 |      * Page所包含的有用信息
33 |      * */
34 |     private Map<String,String> items;
35 | 
36 |     public Page(UrlSeed urlSeed, Document document) {
37 |         this.urlSeed = urlSeed;
38 |         this.document = document;
39 |     }
40 | 
41 |     public UrlSeed getUrlSeed() {
42 |         return urlSeed;
43 |     }
44 | 
45 |     public Page setUrlSeed(UrlSeed urlSeed) {
46 |         this.urlSeed = urlSeed;
47 |         return this;
48 |     }
49 | 
50 |     public Document getDocument() {
51 |         return document;
52 |     }
53 | 
54 |     public Page setDocument(Document document) {
55 |         this.document = document;
56 |         return this;
57 |     }
58 | 
59 |     public Set<UrlSeed> getUrlSeeds() {
60 |         return urlSeeds;
61 |     }
62 | 
63 |     public Page setUrlSeeds(Set<UrlSeed> urlSeeds) {
64 |         this.urlSeeds = urlSeeds;
65 |         return this;
66 |     }
67 | 
68 |     public Map<String, String> getItems() {
69 |         return items;
70 |     }
71 | 
72 |     public Page setItems(Map<String, String> items) {
73 |         this.items = items;
74 |         return this;
75 |     }
76 | 
77 |     @Override
78 |     public String toString() {
79 |         return "Page{" +
80 |                 "urlSeed=" + urlSeed +
81 |                 ", urlSeeds=" + urlSeeds +
82 |                 ", items=" + items +
83 |                 '}';
84 |     }
85 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/entity/UrlSeed.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.entity;
 2 | 
 3 | import com.jinshuai.util.hash.MurmurHash;
 4 | import lombok.EqualsAndHashCode;
 5 | import lombok.ToString;
 6 | 
 7 | /**
 8 |  * @author: JS
 9 |  * @date: 2018/3/26
10 |  * @description:
11 |  *  A URL to crawl together with its priority; URLs below a certain priority need not be parsed.
12 |  */
13 | @ToString
14 | @EqualsAndHashCode
15 | public class UrlSeed {
16 | 
17 |     /**
18 |      * URL of this seed.
19 |      * */
20 |     private String url;
21 | 
22 |     /**
23 |      * Hash of the URL (64-bit MurmurHash as decimal string); always derived from {@link #url}.
24 |      * */
25 |     private String urlHash;
26 | 
27 |     /**
28 |      * Priority of the seed; 5 unless another value is supplied
29 |      * (the news parser derives it from the page date).
30 |      * */
31 |     private long priority = 5;
32 | 
33 |     public UrlSeed(String url, long priority) {
34 |         this(url);
35 |         this.priority = priority;
36 |     }
37 | 
38 |     public UrlSeed(String url) {
39 |         this.url = url;
40 |         this.urlHash = hashOf(url);
41 |     }
42 | 
43 |     public String getUrl() {
44 |         return url;
45 |     }
46 | 
47 |     /**
48 |      * Replaces the URL and recomputes its hash, so the hash used for
49 |      * duplicate detection never refers to the previous URL.
50 |      */
51 |     public UrlSeed setUrl(String url) {
52 |         this.url = url;
53 |         this.urlHash = url == null ? null : hashOf(url);
54 |         return this;
55 |     }
56 | 
57 |     public String getUrlHash() {
58 |         return urlHash;
59 |     }
60 | 
61 |     public long getPriority() {
62 |         return priority;
63 |     }
64 | 
65 |     public UrlSeed setPriority(long priority) {
66 |         this.priority = priority;
67 |         return this;
68 |     }
69 | 
70 |     /** Single place that defines how a URL is turned into its hash string. */
71 |     private static String hashOf(String url) {
72 |         return String.valueOf(MurmurHash.hash64(url));
73 |     }
74 | 
75 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/ExcelUtils.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util;
 2 | 
 3 | import lombok.extern.slf4j.Slf4j;
 4 | import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 5 | import org.apache.poi.ss.usermodel.*;
 6 | 
 7 | import java.io.File;
 8 | import java.io.IOException;
 9 | import java.util.ArrayList;
10 | import java.util.Iterator;
11 | import java.util.List;
12 | 
13 | /**
14 |  * @author: JS
15 |  * @date: 2018/11/9
16 |  * @description: reads the second column of the first sheet of an Excel workbook
17 |  */
18 | @Slf4j
19 | public class ExcelUtils implements OfficeUtils<String> {
20 | 
21 |     /** Workbook used when no path is given. */
22 |     private static final String DEFAULT_PATH = "F:/XXX.xls";
23 | 
24 |     private final String path;
25 | 
26 |     /** Reads from the default workbook path. */
27 |     public ExcelUtils() {
28 |         this(DEFAULT_PATH);
29 |     }
30 | 
31 |     /**
32 |      * @param path path of the workbook to read
33 |      */
34 |     public ExcelUtils(String path) {
35 |         this.path = path;
36 |     }
37 | 
38 |     /**
39 |      * Collects the text of column index 1 from every row of the first sheet.
40 |      * Rows without a cell in that column are skipped; non-text cells are
41 |      * rendered the way Excel displays them.
42 |      *
43 |      * @return the cell texts in row order; what was read so far (possibly empty)
44 |      *         when the file cannot be opened or read
45 |      */
46 |     @Override
47 |     public List<String> read() {
48 |         List<String> result = new ArrayList<>();
49 |         // The workbook stays open while it is read and is closed when the block is left.
50 |         try (Workbook wb = WorkbookFactory.create(new File(path))) {
51 |             DataFormatter formatter = new DataFormatter();
52 |             for (Row row : wb.getSheetAt(0)) {
53 |                 Cell cell = row.getCell(1);
54 |                 if (cell != null) {
55 |                     result.add(formatter.formatCellValue(cell));
56 |                 }
57 |             }
58 |         } catch (IOException | InvalidFormatException e ) {
59 |             log.error("读取excel出错，检查路径是否正确、行列号是否越界",e);
60 |         }
61 |         return result;
62 |     }
63 | 
64 |     /** Not supported; intentionally does nothing. */
65 |     @Override
66 |     @Deprecated
67 |     public void write() {
68 |     }
69 | 
70 |     public static void main(String[] args) {
71 |         List<String> list = new ExcelUtils().read();
72 |         list.forEach(System.out::println);
73 |     }
74 | 
75 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/JedisUtils.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.util;
  2 | 
  3 | import lombok.extern.slf4j.Slf4j;
  4 | import redis.clients.jedis.Jedis;
  5 | import redis.clients.jedis.JedisPool;
  6 | import redis.clients.jedis.JedisPoolConfig;
  7 | 
  8 | import java.util.Map;
  9 | import java.util.concurrent.ConcurrentHashMap;
 10 | 
 11 | /**
 12 |  * @author: JS
 13 |  * @date: 2018/3/27
 14 |  * @description:
 15 |  *  Thin wrapper around a Jedis connection pool.
 16 |  */
 17 | @Slf4j
 18 | public class JedisUtils {
 19 | 
 20 |     /**
 21 |      * Redis host, port and password, read from the application properties.
 22 |      * */
 23 |     private static final String IP = PropertiesUtils.getInstance().get("redis-ip");
 24 |     private static final int PORT = Integer.parseInt(PropertiesUtils.getInstance().get("redis-port"));
 25 |     private static final String PASSWORD = PropertiesUtils.getInstance().get("redis-password");
 26 | 
 27 |     /**
 28 |      * Maximum number of connections the pool may hand out (pool default: 8, -1 means unlimited).
 29 |      * Once that many instances are in use the pool is exhausted.
 30 |      */
 31 |     private static final int MAX_ACTIVE = 2048;
 32 | 
 33 |     /**
 34 |      * Maximum number of idle connections kept in the pool (pool default: 8).
 35 |      */
 36 |     private static final int MAX_IDLE = 200;
 37 | 
 38 |     /**
 39 |      * Maximum time in milliseconds to wait for a free connection (pool default: -1, wait forever).
 40 |      * When it elapses the pool throws a JedisConnectionException.
 41 |      * */
 42 |     private static final int MAX_WAIT = 10000;
 43 | 
 44 |     /**
 45 |      * Connection timeout in milliseconds.
 46 |      * */
 47 |     private static final int TIMEOUT = 10000;
 48 | 
 49 |     /**
 50 |      * Pools that were already created, keyed by "ip:port".
 51 |      * */
 52 |     private static final Map<String,JedisPool> maps = new ConcurrentHashMap<>();
 53 | 
 54 |     /**
 55 |      * The shared JedisUtils instance.
 56 |      * */
 57 |     private static volatile JedisUtils jedisUtils;
 58 | 
 59 |     private JedisPool jedisPool;
 60 | 
 61 |     private JedisUtils() {
 62 |         init();
 63 |     }
 64 | 
 65 |     /**
 66 |      * Returns the shared instance, creating it on first use.
 67 |      * The instance is checked again inside the lock so that threads racing
 68 |      * through the first check cannot each create their own instance.
 69 |      * */
 70 |     public static JedisUtils getSingleInstance() {
 71 |         JedisUtils instance = jedisUtils;
 72 |         if (instance == null) {
 73 |             synchronized (JedisUtils.class) {
 74 |                 instance = jedisUtils;
 75 |                 if (instance == null) {
 76 |                     instance = new JedisUtils();
 77 |                     jedisUtils = instance;
 78 |                 }
 79 |             }
 80 |         }
 81 |         return instance;
 82 |     }
 83 | 
 84 |     private void init() {
 85 |         configJedisPool();
 86 |     }
 87 | 
 88 |     /** Reuses the pool registered for the configured endpoint or creates and registers it. */
 89 |     private void configJedisPool() {
 90 |         jedisPool = maps.computeIfAbsent(IP + ":" + PORT, endpoint -> createPool());
 91 |     }
 92 | 
 93 |     /** Builds a pool for the configured endpoint, with or without password authentication. */
 94 |     private static JedisPool createPool() {
 95 |         JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
 96 |         jedisPoolConfig.setMaxTotal(MAX_ACTIVE);
 97 |         jedisPoolConfig.setMaxIdle(MAX_IDLE);
 98 |         jedisPoolConfig.setMaxWaitMillis(MAX_WAIT);
 99 |         jedisPoolConfig.setTestOnReturn(true);
100 |         // no password configured
101 |         if (PASSWORD == null || PASSWORD.length() == 0) {
102 |             log.info("配置文件中未设置Redis密码，请确保Redis服务器不需要密码验证!!!");
103 |             return new JedisPool(jedisPoolConfig, IP, PORT, TIMEOUT);
104 |         }
105 |         return new JedisPool(jedisPoolConfig, IP, PORT, TIMEOUT, PASSWORD);
106 |     }
107 | 
108 |     /**
109 |      * Borrows a connection from the pool.
110 |      *
111 |      * @return a Jedis connection, or null when none could be obtained (the failure is logged)
112 |      * */
113 |     public Jedis getJedis() {
114 |         try {
115 |             return jedisPool.getResource();
116 |         } catch (Exception e) {
117 |             log.error("连接Redis失败,检查IP、端口、密码", e);
118 |             return null;
119 |         }
120 |     }
121 | 
122 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/OfficeUtils.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | public interface OfficeUtils<T> { // read/write access to an office document; T is the element type returned by read()
 6 |     /** Reads the document and returns its entries as a list. */
 7 |     List<T> read();
 8 |     /** Writes to the document; what is written is up to the implementation (ExcelUtils leaves it empty). */
 9 |     void write();
10 | 
11 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/PropertiesUtils.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util;
 2 | 
 3 | 
 4 | import lombok.extern.slf4j.Slf4j;
 5 | 
 6 | import java.io.IOException;
 7 | import java.io.InputStream;
 8 | import java.util.Map;
 9 | import java.util.Properties;
10 | import java.util.concurrent.ConcurrentHashMap;
11 | 
12 | /**
13 |  * @author: JS
14 |  * @date: 2018/5/4
15 |  * @description:
16 |  *  读取配置文件工具类
17 |  */
18 | @Slf4j
19 | public class PropertiesUtils {
20 | 
21 |     private Map<String, String> cache = new ConcurrentHashMap<>();
22 | 
23 |     private static volatile PropertiesUtils propertiesUtils;
24 | 
25 |     public static PropertiesUtils getInstance() {
26 |         if (propertiesUtils == null) {
27 |             synchronized (PropertiesUtils.class) {
28 |                 if (propertiesUtils == null) {
29 |                     propertiesUtils = new PropertiesUtils();
30 |                 }
31 |             }
32 |         }
33 |         return propertiesUtils;
34 |     }
35 | 
36 |     private PropertiesUtils(){}
37 | 
38 |     public String get(String key) {
39 |         if (key == null)
40 |             return null;
41 |         if (cache.get(key) != null) {
42 |             return cache.get(key);
43 |         }
44 |         Properties properties = new Properties();
45 |         InputStream inputStream = PropertiesUtils.class.getResourceAsStream("/application.properties");
46 |         try {
47 |             properties.load(inputStream);
48 |         } catch (IOException e) {
49 |             log.error("加载配置文件[application.properties]失败",e);
50 |         }
51 |         String value = properties.getProperty(key);
52 |         cache.put(key,value);
53 |         return value;
54 |     }
55 | 
56 |     public static void main(String[] args) throws IOException {
57 | //        Properties properties = new Properties();
58 | //        InputStream inputStream = PropertiesUtils.class.getResourceAsStream("/application.properties");
59 | //        properties.load(inputStream);
60 | //        System.out.println(properties.getProperty("ip"));
61 | //        System.out.println(properties.getProperty("ip"));
62 |         System.out.println(PropertiesUtils.getInstance().get("dir"));
63 |         System.out.println(PropertiesUtils.getInstance().get("dir"));
64 |     }
65 | 
66 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/hash/MurmurHash.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util.hash;
 2 | 
 3 | import java.math.BigInteger;
 4 | import java.nio.ByteBuffer;
 5 | import java.nio.ByteOrder;
 6 | 
 7 | /**
 8 |  * @author: JS
 9 |  * @date: 2019/5/25
10 |  * @description: 生成64位Hash 代码：https://www.cnkirito.moe/consistent-hash-lb/
11 |  */
12 | public class MurmurHash {
13 | 
14 |     public static BigInteger hash64(String word) {
15 |         ByteBuffer buf = ByteBuffer.wrap(word.getBytes());
16 |         int seed = 0x1234ABCD;
17 | 
18 |         ByteOrder byteOrder = buf.order();
19 |         buf.order(ByteOrder.LITTLE_ENDIAN);
20 | 
21 |         long m = 0xc6a4a7935bd1e995L;
22 |         int r = 47;
23 | 
24 |         long h = seed ^ (buf.remaining() * m);
25 | 
26 |         long k;
27 |         while (buf.remaining() >= 8) {
28 |             k = buf.getLong();
29 | 
30 |             k *= m;
31 |             k ^= k >>> r;
32 |             k *= m;
33 | 
34 |             h ^= k;
35 |             h *= m;
36 |         }
37 | 
38 |         if (buf.remaining() > 0) {
39 |             ByteBuffer finish = ByteBuffer.allocate(8).order(
40 |                     ByteOrder.LITTLE_ENDIAN);
41 |             // for big-endian version, do this first:
42 |             // finish.position(8-buf.remaining());
43 |             finish.put(buf).rewind();
44 |             h ^= finish.getLong();
45 |             h *= m;
46 |         }
47 |         h ^= h >>> r;
48 |         h *= m;
49 |         h ^= h >>> r;
50 | 
51 |         buf.order(byteOrder);
52 |         return new BigInteger(String.valueOf(h & 0xffffffffL));
53 |     }
54 | 
55 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/hash/PageUtils.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.util.hash;
  2 | 
  3 | import lombok.extern.slf4j.Slf4j;
  4 | import org.ansj.app.keyword.KeyWordComputer;
  5 | import org.ansj.app.keyword.Keyword;
  6 | 
  7 | import java.io.File;
  8 | import java.io.IOException;
  9 | import java.math.BigInteger;
 10 | import java.nio.file.Files;
 11 | import java.util.*;
 12 | import java.util.concurrent.ConcurrentHashMap;
 13 | 
 14 | /**
 15 |  * @author: JS
 16 |  * @date: 2019/5/25
 17 |  * @description: SimHash进行doc相似度检测
 18 |  */
 19 | @Slf4j
 20 | public class PageUtils {
 21 | 
 22 |     private volatile static PageUtils instance;
 23 | 
 24 |     private Map<String, Set<BigInteger>> invertedIndex = new ConcurrentHashMap<>();
 25 | 
 26 |     /**
 27 |      * 测试：跟踪文本
 28 |      * */
 29 |     private Map<BigInteger, String> fingerContent = new ConcurrentHashMap<>();
 30 | 
 31 |     public static PageUtils getInstance() {
 32 |         if (instance == null) {
 33 |             synchronized (PageUtils.class) {
 34 |                 if (instance == null) {
 35 |                     instance = new PageUtils();
 36 |                 }
 37 |             }
 38 |         }
 39 |         return instance;
 40 |     }
 41 | 
 42 |     private PageUtils(){}
 43 | 
 44 |     private static final int BIT_SIZE = 64;
 45 | 
 46 |     private static final int TABLE_SIZE = 16;
 47 | 
 48 |     private static final int HAMMING_DISTANCE = 3;
 49 | 
 50 |     private ThreadLocal<String> simhashStrContainer = new ThreadLocal<>();
 51 | 
 52 |     public boolean exist(String title, String content) {
 53 |         boolean exist = false;
 54 |         BigInteger fingerprint = getSimHash(title, content);
 55 | //        fingerContent.put(fingerprint, title + "====" + content);
 56 |         String hashStr = simhashStrContainer.get();
 57 |         // 防止分词错误NPE
 58 |         if (hashStr.length() == BIT_SIZE) {
 59 |             // 获取每一个table对应的所有候选结果
 60 |             for (int start = 0; start < BIT_SIZE; start += TABLE_SIZE) {
 61 |                 String table = hashStr.substring(start, start + TABLE_SIZE);
 62 |                 Set<BigInteger> fingerprints = invertedIndex.get(table);
 63 |                 if (fingerprints != null && fingerprints.size() > 0) {
 64 |                     for (BigInteger fingerprintRes : fingerprints) {
 65 |                         // 海明距离
 66 |                         int hammingDistance = fingerprintRes.xor(fingerprint).bitCount();
 67 |                         if (hammingDistance <= HAMMING_DISTANCE) {
 68 | //                            log.error("标题 [{}] \r\n 内容[{}] \r\n 与 标题内容[{}] 相似\r\n 汉明距离:[{}]", title, content, fingerContent.get(fingerprintRes), hammingDistance);
 69 |                             exist = true;
 70 |                             break;
 71 |                         }
 72 |                     }
 73 |                 }
 74 |                 if (exist)
 75 |                     break;
 76 |             }
 77 |             // 构建倒排索引(16 * 4)
 78 |             constructInvertedIndex(fingerprint);
 79 |         }
 80 |         return exist;
 81 |     }
 82 | 
 83 |     private BigInteger getSimHash(String title, String content) {
 84 |         double[] featureVector = new double[BIT_SIZE];
 85 |         // 1. 分词,计算权重
 86 |         Collection<Keyword> result = getParticiple(title, content);
 87 |         // 2. hash
 88 |         // 3. 加权
 89 |         // 4. 合并
 90 |         featureVector = weightingAndCombine(featureVector, result);
 91 |         // 5. 降维
 92 |         // 6. SimHash 指纹
 93 |         return decreaseDimensionAndGetFingerprint(featureVector);
 94 |     }
 95 | 
 96 |     /**
 97 |      * 分词
 98 |      * */
 99 |     private Collection<Keyword> getParticiple(String title, String content) {
100 |         int keyNumber = content.length() / 2 < 5 ? 5 : content.length(); // TODO
101 |         KeyWordComputer kwc = new KeyWordComputer(keyNumber);
102 |         return kwc.computeArticleTfidf(title, content);
103 |     }
104 | 
105 |     /**
106 |      * 哈希 加权 合并
107 |      * */
108 |     private double[] weightingAndCombine(double[] featureVector, Collection<Keyword> result) {
109 |         for (Keyword keyword : result) {
110 |             String keyStr = keyword.getName();
111 |             BigInteger keyHash = MurmurHash.hash64(keyStr);
112 |             for (int i = 0; i < BIT_SIZE; i++) {
113 |                 final BigInteger bitMask = BigInteger.ONE.shiftLeft(BIT_SIZE - 1 - i);
114 |                 // 3. 加权
115 |                 // 4. 合并
116 |                 if (keyHash.and(bitMask).signum() != 0) {
117 |                     featureVector[i] += keyword.getScore();
118 |                 } else {
119 |                     featureVector[i] -= keyword.getScore();
120 |                 }
121 |             }
122 |         }
123 |         return featureVector;
124 |     }
125 | 
126 |     /**
127 |      * 降维 获取指纹
128 |      * */
129 |     private BigInteger decreaseDimensionAndGetFingerprint(double[] featureVector) {
130 |         BigInteger fingerprint = BigInteger.ZERO;
131 |         StringBuilder simHashBuilder = new StringBuilder();
132 |         for (int i = 0; i < BIT_SIZE; i++) {
133 |             BigInteger bitMask = BigInteger.ONE.shiftLeft(BIT_SIZE - 1 - i);
134 |             if (featureVector[i] > 0) {
135 |                 fingerprint = fingerprint.or(fingerprint.xor(bitMask));
136 |                 simHashBuilder.append(1);
137 |             } else {
138 |                 simHashBuilder.append(0);
139 |             }
140 |         }
141 |         simhashStrContainer.set(simHashBuilder.toString());
142 |         return fingerprint;
143 |     }
144 | 
145 |     /**
146 |      * 构建倒排索引
147 |      * < table, {simhash1, simhash2 simhash3...} >
148 |      *
149 |      * */
150 |     private void constructInvertedIndex(BigInteger fingerprint) {
151 |         String hashStr = simhashStrContainer.get();
152 |         for (int start = 0; start < BIT_SIZE; start += TABLE_SIZE) {
153 |             String table = hashStr.substring(start, start + TABLE_SIZE);
154 |             Set<BigInteger> docs = invertedIndex.get(table);
155 |             if (docs == null) {
156 |                 docs = new HashSet<>();
157 |             }
158 |             docs.add(fingerprint);
159 |             invertedIndex.put(table, docs);
160 |         }
161 |     }
162 | 
163 |     public static void main(String[] args) throws IOException {
164 |         PageUtils pageUtil = PageUtils.getInstance();
165 |         StringBuilder sb1 = new StringBuilder();
166 |         File file1 = new File("D:/Data/1.txt");
167 |         Files.readAllLines(file1.toPath()).forEach(line ->{
168 |             sb1.append(line);
169 |         });
170 | 
171 |         StringBuilder sb2 = new StringBuilder();
172 |         File file2 = new File("D:/Data/2.txt");
173 |         Files.readAllLines(file2.toPath()).forEach(line -> {
174 |             sb2.append(line);
175 |         });
176 | 
177 |         pageUtil.exist("学校党委理论学习中心组召开扩大会议",sb1.toString());
178 |         pageUtil.exist("校党委理论学习中心组召开专题会议学习传达全国“两会”精神", sb2.toString());
179 | 
180 | 
181 | //        BigInteger simhash1 = pageUtil.getSimHash("","我来自河北省，你们可以叫我金帅");
182 | //        BigInteger simhash2 = pageUtil.getSimHash("","我来自河北省，我是金帅");
183 | //        System.out.println(simhash1.xor(simhash2).bitCount());
184 | //        System.out.println(simhash1.xor(simhash2).toString(2));
185 | //        System.out.println("=========十进制========");
186 | //        System.out.println(simhash1.toString());
187 | //        System.out.println(simhash2.toString());
188 | //        System.out.println("=========二进制========");
189 | //        System.out.println(simhash1.toString(2));
190 | //        System.out.println(simhash2.toString(2));
191 | 
192 |     }
193 | 
194 | }
195 | 


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/http/HttpUtils.java:
--------------------------------------------------------------------------------
  1 | package com.jinshuai.util.http;
  2 | 
  3 | import com.jinshuai.util.PropertiesUtils;
  4 | import lombok.extern.slf4j.Slf4j;
  5 | import org.apache.http.Header;
  6 | import org.apache.http.HttpEntity;
  7 | import org.apache.http.HttpResponse;
  8 | import org.apache.http.client.config.RequestConfig;
  9 | import org.apache.http.client.methods.HttpGet;
 10 | import org.apache.http.config.Registry;
 11 | import org.apache.http.config.RegistryBuilder;
 12 | import org.apache.http.config.SocketConfig;
 13 | import org.apache.http.conn.socket.ConnectionSocketFactory;
 14 | import org.apache.http.conn.socket.PlainConnectionSocketFactory;
 15 | import org.apache.http.conn.ssl.NoopHostnameVerifier;
 16 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
 17 | import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
 18 | import org.apache.http.entity.ContentType;
 19 | import org.apache.http.impl.client.CloseableHttpClient;
 20 | import org.apache.http.impl.client.HttpClients;
 21 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
 22 | import org.apache.http.ssl.SSLContexts;
 23 | import org.apache.http.util.ByteArrayBuffer;
 24 | import org.apache.rocketmq.client.exception.MQClientException;
 25 | import org.apache.rocketmq.client.producer.DefaultMQProducer;
 26 | import org.apache.rocketmq.client.producer.SendCallback;
 27 | import org.apache.rocketmq.client.producer.SendResult;
 28 | import org.apache.rocketmq.common.message.Message;
 29 | import org.apache.rocketmq.remoting.common.RemotingHelper;
 30 | import org.apache.rocketmq.remoting.exception.RemotingException;
 31 | 
 32 | import javax.net.ssl.HostnameVerifier;
 33 | import javax.net.ssl.SSLContext;
 34 | import javax.net.ssl.SSLPeerUnverifiedException;
 35 | import javax.print.attribute.HashAttributeSet;
 36 | import java.io.IOException;
 37 | import java.io.InputStream;
 38 | import java.io.UnsupportedEncodingException;
 39 | import java.net.MalformedURLException;
 40 | import java.net.URI;
 41 | import java.net.URISyntaxException;
 42 | import java.net.URL;
 43 | import java.nio.charset.Charset;
 44 | import java.util.HashMap;
 45 | import java.util.Map;
 46 | import java.util.Random;
 47 | import java.util.concurrent.TimeUnit;
 48 | import java.util.regex.Matcher;
 49 | import java.util.regex.Pattern;
 50 | 
 51 | /**
 52 |  * @author: JS
 53 |  * @date: 2018/3/22
 54 |  * @description:
 55 |  *  创建单例HttpUtils，获取HttpClient实例执行HTTP请求根据状态码解析响应体。
 56 |  */
 57 | @Slf4j
 58 | public class HttpUtils {
 59 | 
 60 |     private static final ThreadLocal<HttpGet> httpGetContainer = new ThreadLocal<>();
 61 | 
 62 |     private static final ThreadLocal<HttpEntity> httpEntityContainer = new ThreadLocal<>();
 63 | 
 64 |     private static volatile HttpUtils HTTPUTILS;
 65 | 
 66 |     private PoolingHttpClientConnectionManager httpClientConnectionManager;
 67 | 
 68 |     private CloseableHttpClient httpClient;
 69 | 
 70 |     private static final int MAX_TOTAL_CONNECTIONS = 20;
 71 |     private static final int SOCKET_TIMEOUT = 5000;
 72 |     private static final int MAX_CONNECTIONS_PER_ROUTE = 200;
 73 |     private static final int CONNECTION_REQUEST_TIMEOUT = 5000;
 74 |     private static final int CONNECT_TIMEOUT = 5000;
 75 | 
 76 |     private static DefaultMQProducer producer;
 77 |     /**
 78 |      * 消息队列开关
 79 |      * 1-打开
 80 |      * 0-关闭
 81 |      * */
 82 |     private static String mqSwitch;
 83 | 
 84 |     private static final String CHARSET = RemotingHelper.DEFAULT_CHARSET;
 85 | 
 86 | 
 87 |     /**
 88 |      * 获取HttpUtils单例
 89 |      * */
 90 |     public static HttpUtils getSingleInstance() {
 91 |         if (HTTPUTILS == null) {
 92 |             synchronized (HttpUtils.class) {
 93 |                 if (HTTPUTILS == null) {
 94 |                     HTTPUTILS = new HttpUtils();
 95 |                 }
 96 |             }
 97 |         }
 98 |         return HTTPUTILS;
 99 |     }
100 | 
101 |     private HttpUtils() {
102 |         init();
103 |     }
104 | 
105 |     private void init() {
106 |         configHttpPool();
107 |         configHttpClient();
108 |         configMQ();
109 |     }
110 | 
111 |     /**
112 |      * 配置消息队列
113 |      * */
114 |     private void configMQ() {
115 |         mqSwitch = PropertiesUtils.getInstance().get("mq-switch");
116 |         // the switch of MQ is closed
117 |         if (mqSwitch == null || "0".equals(mqSwitch)) {
118 |             return;
119 |         }
120 |         producer = new DefaultMQProducer("Producer-Group");
121 |         String ip = PropertiesUtils.getInstance().get("mq-ip");
122 |         String port = PropertiesUtils.getInstance().get("mq-port");
123 |         producer.setNamesrvAddr(ip + ":" + port);
124 |         try {
125 |             producer.start();
126 |             log.info("mq-producer started");
127 |         } catch (MQClientException e) {
128 |             log.error("failed to start producer", e);
129 |         }
130 |         producer.setRetryTimesWhenSendAsyncFailed(0);
131 |     }
132 | 
133 |     /**
134 |      * 配置HTTP连接池
135 |      *
136 |      * */
137 |     private void configHttpPool() {
138 |         try {
139 |             // 配置SSL
140 |             SSLContext sslcontext = SSLContexts.custom()
141 |                     .loadTrustMaterial(null, new TrustSelfSignedStrategy())
142 |                     .build();
143 | 
144 |             HostnameVerifier hostnameVerifier = SSLConnectionSocketFactory.getDefaultHostnameVerifier();
145 |             // 关闭域名证书验证
146 | //            HostnameVerifier hostnameVerifier = NoopHostnameVerifier.INSTANCE;
147 | 
148 |             SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(
149 |                     sslcontext, hostnameVerifier);
150 | 
151 |             Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
152 |                     .register("http", PlainConnectionSocketFactory.getSocketFactory())
153 |                     .register("https", sslsf)
154 |                     .build();
155 | 
156 |             // 将SSL集成到HttpConnectionManager
157 |             httpClientConnectionManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
158 |             // 设置HTTP连接池最大连接数
159 |             httpClientConnectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
160 |             // 每个路由最大的连接数
161 |             httpClientConnectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);
162 |             // 设置socket超时时间
163 |             SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(SOCKET_TIMEOUT).build();
164 |             httpClientConnectionManager.setDefaultSocketConfig(socketConfig);
165 |         } catch (Exception e) {
166 |             log.error("SSL配置出错",e);
167 |         }
168 |     }
169 | 
170 |     /**
171 |      * 配置HttpClient
172 |      *
173 |      * */
174 |     private void configHttpClient() {
175 |         // 请求配置
176 |         RequestConfig requestConfig = RequestConfig.custom()
177 |                 .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT)
178 |                 .setConnectTimeout(CONNECT_TIMEOUT)
179 |                 .build();
180 |         // 将配置信息应用到HttpClient
181 |         if (httpClientConnectionManager == null) {
182 |             log.error("httpClientConnectionManager未被初始化");
183 |             return;
184 |         }
185 |         httpClient = HttpClients.custom()
186 |                 .setDefaultRequestConfig(requestConfig)
187 |                 .setConnectionManager(httpClientConnectionManager)
188 |                 .build();
189 |     }
190 | 
191 |     /**
192 |      * 配置HttpGet
193 |      *
194 |      * */
195 |     private HttpGet getHttpGet(String urlStr) {
196 |         URL url;
197 |         URI uri = null;
198 |         try {
199 |             url = new URL(urlStr);
200 |             uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null);
201 |         } catch (MalformedURLException | URISyntaxException e) {
202 |             log.error("字符串格式不正确[{}]",urlStr,e);
203 |         }
204 |         HttpGet httpGet = new HttpGet(uri);
205 |         // 添加请求头header
206 |         httpGet.addHeader("Accept", "*/*");
207 |         httpGet.addHeader("Accept-Encoding", "gzip, deflate");
208 |         httpGet.addHeader("Connection", "keep-alive");
209 |         int randomUserAgent = new Random().nextInt(UserAgentArray.USER_AGENT.length);
210 |         httpGet.addHeader("User-Agent",UserAgentArray.USER_AGENT[randomUserAgent]);
211 | 
212 |         return httpGet;
213 |     }
214 | 
215 |     /**
216 |      * 发Get请求
217 |      *
218 |      * */
219 |     private void sendRequest(String urlStr) {
220 |         HttpGet httpGet = httpGetContainer.get();
221 |         try {
222 |             HttpResponse response = httpClient.execute(httpGet);
223 |             // 根据状态码执行不同的操作
224 |             int statusCode = response.getStatusLine().getStatusCode();
225 |             StatusHandler strategy = StatusContext.getStrategy(statusCode);
226 |             strategy.process(urlStr, response);
227 |         } catch (IOException e) {
228 |             log.error("IO出错[{}]", urlStr, e);
229 |         }
230 |     }
231 | 
232 |     /**
233 |      * 获取 HttpEntity
234 |      *
235 |      * */
236 |     public String getContent(String urlStr) {
237 |         // url为空或者不是http协议
238 |         if (urlStr == null || !urlStr.startsWith("http")) {
239 |             return null;
240 |         }
241 |         // 防止SSL过程中的握手警报 http://dovov.com/ssljava-1-7-0unrecognized_name.html
242 |         if (urlStr.startsWith("https")) {
243 |             System.setProperty("jsse.enableSNIExtension", "false");
244 |         }
245 |         String content = null;
246 |         try {
247 |             httpGetContainer.set(getHttpGet(urlStr));
248 |             sendRequest(urlStr);
249 |             HttpEntity httpEntity = httpEntityContainer.get();
250 |             if (httpEntity == null) {
251 |                 log.error("HttpEntity为空");
252 |                 return null;
253 |             }
254 |             InputStream inputStream = httpEntity.getContent();
255 |             content = parseStream(inputStream, httpEntity);
256 |         } catch (IOException e) {
257 |             log.error("获取响应流失败", e);
258 |         } catch (Exception e) {
259 |             log.error("获取内容异常", e);
260 |         } finally {
261 |             httpGetContainer.get().releaseConnection();
262 |             httpGetContainer.remove();
263 |         }
264 |         return content;
265 |     }
266 | 
267 |     /**
268 |      * 解析响应流
269 |      *
270 |      * */
271 |     private String parseStream(InputStream inputStream, HttpEntity httpEntity) {
272 |         String pageContent = null;
273 |         // 获取页面编码：1. 从响应头content-type 2. 如果没有则从返回的HTML中获取Meta标签里的编码
274 |         ByteArrayBuffer byteArrayBuffer = new ByteArrayBuffer(4096);
275 |         byte[] tempStore = new byte[4096];
276 |         int count;
277 |         try {
278 |             // read(tempStore) 会重新从零开始存->刷新字节数组 ,并返回读到的字节数量
279 |             while ((count = inputStream.read(tempStore)) != -1) {
280 |                 byteArrayBuffer.append(tempStore, 0, count);
281 |             }
282 |             // TODO:下面复制粘贴的：https://github.com/xjtushilei/ScriptSpider
283 |             // 根据获取的字节编码转为String类型
284 |             String charset = "UTF-8";
285 |             ContentType contentType = ContentType.getOrDefault(httpEntity);
286 |             Charset charsets = contentType.getCharset();
287 |             pageContent = new String(byteArrayBuffer.toByteArray());
288 |             // 如果响应头中含有content-type字段，直接读取然后设置编码即可。
289 |             if (null != charsets) {
290 |                 charset = charsets.toString();
291 |             } else {
292 |                 // 发现HttpClient带的功能有问题，这里自己又写了一下。
293 |                 Pattern pattern = Pattern.compile("<head>([\\s\\S]*?)<meta([\\s\\S]*?)charset\\s*=(\")?(.*?)\"");
294 |                 Matcher matcher = pattern.matcher(pageContent.toLowerCase());
295 |                 if (matcher.find()) {
296 |                     charset = matcher.group(4);
297 |                 }
298 |             }
299 |             pageContent = new String(byteArrayBuffer.toByteArray(),charset);
300 |         } catch (IOException e) {
301 |             log.error("处理流失败", e);
302 |         }
303 |         return pageContent;
304 |     }
305 | 
306 |     /**
307 |      * 发送消息
308 |      * */
309 |     private static void sendMessage(String url, String topic) {
310 |         try {
311 |             Message msg = new Message(topic, url.getBytes(CHARSET));
312 |             producer.send(msg, new SendCallback() {
313 |                 @Override
314 |                 public void onSuccess(SendResult sendResult) {
315 |                     log.info("msg send success [{}]", sendResult);
316 |                 }
317 | 
318 |                 @Override
319 |                 public void onException(Throwable e) {
320 |                     log.error("msg send fail: [{}]", url, e);
321 |                 }
322 |             });
323 |         } catch (UnsupportedEncodingException e) {
324 |             log.error("MQ wrong encoding:[{}]", CHARSET, e);
325 |         } catch (InterruptedException | RemotingException | MQClientException e) {
326 |             log.error("Client Or Server Wrong", e);
327 |         }
328 |     }
329 | 
330 |     /**
331 |      * 策略上下文
332 |      * */
333 |     public static class StatusContext {
334 | 
335 |         private static final Map<Integer, StatusHandler> status2Handler = new HashMap<>();
336 | 
337 |         static {
338 |             status2Handler.put(2, SuccessStrategy.getInstance());
339 |             status2Handler.put(3, RedirectStrategy.getInstance());
340 |             status2Handler.put(4, ClientErrorStrategy.getInstance());
341 |             status2Handler.put(5, ServerErrorStrategy.getInstance());
342 |         }
343 | 
344 |         static StatusHandler getStrategy(int statusCode) {
345 |             return status2Handler.get(statusCode / 100);
346 |         }
347 | 
348 |     }
349 | 
350 |     /**
351 |      * 2XX 策略
352 |      * 成功获取响应时对应的执行策略
353 |      *
354 |      * */
355 |     public static class SuccessStrategy implements StatusHandler {
356 | 
357 |         private static final StatusHandler statusHandler = new SuccessStrategy();
358 | 
359 |         static StatusHandler getInstance() {
360 |             return statusHandler;
361 |         }
362 | 
363 |         @Override
364 |         public void process(String url, HttpResponse response) {
365 |             httpEntityContainer.set(response.getEntity());
366 |         }
367 | 
368 |     }
369 | 
370 |     /**
371 |      * 3XX 策略
372 |      * 重定向时对应的执行策略
373 |      *
374 |      * */
375 |     public static class RedirectStrategy implements StatusHandler {
376 | 
377 |         private static final StatusHandler statusHandler = new RedirectStrategy();
378 | 
379 |         static StatusHandler getInstance() {
380 |             return statusHandler;
381 |         }
382 | 
383 |         @Override
384 |         public void process(String url, HttpResponse response) {
385 |             Header location = response.getFirstHeader("Location");
386 |             // 将location对应的URL放到仓库中
387 | //            scheduler.push(new UrlSeed(location.getValue(), 5));
388 |             log.error("301: 资源已被重定向[{}]", url);
389 |             sendMessage(location.getValue(), "Redirect-Topic");
390 |         }
391 | 
392 |     }
393 | 
394 |     /**
395 |      * 4XX 策略
396 |      * 主要处理需要认证的资源401，需要授权的资源403，以及不存在的资源404
397 |      * 当请求次数过多以后，就容易报403
398 |      * 当 401，403时，将资源放到低优先级的队列或者消息队列中，额外处理。 TODO
399 |      * */
400 |     public static class ClientErrorStrategy implements StatusHandler {
401 | 
402 |         private static final StatusHandler statusHandler = new ClientErrorStrategy();
403 | 
404 |         static StatusHandler getInstance() {
405 |             return statusHandler;
406 |         }
407 | 
408 |         @Override
409 |         public void process(String url, HttpResponse response) {
410 |             int status = response.getStatusLine().getStatusCode();
411 |             if (status == 401 || status == 403) {
412 |                 log.warn("401: 无权访问此资源[{}]", url);
413 |                 // send to mq
414 |                 sendMessage(url, "Forbidden-Topic");
415 |             } else if (status == 404) {
416 |                 log.warn("404: 请求的资源不存在[{}]", url);
417 |             }
418 | 
419 |         }
420 | 
421 |     }
422 | 
423 |     /**
424 |      * 5XX 策略
425 |      * 远端服务器出错，应对办法是暂时停止爬虫 TODO
426 |      * */
427 |     public static class ServerErrorStrategy implements StatusHandler {
428 | 
429 |         private static final StatusHandler statusHandler = new ServerErrorStrategy();
430 | 
431 |         static StatusHandler getInstance() {
432 |             return statusHandler;
433 |         }
434 | 
435 |         @Override
436 |         public void process(String url, HttpResponse response) {
437 |             log.error("500: 远端服务器出错[{}]", url);
438 |             Header retryAfter = response.getFirstHeader("Retry-After");
439 |             long waitSeconds = 20;
440 |             if (retryAfter != null) {
441 |                 waitSeconds = Long.parseLong(retryAfter.getValue());
442 |             }
443 |             log.info("由于远程服务器出错，爬虫休息 [{}] 秒后，尝试继续执行任务.....", waitSeconds);
444 |             try {
445 |                 TimeUnit.SECONDS.sleep(waitSeconds);
446 |             } catch (InterruptedException e) {
447 |                 log.error("sleep error", e);
448 |             }
449 |             sendMessage(url, "ServerWrong-Topic");
450 |         }
451 | 
452 |     }
453 | 
454 |     /**
455 |      * Test HttpUtils
456 |      *
457 |      *  具体逻辑：HttpClient用封装好的HttpGet发送get请求，获取HttpEntity，从HttpEntity中获取响应内容以及响应头
458 |      *  从响应头Content-Type中获取charset编码格式，如果响应头中没有编码格式响应头，就从响应内容中解析meta标签获取编码格式
459 |      *  然后将字节数组按响应头中的编码格式创建字符串
460 |      * */
461 |     public static void main(String[] args) {
462 |         String url1 = "https://jinshuai86.github.io/about";
463 |         String url2 = "http://port.patentstar.cn/bns/PtDataSvc.asmx?op=GetPatentData&_strPID=CN105961023A&_PdTpe=CnDesXmlTxt";
464 |         String url3 = "https://www.toutiao.com/";
465 |         String url4 = "http://xww.hebut.edu.cn";
466 |         String url5 = "http://www.baidu.com";
467 |         String url7 = "https://www.douban.com";
468 |         String url8 = "https://baike.baidu.com/item/";
469 |         String[] arr = {"碳酸铵","硫酸铁", "醋酸钠", "碳酸钙", "氢氧化钠", "硫酸亚铁", "高锰酸钾"};
470 | //        for (int i = 0; i < 10; i++) {
471 | //            HttpUtils.getSingleInstance().sendMessage("https://www.douban.com", "Forbidden-Topic");
472 | //        }
473 |         for (int i = 0; i < 2; i++) {
474 | //            System.out.println(i + " ============ ");
475 | //            HttpUtils.getSingleInstance().getContent(url8 + arr[i % arr.length]);
476 |             System.out.println(HttpUtils.getSingleInstance().getContent(url3));
477 |         }
478 |     }
479 | 
480 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/http/StatusHandler.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util.http;
 2 | 
 3 | import org.apache.http.HttpResponse;
 4 | 
 5 | /**
 6 |  * @author: JS
 7 |  * @date: 2019/4/12
 8 |  * @description: 状态码处理策略
 9 |  */
10 | public interface StatusHandler {
11 | 
12 |     void process(String URL, HttpResponse response);
13 | 
14 | }


--------------------------------------------------------------------------------
/spider-core/src/main/java/com/jinshuai/util/http/UserAgentArray.java:
--------------------------------------------------------------------------------
 1 | package com.jinshuai.util.http;
 2 | 
/**
 * @author: JS
 * @date: 2018/3/23
 * @description:
 *  A collection of User-Agent strings, kept so that the target server does not
 *  restrict the crawler for sending many requests.
 *  TODO: move these into a file
 */
public class UserAgentArray {

    // NOTE(review): the array is public and mutable; callers are expected to treat it as read-only.
    public static final String[] USER_AGENT = {
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    };

}


--------------------------------------------------------------------------------
/spider-core/src/main/resources/application.properties:
--------------------------------------------------------------------------------
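# Redis has no password authentication by default. If your server requires one, set it on the redis-password line below (redis-password=<your password>); otherwise leave it empty.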
 2 | redis-ip=127.0.0.1
 3 | redis-port=6379
 4 | redis-password=
 5 | 
 6 | # 解析的内容存放目录
 7 | dir=D:/Data/HEBUTNews/core
 8 | 
 9 | # 开启文本相似度检测
10 | similarCheck=false
11 | 
12 | # 目标任务数量
13 | targetNum=100
14 | 
15 | # MQ配置
16 | mq-ip=127.0.0.1
17 | mq-port=9876
18 | # MQ开关:0-关闭，1-打开
19 | mq-switch=0


--------------------------------------------------------------------------------
/spider-core/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!--scan:
 3 |             当此属性设置为true时，配置文件如果发生改变，将会被重新加载，默认值为true。
 4 | scanPeriod:
 5 |             设置监测配置文件是否有修改的时间间隔，如果没有给出时间单位，默认单位是毫秒。当scan为true时，此属性生效。默认的时间间隔为1分钟。
 6 | debug:
 7 |             当此属性设置为true时，将打印出logback内部日志信息，实时查看logback运行状态。默认值为false。
 8 | 
 9 | configuration 子节点为 appender、logger、root
10 | 
11 |             -->
12 | <configuration scan="true" scanPeriod="60 seconds" debug="false">
13 | 
14 |     <!--用于区分不同应用程序的记录-->
15 |     <contextName>spider-core</contextName>
16 | 
17 |     <!--日志文件所在目录，如果是tomcat，如下写法日志文件会在则为${TOMCAT_HOME}/bin/logs/目录下-->
18 |     <property name="LOG_HOME" value="logs/core"/>
19 | 
20 |     <!--控制台-->
21 |     <appender name="stdout" class="ch.qos.logback.core.ConsoleAppender">
22 |         <encoder>
23 |             <!--格式化输出：%d表示日期，%thread表示线程名，%-5level：级别从左显示5个字符宽度 %logger输出日志的logger名 %msg：日志消息，%n是换行符 -->
24 |             <pattern>[%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n</pattern>
25 |             <!--解决乱码问题-->
26 |             <charset>UTF-8</charset>
27 |         </encoder>
28 |         <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
29 |             <level>INFO</level>
30 |             <onMatch>ACCEPT</onMatch>
31 |             <onMismatch>DENY</onMismatch>
32 |         </filter>
33 |     </appender>
34 | 
35 |     <!--滚动文件 info以上的信息-->
36 |     <!---->
37 |     <!--<appender name="infoFile" class="ch.qos.logback.core.rolling.RollingFileAppender">-->
38 |         <!--&lt;!&ndash; ThresholdFilter:临界值过滤器，过滤掉 TRACE 和 DEBUG 级别的日志 &ndash;&gt;-->
39 |         <!--<filter class="ch.qos.logback.classic.filter.ThresholdFilter">-->
40 |             <!--<level>INFO</level>-->
41 |             <!--<onMatch>ACCEPT</onMatch>-->
42 |             <!--<onMismatch>DENY</onMismatch>-->
43 |         <!--</filter>-->
44 |         <!--<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">-->
45 |             <!--<fileNamePattern>${LOG_HOME}/log.%d{yyyy-MM-dd}.log</fileNamePattern>-->
46 |             <!--<maxHistory>30</maxHistory>&lt;!&ndash;保存最近30天的日志&ndash;&gt;-->
47 |         <!--</rollingPolicy>-->
48 |         <!--<encoder>-->
49 |             <!--<charset>UTF-8</charset>-->
50 |             <!--<pattern>[%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n</pattern>-->
51 |         <!--</encoder>-->
52 |     <!--</appender>-->
53 | 
54 |     <!--滚动文件 异常日志-->
55 |     <appender name="errorFile" class="ch.qos.logback.core.rolling.RollingFileAppender">
56 |         <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
57 |             <level>ERROR</level>
58 |         </filter>
59 |         <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
60 |             <fileNamePattern>${LOG_HOME}/error.%d{yyyy-MM-dd}.log</fileNamePattern>
61 |             <maxHistory>30</maxHistory>
62 |         </rollingPolicy>
63 |         <encoder>
64 |             <charset>UTF-8</charset>
65 |             <pattern>[%d{yyyy-MM-dd HH:mm:ss.SSS}] [%thread] %-5level %logger{36} : %msg%n</pattern>
66 |         </encoder>
67 |     </appender>
68 | 
69 |     <!--这里如果是info，spring、mybatis等框架则不会输出：TRACE < DEBUG < INFO <  WARN < ERROR-->
70 |     <!--root是所有logger的祖先，均继承root，如果某一个自定义的logger没有指定level，就会寻找
71 |     父logger看有没有指定级别，直到找到root。-->
72 |     <root level="debug">
73 |         <appender-ref ref="stdout"/>
74 |         <!--<appender-ref ref="infoFile"/>-->
75 |         <appender-ref ref="errorFile"/>
76 |         <!--<appender-ref ref="logstash"/>-->
77 |     </root>
78 | 
79 |     <!--为某个包单独配置logger
80 | 
81 |     比如定时任务，写代码的包名为：com.seentao.task
82 |     步骤如下：
83 | 
84 |     1、定义一个appender，取名为task（随意，只要下面logger引用就行了）
85 |     appender的配置按照需要即可
86 | 
87 |     2、定义一个logger:
88 |     <logger name="com.seentao.task" level="DEBUG" additivity="false">
89 |       <appender-ref ref="task" />
90 |     </logger>
91 |     注意：additivity必须设置为false，这样只会交给task这个appender，否则其他appender也会打印com.seentao.task里的log信息。
92 | 
93 |     3、这样，在com.seentao.task的logger就会是上面定义的logger了。
94 |     private static Logger logger = LoggerFactory.getLogger(Class1.class);
95 |     -->
96 | </configuration>


--------------------------------------------------------------------------------
/spider-core/src/test/java/com/TestGson.java:
--------------------------------------------------------------------------------
 1 | package com;
 2 | 
 3 | import com.google.gson.Gson;
 4 | import com.jinshuai.entity.UrlSeed;
 5 | import org.ansj.splitWord.analysis.*;
 6 | import org.junit.Assert;
 7 | import org.junit.Test;
 8 | import org.slf4j.Logger;
 9 | import org.slf4j.LoggerFactory;
10 | 
11 | /**
12 |  * @author: JS
13 |  * @date: 2018/3/27
14 |  * @description:
15 |  */
16 | public class TestGson {
17 | 
18 |    @Test
19 |    public void testSegment() {
20 |        String[] arr = {"碳酸铵","硫酸铁", "醋酸钠", "碳酸钙", "氢氧化钠", "硫酸亚铁", "高锰酸钾"};
21 |        for (String str : arr) {
22 |            System.out.println(BaseAnalysis.parse(str));
23 |            System.out.println(ToAnalysis.parse(str));
24 |            System.out.println(DicAnalysis.parse(str));
25 |            System.out.println(IndexAnalysis.parse(str));
26 |            System.out.println(NlpAnalysis.parse(str));
27 |        }
28 | 
29 |    }
30 | 
31 | }


--------------------------------------------------------------------------------
/spider-core/src/test/java/com/TestHttpClient.java:
--------------------------------------------------------------------------------
  1 | package com;
  2 | 
  3 | import org.apache.http.HttpEntity;
  4 | import org.apache.http.HttpResponse;
  5 | import org.apache.http.NameValuePair;
  6 | import org.apache.http.client.ClientProtocolException;
  7 | import org.apache.http.client.ResponseHandler;
  8 | import org.apache.http.client.entity.UrlEncodedFormEntity;
  9 | import org.apache.http.client.methods.CloseableHttpResponse;
 10 | import org.apache.http.client.methods.HttpGet;
 11 | import org.apache.http.client.methods.HttpPost;
 12 | import org.apache.http.client.utils.URIBuilder;
 13 | import org.apache.http.impl.client.AbstractHttpClient;
 14 | import org.apache.http.impl.client.CloseableHttpClient;
 15 | import org.apache.http.impl.client.HttpClientBuilder;
 16 | import org.apache.http.impl.client.HttpClients;
 17 | import org.apache.http.message.BasicNameValuePair;
 18 | import org.apache.http.util.EntityUtils;
 19 | 
 20 | import java.io.IOException;
 21 | import java.net.URI;
 22 | import java.net.URISyntaxException;
 23 | import java.util.ArrayList;
 24 | import java.util.HashMap;
 25 | import java.util.List;
 26 | import java.util.Random;
 27 | 
 28 | /**
 29 |  * @author: JS
 30 |  * @date: 2018/3/22
 31 |  * @description:
 32 |  */
 33 | public class TestHttpClient{
 34 | 
 35 |     private static CloseableHttpClient httpClient = HttpClients.createDefault();
 36 |     private static ResponseHandler<String> responseHandler;
 37 |     private static CloseableHttpResponse response;
 38 |     private static HttpEntity httpEntity;
 39 | 
 40 |     public static void main(String[] args) {
 41 |         testPost();
 42 |     }
 43 | 
 44 |     static void testPost() {
 45 |         try {
 46 |             HttpPost httpPost = new HttpPost("http://ikc.hebut.edu.cn/view/User/Login.ashx");
 47 |             List<NameValuePair> nvps = new ArrayList<NameValuePair>();
 48 |             nvps.add(new BasicNameValuePair("userid", "js_214"));
 49 |             nvps.add(new BasicNameValuePair("userpassword", "123456"));
 50 |             httpPost.setEntity(new UrlEncodedFormEntity(nvps));
 51 |             response = httpClient.execute(httpPost);
 52 |             System.out.println(response.getStatusLine());
 53 |             httpEntity = response.getEntity();
 54 |             // do something useful with the response body
 55 |             // and ensure it is fully consumed
 56 |             EntityUtils.consume(httpEntity);
 57 |             EntityUtils.toString(httpEntity);
 58 |         } catch(IOException e) {
 59 |             e.printStackTrace();
 60 |         } finally {
 61 |             try {
 62 |                 response.close();
 63 |             } catch (IOException e) {
 64 |                 e.printStackTrace();
 65 |             }
 66 |         }
 67 |     }
 68 | 
 69 |     static void testGet() {
 70 |         HttpGet httpGet = new HttpGet(getURI("https","baike.baidu.com","/item/数据库引擎",null));
 71 |         try {
 72 |             response = httpClient.execute(httpGet);
 73 |             httpEntity = response.getEntity();            System.out.println(response.getStatusLine());
 74 |             System.out.println("Executing request " + httpGet.getRequestLine());
 75 | 
 76 |             // Create a custom response handler
 77 |             ResponseHandler<String> responseHandler = TestHttpClient.getSingleResponseHandlerInstance();
 78 |             String responseBody = httpClient.execute(httpGet, responseHandler);
 79 |             System.out.println("----------------------------------------");
 80 |             System.out.println(responseBody);
 81 |         } catch (IOException e) {
 82 |             e.printStackTrace();
 83 |         } finally {
 84 |             try {
 85 |                 response.close();
 86 |                 httpClient.close();
 87 |                 response.getEntity();
 88 |             } catch (Exception e) {
 89 |                 e.printStackTrace();
 90 |             }
 91 |         }
 92 |     }
 93 | 
 94 |     static ResponseHandler getSingleResponseHandlerInstance() {
 95 |         if (responseHandler == null) {
 96 |             synchronized (TestHttpClient.class) {
 97 |                 if (responseHandler == null) {
 98 |                     responseHandler = new ResponseHandler<String>() {
 99 |                         public String handleResponse(final HttpResponse httpResponse) throws ClientProtocolException, IOException {
100 |                             int status = httpResponse.getStatusLine().getStatusCode();
101 |                             if (status >= 200 && status < 300) {
102 |                                 HttpEntity entity = httpResponse.getEntity();
103 |                                 return entity != null ? EntityUtils.toString(entity,"UTF-8") : null;
104 |                             } else {
105 |                                 throw new ClientProtocolException("Unexpected response status: " + status);
106 |                             }
107 |                         }
108 |                     };
109 |                 }
110 |             }
111 |         }
112 |         return responseHandler;
113 |     }
114 | 
115 |     static URI getURI(String scheme, String host, String path, HashMap<String,String> ...parameters) {
116 |         URI uri = null;
117 |         try {
118 |             uri = new URIBuilder()
119 |                     .setScheme(scheme)
120 |                     .setHost(host)
121 |                     .setPath(path)
122 |                     .setParameter("btnG", "Google Search")
123 |                     .setParameter("aq", "f")
124 |                     .setParameter("oq", "")
125 |                     .build();
126 |         } catch (URISyntaxException e) {
127 |             e.printStackTrace();
128 |         }
129 |         return uri;
130 |     }
131 | 
132 | }


--------------------------------------------------------------------------------
/spider-core/src/test/java/com/TestJDBC.java:
--------------------------------------------------------------------------------
 1 | package com;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.InputStream;
 6 | import java.sql.Connection;
 7 | import java.sql.DriverManager;
 8 | import java.sql.SQLException;
 9 | 
/**
 * @author: JS
 * @date: 2018/4/22
 * @description:
 * Scratch program looking at what happens after conn.close() has been called: the
 * connection is closed once directly and a second time inside a try-with-resources block.
 */
public class TestJDBC {

    public static void main(String[] args) throws ClassNotFoundException, SQLException {
        final String driverClass = "com.mysql.jdbc.Driver";
        final String jdbcUrl = "";

        Class.forName(driverClass);
        final Connection conn = DriverManager.getConnection(jdbcUrl);
        conn.close();

        final File emptyPath = new File("");
        try (InputStream ignoredStream = new FileInputStream(emptyPath)) {
            // Second close on the same connection.
            conn.close();
        } catch (Exception ignored) {
            // Deliberately swallowed: whatever opening the stream or the second close
            // throws is not reported.
        }
    }
}
31 | 


--------------------------------------------------------------------------------
/spider-core/src/test/java/com/TestJsoup.java:
--------------------------------------------------------------------------------
  1 | package com;
  2 | 
  3 | import junit.framework.TestCase;
  4 | import org.jsoup.Jsoup;
  5 | import org.jsoup.nodes.Document;
  6 | import org.jsoup.nodes.Element;
  7 | import org.jsoup.select.Elements;
  8 | 
  9 | import java.io.IOException;
 10 | import java.util.HashSet;
 11 | import java.util.Iterator;
 12 | import java.util.Set;
 13 | 
 14 | /**
 15 |  * @author: JS
 16 |  * @date: 2018/3/23
 17 |  * @description:
 18 |  */
 19 | public class TestJsoup extends TestCase{
 20 | 
 21 |     public void testJsoup() {
 22 |         Document document = Jsoup.parse("<!DOCTYPE html>\n" +
 23 |                 "<html>\n" +
 24 |                 "<head>\n" +
 25 |                 "    <meta http-equiv=\"content-type\" content=\"text/html; charset=gb2312\" />\n" +
 26 |                 "    <meta charset=\"UTF-8\" />\n" +
 27 |                 "    <meta http-equiv=\"X-UA-Compatible\" content=\"IE=10,IE=9,IE=8\" />\n" +
 28 |                 "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0, user-scalable=0, minimum-scale=1.0, maximum-scale=1.0\" />\n" +
 29 |                 "    <title>计算机辅助创新设计公共服务平台</title>\n" +
 30 |                 "    <meta name=\"robots\" content=\"noindex,follow\">\n" +
 31 |                 "    <link href=\"../../css/stylenew.css\" rel=\"stylesheet\" type=\"text/css\" />\n" +
 32 |                 "    <link href=\"../../css/lightbox.css\" rel=\"stylesheet\" type=\"text/css\" />\n" +
 33 |                 "    <script src=\"../bootstrap/js/jquery.js\" type=\"text/javascript\"></script>\n" +
 34 |                 "    <script src=\"../../js/NewsIndex.js\" type=\"text/javascript\"> </script>\n" +
 35 |                 "    <script src=\"js/main.js\" type=\"text/javascript\"></script>\n" +
 36 |                 "    <script type=\"text/javascript\">\n" +
 37 |                 "/* <![CDATA[ */\n" +
 38 |                 "var ie6w = {\"url\":\"http:\\/\\/localhost:8000\\/wp-content\\/plugins\\/shockingly-big-ie6-warning\",\"test\":\"false\",\"jstest\":\"false\",\"t1\":\"WARNING\",\"t2\":\"You are using Internet Explorer version 6.0 or lower. Due to security issues and lack of support for Web Standards it is highly recommended that you upgrade to a modern browser.\",\"firefox\":\"true\",\"opera\":\"true\",\"chrome\":\"true\",\"safari\":\"true\",\"ie\":\"true\",\"firefoxu\":\"http:\\/\\/www.getfirefox.net\\/\",\"operau\":\"http:\\/\\/www.opera.com\\/\",\"chromeu\":\"http:\\/\\/www.google.com\\/chrome\\/\",\"safariu\":\"http:\\/\\/www.apple.com\\/safari\\/\",\"ieu\":\"http:\\/\\/www.microsoft.com\\/windows\\/ie\\/\"};\n" +
 39 |                 "/* ]]> */\n" +
 40 |                 "    </script>\n" +
 41 |                 "    <script src=\"../js/ie6w_top.js\" type=\"text/javascript\"></script>\n" +
 42 |                 "    <!-- Begin - HITS-IE6 PNGFix -->\n" +
 43 |                 "    <!-- IE6 has not been detected as the users browser version by the server -->\n" +
 44 |                 "    <!--  End  - HITS-IE6 PNGFix -->\n" +
 45 |                 "    <!--[if lt IE 9]><script src=\"javascript:void(0)/wp-content/themes/CAI/js/html5.js\"></script><![endif]-->\n" +
 46 |                 "</head>\n" +
 47 |                 "<body class=\"home blog\" onload=\"loadnum()\">\n" +
 48 |                 "    <div class=\"headerwrap\">\n" +
 49 |                 "        <header class=\"header\">\n" +
 50 |                 "\t<div class=\"navbar\">\n" +
 51 |                 "    <h2 class=\"logo\">\n" +
 52 |                 "\t\t<img width=\"60px;\" height=\"20px;\" src=\"../../img/logo.png\">\n" +
 53 |                 "        <span class=\"label-important\">计算机辅助创新设计公共服务平台<br/>\n" +
 54 |                 "        \n" +
 55 |                 "        </span>\n" +
 56 |                 "        \n" +
 57 |                 "        </h2>\n" +
 58 |                 "        \n" +
 59 |                 "              \n" +
 60 |                 "        \n" +
 61 |                 "        \n" +
 62 |                 "\t\t<!--\n" +
 63 |                 "\t\t<ul class=\"nav\">\n" +
 64 |                 "\t\t\t<div class=\"menu\"></div>\n" +
 65 |                 "\t\t</ul>\n" +
 66 |                 "\t\t-->\n" +
 67 |                 "\t\t<div class=\"menu pull-right\">\n" +
 68 |                 "\t\t\t<form method=\"get\" class=\"dropdown search-form\" action=\"javascript:void(0)/\">\n" +
 69 |                 "\t\t\t\t\n" +
 70 |                 "\t\t\t\t<ul class=\"dropdown-menu search-suggest\"></ul>\n" +
 71 |                 "\t\t\t</form>\n" +
 72 |                 "\t\t\t<!--\n" +
 73 |                 "\t\t\t<div class=\"btn-group pull-left\">\n" +
 74 |                 "\t\t\t\t<button class=\"btn btn-primary\" data-toggle=\"modal\" data-target=\"#feed\">订阅</button>\n" +
 75 |                 "\t\t\t\t\t\t\t</div>\n" +
 76 |                 "\t\t\t-->\n" +
 77 |                 "\t\t</div>\n" +
 78 |                 "\n" +
 79 |                 "\t\t<a  style=\"float:right; font-size:medium\" href=\"#\" onclick=\"downloadApp();\" >TRIZ创新辅助APP(下载量:<span id=\"downloadcount\"></span>)</a>\n" +
 80 |                 "        \n" +
 81 |                 "\t</div>\n" +
 82 |                 "\t\n" +
 83 |                 "\n" +
 84 |                 "\t\n" +
 85 |                 "\n" +
 86 |                 "\t<!--\n" +
 87 |                 "\t<div class=\"speedbar\">\n" +
 88 |                 "\t\t\t\t\n" +
 89 |                 "\t\t\t</div>\n" +
 90 |                 "\t-->\n" +
 91 |                 "\n" +
 92 |                 "\n" +
 93 |                 "</header>\n" +
 94 |                 "    </div>\n" +
 95 |                 "    <section class=\"container\">\n" +
 96 |                 "\t\n" +
 97 |                 "<div id=\"idcontentwrap\" style=\"float:left;width:100%;\">\n" +
 98 |                 "\t<div class=\"content\">\n" +
 99 |                 "        <div class=\"slideshow_container slideshow_container_style-light\" style=\"height: 326px; width: 880px;\" data-session-id=\"0\" data-style-name=\"style-light\" data-style-version=\"2.2.21\">\n" +
100 |                 "\n" +
101 |                 "\t\t\t\n" +
102 |                 "\t\n" +
103 |                 "\t<div class=\"slideshow_content\" style=\"width: 880px; height: 326px;\">\n" +
104 |                 "\n" +
105 |                 "\t\t<div style=\"width: 880px; height: 326px; z-index: 0; display: block; top: 0px; left: 0px;\" class=\"slideshow_view slideshow_currentView\">\n" +
106 |                 "\t\t\t<div style=\"margin-left: 0px; margin-right: 0px; width: 880px; height: 326px;\" class=\"slideshow_slide slideshow_slide_image\">\n" +
107 |                 "\t\t\t\t\t\t\t\t\t \n" +
108 |                 "                                    <img style=\"margin-top: -106px; margin-left: 0px; width: 880px; height: 538px;\" src=\"../../img/chapter_innovation_bob.jpg\"  height=\"538\" width=\"880\">\n" +
109 |                 "\t\t\t\t\t\t\t\t<div style=\"display: block; position: absolute; top: 326px;\" class=\"slideshow_description_box slideshow_transparent\">\n" +
110 |                 "                                </div>\n" +
111 |                 "\t\t\t</div>\n" +
112 |                 "\n" +
113 |                 "\t\t\t<div style=\"clear: both;\"></div></div><div style=\"top: 326px; width: 880px; height: 326px; left: 0px; z-index: 0; display: block;\" class=\"slideshow_view\">\n" +
114 |                 "\t\t\t<div style=\"margin-left: 0px; margin-right: 0px; width: 880px; height: 326px;\" class=\"slideshow_slide slideshow_slide_image\">\n" +
115 |                 "\t\t\t\t\t\t\t\t\t<img style=\"margin-top: 0px; margin-left: 0px; width: 880px; height: 326px;\" src=\" \"   height=\"326\" width=\"880\">\n" +
116 |                 "\t\t\t\t\t\t\t\t<div style=\"display: block; position: absolute; top: 326px;\" class=\"slideshow_description_box slideshow_transparent\"></div>\n" +
117 |                 "\t\t\t</div>\n" +
118 |                 "\n" +
119 |                 "\t\t\t<div style=\"clear: both;\"></div></div><div style=\"top: 326px; width: 880px; height: 326px; left: 0px; z-index: 0; display: block;\" class=\"slideshow_view\">\n" +
120 |                 "\t\t\t<div style=\"margin-left: 0px; margin-right: 0px; width: 880px; height: 326px;\" class=\"slideshow_slide slideshow_slide_image\">\n" +
121 |                 "\t\t\t\t\t\t\t\t\t<img style=\"margin-top: 0px; margin-left: 0px; height: 326px; width: 880px;\"  height=\"326\" width=\"880\">\n" +
122 |                 "\t\t\t\t\t\t\t\t<div style=\"display: block; position: absolute; top: 326px;\" class=\"slideshow_description_box slideshow_transparent\">\n" +
123 |                 "\t\t\t\t\t<div class=\"slideshow_title\"></div>\t\t\t\t\t\t\t\t\t</div>\n" +
124 |                 "\t\t\t</div>\n" +
125 |                 "\n" +
126 |                 "\t\t\t<div style=\"clear: both;\"></div></div>\n" +
127 |                 "\t</div>\n" +
128 |                 "\n" +
129 |                 "\t<div class=\"slideshow_controlPanel slideshow_transparent\" style=\"display: none;\"><ul><li class=\"slideshow_togglePlay\" data-play-text=\"Play\" data-pause-text=\"Pause\"></li></ul></div>\n" +
130 |                 "\n" +
131 |                 "\t<div title=\"Previous\" tabindex=\"0\" class=\"slideshow_button slideshow_previous slideshow_transparent\" role=\"button\" data-previous-text=\"Previous\" style=\"display: block;\"><span class=\"assistive-text hide-text\">Previous</span></div>\n" +
132 |                 "\t<div title=\"Next\" tabindex=\"0\" class=\"slideshow_button slideshow_next slideshow_transparent\" role=\"button\" data-next-text=\"Next\" style=\"display: block;\"><span class=\"assistive-text hide-text\">Next</span></div>\n" +
133 |                 "\n" +
134 |                 "\t<div class=\"slideshow_pagination\" style=\"display: block; opacity: 1;\" data-go-to-text=\"Go to slide\"><div class=\"slideshow_pagination_center\"><ul><li tabindex=\"0\" class=\"slideshow_transparent slideshow_currentView\" data-view-id=\"0\" role=\"button\" title=\"Go to slide 1\"><span class=\"assistive-text hide-text\">Go to slide 1</span></li><li tabindex=\"0\" class=\"slideshow_transparent\" data-view-id=\"1\" role=\"button\" title=\"Go to slide 2\"><span class=\"assistive-text hide-text\">Go to slide 2</span></li><li tabindex=\"0\" class=\"slideshow_transparent\" data-view-id=\"2\" role=\"button\" title=\"Go to slide 3\"><span class=\"assistive-text hide-text\">Go to slide 3</span></li></ul></div></div>\n" +
135 |                 "\n" +
136 |                 "\t<!-- WordPress Slideshow Version 2.2.21 -->\n" +
137 |                 "\n" +
138 |                 "\t</div>\t</div>\n" +
139 |                 "</div>\n" +
140 |                 "\n" +
141 |                 "<aside class=\"sidebar\">\t\n" +
142 |                 "<div class=\"widget d_textbanner\"><a class=\"style01\" href=\"./view/thinking/thought.html\" target=\"_blank\"><strong></strong><h2>创新理论基础 - 培养专业的创新思维</h2><p>目标：训练设计者思维、扩展视野、打破思维局限</p></a></div><div class=\"widget d_textbanner\"><a class=\"style02\" href=\"./view/user/login.html\" target=\"_blank\"><strong></strong><h2>创新工具 - 加速创新过程</h2><p>包括标准解、冲突、进化三大经典及前期的问题分析工具</p></a></div><div class=\"widget d_textbanner\"><a class=\"style05\" href=\"./view/search/index.html\" target=\"_blank\"><strong></strong><h2>知识检索 专利推荐</h2><p>快速检索到相关领域的内容 加速创新进程</p></a></div></aside>\n" +
143 |                 "\n" +
144 |                 "\n" +
145 |                 "<div id=\"idcontentzhuanjia\" class=\"content-wrap\">\n" +
146 |                 "\n" +
147 |                 "\n" +
148 |                 "\t<div class=\"content-fullwidth\">\n" +
149 |                 "\t<h2 class=\"title\">创新动态</h2>\n" +
150 |                 "    <div id=\"cxdt\">\n" +
151 |                 "</div>\n" +
152 |                 "\t</div>\n" +
153 |                 "</div>\n" +
154 |                 "\n" +
155 |                 "\n" +
156 |                 "\n" +
157 |                 "<div id=\"idcontentchuangyichanpin\" class=\"content-wrap\">\n" +
158 |                 "\t<div class=\"content-fullwidth\">\n" +
159 |                 "\t<h2 class=\"title\">创意展示 - 新产品</h2>\n" +
160 |                 "<div id=\"cyzsxcp\">\n" +
161 |                 "</div>\n" +
162 |                 "\t</div>\n" +
163 |                 "</div>\n" +
164 |                 "\n" +
165 |                 "<div id=\"idcontentchuangyichuangyi\" class=\"content-wrap\">\n" +
166 |                 "\t<div class=\"content-fullwidth\">\n" +
167 |                 "\t<h2 class=\"title\">创意展示 - 新创意</h2>\n" +
168 |                 "    <div id=\"cyzsxcy\">\n" +
169 |                 "</div>\n" +
170 |                 "\t</div>\n" +
171 |                 "</div>\n" +
172 |                 "\n" +
173 |                 "<div id=\"idcontentzhishigongxiang\" class=\"content-wrap\">\n" +
174 |                 "\n" +
175 |                 "\t<div class=\"content-fullwidth\">\n" +
176 |                 "\t<h2 class=\"title\">知识共享</h2>\n" +
177 |                 "     <div id=\"zsgx\">\n" +
178 |                 "</div>\n" +
179 |                 "\t</div>\n" +
180 |                 "</div>\n" +
181 |                 "\n" +
182 |                 "\n" +
183 |                 "<div id=\"idcontentfangan\" class=\"content-wrap\">\n" +
184 |                 "\t<div class=\"content-fullwidth\">\n" +
185 |                 "\t<h2 class=\"title\">方案征集</h2>\n" +
186 |                 "<div class=\"gridstyle4cols\">\n" +
187 |                 "<div id=\"fazj\">\n" +
188 |                 "\n" +
189 |                 "</div>\n" +
190 |                 "\n" +
191 |                 "</div>\n" +
192 |                 "\t<!--\n" +
193 |                 "\t\n" +
194 |                 "\t-->\n" +
195 |                 "\t</div>\n" +
196 |                 "</div>\n" +
197 |                 "\n" +
198 |                 "\n" +
199 |                 "\n" +
200 |                 "\n" +
201 |                 "\n" +
202 |                 "\n" +
203 |                 "\n" +
204 |                 "\n" +
205 |                 "\n" +
206 |                 "</section>\n" +
207 |                 "    <!--------------------------------------------------友情链接--------------------------------------------------------------------------------->\n" +
208 |                 "    <div class=\"dilink\">\n" +
209 |                 "        <h3>\n" +
210 |                 "            友情链接 <a href=\"\"></a>\n" +
211 |                 "        </h3>\n" +
212 |                 "        <a target=\"_blank\" href=\"http://www.hebstd.gov.cn/\">\n" +
213 |                 "            <img width=\"150\" height=\"60\" src=\"./img/connect3.png\" alt=\"\" /></a> <a target=\"_blank\"\n" +
214 |                 "                href=\"http://www.trizconsulting.com/\">\n" +
215 |                 "                <img width=\"150\" height=\"60\" src=\"./img/connect5.jpg\" alt=\"\" /></a> <a target=\"_blank\"\n" +
216 |                 "                    href=\"http://www.triz.com.cn\">\n" +
217 |                 "                    <img width=\"150\" height=\"60\" src=\"./img/connect8.jpg\" alt=\"\" /></a>\n" +
218 |                 "        <a target=\"_blank\" href=\"http://www.triz-journal.com/\">\n" +
219 |                 "            <img width=\"150\" height=\"60\" src=\"./img/connect6.jpg\" alt=\"\" /></a> <a target=\"_blank\"\n" +
220 |                 "                href=\"http://www.most.gov.cn/\">\n" +
221 |                 "                <img width=\"150\" height=\"60\" src=\"./img/connect1.jpg\" alt=\"\" /></a> <a target=\"_blank\"\n" +
222 |                 "                    href=\"http://www.innovationmethod.org.cn/\">\n" +
223 |                 "                    <img width=\"150\" height=\"60\" src=\"./img/connect2.jpg\" alt=\"\" /></a>\n" +
224 |                 "        <div>\n" +
225 |                 "            <select onchange=\"javascript:window.open(this.options[this.selectedIndex].value)\"\n" +
226 |                 "                id=\"select2\" name=\"areaselect\">\n" +
227 |                 "                <option selected=\"\">国家级</option>\n" +
228 |                 "                <option value=\"http://www.most.gov.cn/\">国家科技部</option>\n" +
229 |                 "                <option value=\"http://www.mii.gov.cn/mii/index.html\">国家信息产业部</option>\n" +
230 |                 "                <option value=\"http://www.moe.edu.cn/\">国家教育部</option>\n" +
231 |                 "                <option value=\"http://www.dost.moe.edu.cn/\">教育部科技司</option>\n" +
232 |                 "                <option value=\"http://www.cutech.edu.cn/\">教育部科技发展</option>\n" +
233 |                 "                <option value=\"http://www.chinainfo.gov.cn/\">中国科学情报网</option>\n" +
234 |                 "                <option value=\"http://www.cnipr.com/\">中国知识产权网</option>\n" +
235 |                 "                <option value=\"http://www.casted.org.cn/cn/\">中国科技战略研究网</option>\n" +
236 |                 "            </select>\n" +
237 |                 "            <select onchange=\"javascript:window.open(this.options[this.selectedIndex].value)\"\n" +
238 |                 "                id=\"select3\" name=\"areaselect\">\n" +
239 |                 "                <option selected=\"\">各省有关部门</option>\n" +
240 |                 "                <option value=\"http://www.triz.gov.cn\">黑龙江TRIZ专题网站</option>\n" +
241 |                 "                <option value=\"http://www.chiantriz.net.cn\">四川省技术创新方法网</option>\n" +
242 |                 "                <option value=\"http://www.jstriz.cn/\">TRIZ江苏创新方法网</option>\n" +
243 |                 "                <option value=\"http://www.cxff100.com/\">天津创新方法网</option>\n" +
244 |                 "                <option value=\"http://www.sntriz.cn/\">陕西创新方法网</option>\n" +
245 |                 "                <option value=\"http://www.xjtriz.gov.cn/\">新疆创新方法网</option>\n" +
246 |                 "                <option value=\"http://www.gdim.org.cn\">广东创新方法网</option>\n" +
247 |                 "                <option value=\"http://www.hntriz.cn/\">河南省创新方法网</option>\n" +
248 |                 "                <option value=\"http://www.yntriz.rog.cn/\">云南创新方法网</option>\n" +
249 |                 "                <option value=\"http://www.jltriz.com:8080/\">吉林省创新方法网</option>\n" +
250 |                 "                <option value=\"http://www.cxff.org\">重庆市创新方法网</option>\n" +
251 |                 "            </select>\n" +
252 |                 "            <select onchange=\"javascript:window.open(this.options[this.selectedIndex].value)\"\n" +
253 |                 "                id=\"select1\" name=\"areaselect\">\n" +
254 |                 "                <option selected=\"\">本省内</option>\n" +
255 |                 "                <option value=\"http://www.hebstd.gov.cn/\">河北省科技厅</option>\n" +
256 |                 "                <option value=\"http://qbs.heinfo.gov.cn/index.do?templet=index\">河北省科学情报研究院</option>\n" +
257 |                 "                <option value=\"http://www.triz.com.cn\">河北工业大学triz研究中心</option>\n" +
258 |                 "                <option value=\"http://www.ii.gov.cn/\">河北省工信厅</option>\n" +
259 |                 "                <option value=\"http://www.hebsts.org.cn/\">河北省科技统计网</option>\n" +
260 |                 "                <option value=\"http://www.heinfo.gov.cn/\">河北科技信息网</option>\n" +
261 |                 "                <option value=\"http://www.hbdrc.gov.cn/\">河北省发改委员会</option>\n" +
262 |                 "                <option value=\"http://www.hebcz.gov.cn/\">河北省财政厅</option>\n" +
263 |                 "                <option value=\"http://www.hebipo.gov.cn/\">河北省知识产权信息网</option>\n" +
264 |                 "                <option value=\"http://www.hbkp.gov.cn/template/site00_index.jsp\">河北科普网</option>\n" +
265 |                 "                <option value=\"http://www.he.lss.gov.cn/\">省人力资源和社会保障厅</option>\n" +
266 |                 "            </select>\n" +
267 |                 "        </div>\n" +
268 |                 "    </div>\n" +
269 |                 "    <!--------------------------------------------------友情链接--------------------------------------------------------------------------------->\n" +
270 |                 "    <div class=\"footerwrap\">\n" +
271 |                 "        <footer class=\"footer\">\n" +
272 |                 "    <div class=\"footer-inner\">\n" +
273 |                 "        <div class=\"copyright pull-left\"> \n" +
274 |                 "版权所有：河北工业大学CAI研究实验室 天津市北辰区西平道 5340 号, 邮编:300401\n" +
275 |                 "        </div>\n" +
276 |                 "        <div class=\"trackcode pull-right\"><a href=\"javascript:void(0)\">・</a>\n" +
277 |                 "                    </div>\n" +
278 |                 "    </div>\n" +
279 |                 "</footer>\n" +
280 |                 "    </div>\n" +
281 |                 "    <div class=\"modal hide fade\" id=\"feed\" tabindex=\"-1\" style=\"width: 400px; margin-left: -200px;\">\n" +
282 |                 "        <div class=\"modal-header\">\n" +
283 |                 "            <button type=\"button\" class=\"close\" data-dismiss=\"modal\">\n" +
284 |                 "                ×</button><h3>\n" +
285 |                 "                    订阅计算机辅助创新设计公共服务平台</h3>\n" +
286 |                 "        </div>\n" +
287 |                 "        <div class=\"modal-body\">\n" +
288 |                 "            <p>\n" +
289 |                 "                <strong>订阅地址</strong><br>\n" +
290 |                 "                <input class=\"input-block-level\" readonly=\"readonly\" type=\"text\"></p>\n" +
291 |                 "            <p>\n" +
292 |                 "                <strong>订阅到</strong><br>\n" +
293 |                 "                <a class=\"btn btn-mini btn-success\" target=\"_blank\" href=\"http://mail.qq.com/cgi-bin/feed?u=http://\">\n" +
294 |                 "                    QQ邮箱</a> <a class=\"btn btn-mini btn-success\" target=\"_blank\" href=\"http://www.xianguo.com/subscribe.php?url=http://\">\n" +
295 |                 "                        鲜果</a> <a class=\"btn btn-mini btn-success\" target=\"_blank\" href=\"http://mail.qq.com/cgi-bin/feed?u=http://\">\n" +
296 |                 "                            抓虾</a></p>\n" +
297 |                 "        </div>\n" +
298 |                 "    </div>\n" +
299 |                 "    <div class=\"rollto\">\n" +
300 |                 "        <button class=\"btn btn-inverse\" data-type=\"totop\" title=\"回顶部\">\n" +
301 |                 "            <i class=\"icon-eject icon-white\"></i>\n" +
302 |                 "        </button>\n" +
303 |                 "    </div>\n" +
304 |                 "   \n" +
305 |                 "</body>\n" +
306 |                 "<script type=\"text/javascript\">\n" +
307 |                 "  function loadnum(){\n" +
308 |                 "        //显示，修改下载量\n" +
309 |                 "        $.ajax({\n" +
310 |                 "            url: \"http://etriz.hebut.edu.cn/down/count?countMethod=0\",\n" +
311 |                 "            type: 'get',\n" +
312 |                 "            crossDomain: true,\n" +
313 |                 "            dataType: 'jsonp',\n" +
314 |                 "            success: function (data, status) {\n" +
315 |                 "                data = $.parseJSON(data);\n" +
316 |                 "                $(\"#downloadcount\").text(data[\"\\\"result\\\"\"]);\n" +
317 |                 "            }\n" +
318 |                 "        });\n" +
319 |                 "    }\n" +
320 |                 "    function downloadApp() {\n" +
321 |                 "        //导出管理创新项目立项评审意见汇总表\n" +
322 |                 "\n" +
323 |                 "        //显示，修改下载量\n" +
324 |                 "        $.ajax({\n" +
325 |                 "            url: \"http://etriz.hebut.edu.cn/down/count?countMethod=1\",\n" +
326 |                 "            type: 'get',\n" +
327 |                 "            crossDomain: true,\n" +
328 |                 "            dataType: 'jsonp',\n" +
329 |                 "            success: function (data, status) {\n" +
330 |                 "                data = $.parseJSON(data);\n" +
331 |                 "                $(\"#downloadcount\").text(data[\"\\\"result\\\"\"]);\n" +
332 |                 "            }\n" +
333 |                 "        });\n" +
334 |                 "        //下载文件\n" +
335 |                 "        if (typeof (downloadApp.iframe) == \"undefined\") {\n" +
336 |                 "            var iframe = document.createElement(\"iframe\");\n" +
337 |                 "            downloadApp.iframe = iframe;\n" +
338 |                 "            downloadApp.iframe.style.display = \"none\";\n" +
339 |                 "            document.body.appendChild(downloadApp.iframe);\n" +
340 |                 "        }\n" +
341 |                 "        downloadApp.iframe.src = \"/view/User/etrizappdownload.ashx\";\n" +
342 |                 "    }\n" +
343 |                 "</script>\n" +
344 |                 "</html>");
345 |         Document document1 = null;
346 |         try {
347 |             document1 = Jsoup.connect("http://xww.hebut.edu.cn/gdyw/67001.htm").get();
348 | //            // 标题
349 | //            Element element = document1.selectFirst("div.sub_articleTitle");
350 | //            System.out.println(element.getElementsByTag("h2").text());
351 | //            // 时间
352 | //            Element element1 = document1.selectFirst("div.sub_articleAuthor");
353 | //            System.out.println(element1.getElementsByTag("strong").eachText().get(0));
354 | //            // 正文
355 | //            Element element2 = document1.selectFirst("div.sub_articleInfo");
356 | 
357 |             //StringBuilder stringBuilder = new StringBuilder();
358 |             Set<String> urlSeeds = new HashSet<String>();
359 |             Iterator iterator = document1.getElementsByTag("a").iterator();
360 |             while (iterator.hasNext()) {
361 |                 Element element3 = (Element) iterator.next();
362 |                 String href = element3.attr("href").toString();
363 |                 if (href.contains("/") || href.contains("#")) continue;
364 |                 urlSeeds.add("http://xww.hebut.edu.cn/gdyw/" + href);
365 |                 //stringBuilder.append(element3.text());
366 |             }
367 |             urlSeeds.remove("index.html");
368 |             urlSeeds.remove("");
369 |             urlSeeds.remove("javascript:void(0);");
370 |             System.out.println(urlSeeds);
371 | 
372 | 
373 |         } catch (IOException e) {
374 |             e.printStackTrace();
375 |         }
376 |         //System.out.println(document1.getElementsByTag("h2"));
377 | 
378 |     }
379 | 
380 | }
381 | 


--------------------------------------------------------------------------------
/spider-core/src/test/java/com/TestRedis.java:
--------------------------------------------------------------------------------
 1 | package com;
 2 | 
 3 | import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
 4 | import redis.clients.jedis.Jedis;
 5 | 
 6 | /**
 7 |  * Manual smoke test that sends a PING to a Redis server on 127.0.0.1:6379.
 8 |  *
 9 |  * @author: JS
10 |  * @date: 2018/3/27
11 |  */
12 | public class TestRedis {
13 | 
14 |     /**
15 |      * Creates a client for the local Redis server, prints the reply to
16 |      * {@code PING} and then closes the client.
17 |      *
18 |      * @param args unused
19 |      */
20 |     public static void main(String[] args) {
21 |         // Create a client for the local Redis service.
22 |         Jedis jedis = new Jedis("127.0.0.1", 6379);
23 |         try {
24 |             // Check whether the service is running.
25 |             System.out.println("服务正在运行: " + jedis.ping());
26 |         } finally {
27 |             // Always release the connection, even when ping() throws.
28 |             jedis.close();
29 |         }
30 |     }
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/spider-core/src/test/java/com/TestReg.java:
--------------------------------------------------------------------------------
 1 | package com;
 2 | 
 3 | import java.util.regex.Matcher;
 4 | import java.util.regex.Pattern;
 5 | 
 6 | /**
 7 |  * Scratch test that pulls the charset declared in an HTML fragment out of
 8 |  * its {@code <meta>} markup with a regular expression.
 9 |  *
10 |  * @author: JS
11 |  * @date: 2018/3/23
12 |  */
13 | public class TestReg {
14 | 
15 |     /**
16 |      * Finds the first {@code charset=} that follows a {@code <meta} after
17 |      * {@code <head>}; group 4 captures the text after the {@code =} (and an
18 |      * optional opening quote) up to the next double quote.
19 |      */
20 |     private static final Pattern CHARSET_PATTERN =
21 |             Pattern.compile("<head>([\\s\\S]*?)<meta([\\s\\S]*?)charset\\s*=(\")?(.*?)\"");
22 | 
23 |     /**
24 |      * Runs the charset pattern against a hard-coded HTML head fragment and
25 |      * prints the captured charset when the pattern matches.
26 |      *
27 |      * @param args unused
28 |      */
29 |     public static void main(String[] args) {
30 |         String src = "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
31 |                 "<head>\n" +
32 |                 "    <meta http-equiv=\"Cache-Control\" content=\"no-siteapp\"/>\n" +
33 |                 "<meta name=\"shenma-site-verification\" content=\"5a59773ab8077d4a62bf469ab966a63b_1497598848\"/>\n" +
34 |                 "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>\n" +
35 |                 "<meta name=\"referrer\" content=\"always\">\n" +
36 |                 "<link href=\"https://csdnimg.cn//public/favicon.ico\" rel=\"shortcut icon\" />\n" +
37 |                 "<link href=\"https://blog.csdn.net/lfdfhl/article/details/8225546\"  rel=\"canonical\" />\n" +
38 |                 "<link href=\"https://blog.csdn.net/lfdfhl/rss/list\" id=\"RSSLink\" title=\"RSS\" type=\"application/rss+xml\" rel=\"alternate\" />\n" +
39 |                 "<script src=\"https://csdnimg.cn/public/common/libs/jquery/jquery-1.9.1.min.js\" type=\"text/javascript\"></script>\n" +
40 |                 "<script src=\"https://dup.baidustatic.com/js/ds.js\" type=\"text/javascript\"></script>";
41 |         // Lower-case with Locale.ROOT so matching does not depend on the JVM's
42 |         // default locale (a Turkish locale, for example, maps 'I' to dotless 'ı').
43 |         Matcher matcher = CHARSET_PATTERN.matcher(src.toLowerCase(java.util.Locale.ROOT));
44 |         if (matcher.find()) {
45 |             System.out.println(matcher.group(4));
46 |         }
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------