├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── pom.xml
└── src
├── main
├── java
│ └── me
│ │ └── zhyd
│ │ └── hunter
│ │ ├── Hunter.java
│ │ ├── config
│ │ ├── HunterConfig.java
│ │ ├── HunterConfigContext.java
│ │ ├── HunterConfigTemplate.java
│ │ ├── HunterDateDeserializer.java
│ │ ├── HunterResolver.java
│ │ ├── HunterResolverConfig.java
│ │ └── platform
│ │ │ ├── BasePlatform.java
│ │ │ ├── CnblogsPlatform.java
│ │ │ ├── CsdnPlatform.java
│ │ │ ├── ImoocPlatform.java
│ │ │ ├── InnerPlatform.java
│ │ │ ├── IteyePlatform.java
│ │ │ ├── JianshuPlatform.java
│ │ │ ├── JuejinPlatform.java
│ │ │ ├── OschinaPlatform.java
│ │ │ ├── Platform.java
│ │ │ └── V2exPlatform.java
│ │ ├── consts
│ │ └── HunterConsts.java
│ │ ├── downloader
│ │ ├── HttpClientDownloader.java
│ │ └── HttpClientGenerator.java
│ │ ├── entity
│ │ ├── Cookie.java
│ │ ├── ImageLink.java
│ │ └── VirtualArticle.java
│ │ ├── enums
│ │ ├── ExitWayEnum.java
│ │ └── UserAgentEnum.java
│ │ ├── exception
│ │ └── HunterException.java
│ │ ├── processor
│ │ ├── BlogHunterProcessor.java
│ │ └── HunterProcessor.java
│ │ ├── resolver
│ │ ├── HtmlResolver.java
│ │ ├── JsonResolver.java
│ │ └── Resolver.java
│ │ ├── scheduler
│ │ └── BlockingQueueScheduler.java
│ │ └── util
│ │ ├── CommonUtil.java
│ │ ├── DateUtil.java
│ │ ├── HunterPrintWriter.java
│ │ └── PlatformUtil.java
└── resources
│ ├── HunterConfig.json
│ └── log4j.properties
└── test
├── java
└── me
│ └── zhyd
│ └── hunter
│ └── test
│ ├── CommonUtilTest.java
│ └── QuickStartTest.java
└── resources
└── log4j.properties
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.nar
17 | *.ear
18 | *.zip
19 | *.tar.gz
20 | *.rar
21 |
22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
23 | hs_err_pid*
24 |
25 | *.iml
26 | /.idea/
27 | /target/
28 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # 20210603
2 |
3 | 适配 CSDN 新版用户文章列表页面
4 |
5 | # 20200115
6 |
7 | 1. 修复因csdn页面结构变化导致的抓取发布时间错误的问题
8 | 2. 支持抓取OSCHINA的博文,by [小海](https://gitee.com/huhaitao) [PR#1](https://gitee.com/yadong.zhang/blog-hunter/pulls/1)
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 yadong.zhang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Blog Hunter /'hʌntɚ/
: 博客猎手,基于webMagic的博客爬取工具
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | -------------------------------------------------------------------------------
20 |
21 | 博客猎手,基于webMagic的博客爬取工具,支持慕课、csdn、iteye、cnblogs、掘金和V2EX等各大主流博客平台。**博客千万条,版权第一条。狩猎不规范,亲人两行泪。**
22 |
23 | ## 主要功能
24 |
25 | - **多个平台**:该项目内置了慕课、csdn、iteye、cnblogs、掘金、V2EX、oschina等多个主流的博客平台
26 | - **单篇抓取**:只需指定一个文章连接,即可自动抓取文章内容
27 | - **列表抓取**:只需简单的配置,就可快速抓取列表文章
28 | - **程序可控**:可选择根据抓取的链接数或者程序运行的时间停止程序
29 | - **字符流输出**:可配合前端,实现实时打印程序日志的功能
30 | - **多线程**:支持多线程抓取,效率更高
31 |
32 | ## 快速使用
33 |
34 | #### 添加依赖
35 |
36 | ```xml
37 | <dependency>
38 |     <groupId>me.zhyd.hunter</groupId>
39 |     <artifactId>blog-hunter</artifactId>
40 |     <version>1.0.4</version>
41 | </dependency>
42 | ```
43 |
44 | #### 抓取单篇文章
45 |
46 | ```java
47 | String url = "https://www.cnblogs.com/zhangyadong/p/oneblog.html";
48 | boolean convertImage = true;
49 | HunterProcessor hunter = new BlogHunterProcessor(url, convertImage);
50 | CopyOnWriteArrayList list = hunter.execute();
51 | ```
52 |
53 | - `url` 实际待抓取的文章地址
54 | - `convertImage` 是否转存图片,当选择true时会在结果中返回该文中的所有图片链接
55 |
56 | **返回结果**
57 |
58 | ```json
59 | [{
60 | "author": "HandsomeBoy丶",
61 | "content": "xx",
62 | "description": "xx",
63 | "imageLinks": [{
64 | "originalLink": "https://images2018.cnblogs.com/blog/631092/201809/631092-20180911093741389-1090581462.png",
65 | "srcLink": "https://images2018.cnblogs.com/blog/631092/201809/631092-20180911093741389-1090581462.png"
66 | }, {
67 | "originalLink": "https://img.shields.io/badge/MySQL-5.6.4-green.svg",
68 | "srcLink": "https://img.shields.io/badge/MySQL-5.6.4-green.svg"
69 | }],
70 | "releaseDate": 1536630780000,
71 | "source": "https://www.cnblogs.com/zhangyadong/p/oneblog.html",
72 | "tags": ["其他"],
73 | "title": "推荐一款自研的Java版开源博客系统OneBlog"
74 | }]
75 | ```
76 |
77 | `imageLink` 包含两个属性:`originalLink`,`srcLink`。其中`srcLink`为目标网站的`src`属性中的值,而`originalLink`表示真实的图片路径,之所以这么处理是因为有些网站使用了图片懒加载技术,`src`中并不是真实的图片地址。
78 |
79 | #### 抓取文章列表(只抓两篇文章)
80 |
81 | ```java
82 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.IMOOC);
83 | config.setUid("1175248")
84 | .setExitWay(ExitWayEnum.URL_COUNT)
85 | .setCount(2);
86 | HunterProcessor hunter = new BlogHunterProcessor(config);
87 | CopyOnWriteArrayList list = hunter.execute();
88 | ```
89 | **运行结果**
90 |
91 | ```
92 | 16:52:27,098 INFO HunterPrintWriter:38 - [ hunter ] springboot之一文带你搞懂Scheduler定时器(修订-详尽版) -- 慕冬雪 -- 2018-11-08 17:31:00
93 | 16:52:28,543 INFO HunterPrintWriter:38 - [ hunter ] springboot整合Mybatis+Mapper+Pagehelper(修订-详尽版) -- 慕冬雪 -- 2018-11-05 21:02:00
94 | ```
95 |
96 | #### 抓取文章列表(程序运行10秒后停止)
97 |
98 | ```java
99 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.CSDN);
100 | config.setUid("u011197448")
101 | .setExitWay(ExitWayEnum.DURATION)
102 | .setCount(10);
103 | HunterProcessor hunter = new BlogHunterProcessor(config);
104 | System.out.println("程序开始执行:" + new Date());
105 | CopyOnWriteArrayList list = hunter.execute();
106 | System.out.println("程序执行完毕:" + new Date());
107 | ```
108 | **运行结果**
109 |
110 | ```
111 | 程序开始执行:Mon Mar 04 16:56:56 CST 2019
112 | 16:56:59,274 INFO HunterPrintWriter:38 - [ hunter ] springboot整合Freemark模板(详尽版) -- 七彩狼 -- 2018-11-09 17:45:56
113 | 16:57:00,634 INFO HunterPrintWriter:38 - [ hunter ] DBlog开源博客新增博客迁移功能(支持多个站点) -- 七彩狼 -- 2018-08-24 17:16:24
114 | 16:57:01,862 INFO HunterPrintWriter:38 - [ hunter ] 【超赞】推荐一波优秀的开发工具 -- 七彩狼 -- 2018-07-27 10:40:31
115 | 16:57:03,080 INFO HunterPrintWriter:38 - [ hunter ] 消息称微软计划收购GitHub,估值超50亿美元 -- 七彩狼 -- 2018-06-04 10:11:12
116 | 16:57:04,356 INFO HunterPrintWriter:38 - [ hunter ] Springboot + Freemarker项目中使用自定义注解 -- 七彩狼 -- 2018-03-08 15:04:50
117 | 16:57:05,638 INFO HunterPrintWriter:38 - [ hunter ] StringRedisTemplate常用操作 -- 七彩狼 -- 2018-01-23 17:35:22
118 | 16:57:06,879 INFO HunterPrintWriter:38 - [ hunter ] JS异常(intermediate value)(intermediate value)(...) is not a function -- 七彩狼 -- 2018-01-23 17:30:15
119 | 程序执行完毕:Mon Mar 04 16:57:07 CST 2019
120 | ```
121 |
122 | #### 高级使用
123 |
124 | ```java
125 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.IMOOC);
126 | // set会重置,add会追加
127 | config.setEntryUrls("https://www.imooc.com/u/1175248/articles")
128 | .addEntryUrl("https://www.imooc.com/u/4321686/articles")
129 | // 设置程序退出的方式
130 | .setExitWay(ExitWayEnum.URL_COUNT)
131 | // 设定抓取20篇文章(exitWay 为 URL_COUNT 时,count 表示抓取条数), 如果所有文章都被抓取过了,则会提前停止
132 | .setCount(20)
133 | // 每次抓取间隔的时间
134 | .setSleepTime(100)
135 | // 失败重试次数
136 | .setRetryTimes(3)
137 | // 针对抓取失败的链接 循环重试次数
138 | .setCycleRetryTimes(3)
139 | // 开启的线程数
140 | .setThreadCount(5)
141 | // 开启图片转存
142 | .setConvertImg(true);
143 | HunterProcessor hunter = new BlogHunterProcessor(config);
144 | CopyOnWriteArrayList list = hunter.execute();
145 | ```
146 |
147 | **运行结果**
148 |
149 | ```
150 | 16:58:44,510 INFO HunterPrintWriter:38 - [ hunter ] 【硬核优惠】三月涨薪季,过关斩将,“职”由你! -- 慕课网官方_运营中心 -- 2019-03-01 11:58:00
151 | 16:58:44,512 INFO HunterPrintWriter:38 - [ hunter ] springboot整合Mybatis+Mapper+Pagehelper(修订-详尽版) -- 慕冬雪 -- 2018-11-05 21:02:00
152 | 16:58:44,510 INFO HunterPrintWriter:38 - [ hunter ] 慕课网每周干货福利礼包(第二十棒) -- 慕课网官方_运营中心 -- 2019-03-01 17:30:00
153 | 16:58:44,544 INFO HunterPrintWriter:38 - [ hunter ] springboot之一文带你搞懂Scheduler定时器(修订-详尽版) -- 慕冬雪 -- 2018-11-08 17:31:00
154 | 16:58:44,571 INFO HunterPrintWriter:38 - [ hunter ] springboot整合Freemark模板(修订-详尽版) -- 慕冬雪 -- 2018-11-02 21:05:00
155 | 16:58:45,138 INFO HunterPrintWriter:38 - [ hunter ] 直播 | 价值99元的2019前端面试课,限时免费听! -- 慕课网官方_运营中心 -- 2019-02-27 11:45:00
156 | 16:58:45,140 INFO HunterPrintWriter:38 - [ hunter ] 一次糟心的排错历程 -- 慕冬雪 -- 2018-10-15 11:47:00
157 | 16:58:45,142 INFO HunterPrintWriter:38 - [ hunter ] 慕课网每周干货福利礼包(第十九棒) -- 慕课网官方_运营中心 -- 2019-02-22 16:11:00
158 | 16:58:45,156 INFO HunterPrintWriter:38 - [ hunter ] 一文读懂慕课专栏,文末福利! -- 慕课网官方_运营中心 -- 2019-02-22 18:35:00
159 | 16:58:45,191 INFO HunterPrintWriter:38 - [ hunter ] SpringBoot项目实战(10):自定义freemarker标签 -- 慕冬雪 -- 2018-09-28 14:01:00
160 | 16:58:45,698 INFO HunterPrintWriter:38 - [ hunter ] 【慕课有约】bobo老师:算法就是一场“游戏”,攻关打Boss(上) -- 慕课网官方_运营中心 -- 2019-02-21 13:54:00
161 | 16:58:45,707 INFO HunterPrintWriter:38 - [ hunter ] 【面试技巧系列一】备战金三银四,涨薪先人一步 -- 慕课网官方_运营中心 -- 2019-02-20 15:54:00
162 | 16:58:45,727 INFO HunterPrintWriter:38 - [ hunter ] DBlog开源博客新增博客迁移功能(支持多个站点) -- 慕冬雪 -- 2018-08-24 17:33:00
163 | 16:58:45,955 INFO HunterPrintWriter:38 - [ hunter ] 详细介绍如何自研一款"博客搬家"功能 -- 慕冬雪 -- 2018-09-13 13:25:00
164 | 16:58:46,095 INFO HunterPrintWriter:38 - [ hunter ] echarts统计图中世界国家汉化表及汉化方式 -- 慕冬雪 -- 2018-08-22 13:58:00
165 | 16:58:46,128 INFO HunterPrintWriter:38 - [ hunter ] 【注意】恕我直言,我想教你抓取慕课的文章! -- 慕冬雪 -- 2018-07-31 18:33:00
166 | 16:58:46,173 INFO HunterPrintWriter:38 - [ hunter ] 慕课网每周干货福利礼包(第十八棒) -- 慕课网官方_运营中心 -- 2019-02-15 15:28:00
167 | 16:58:46,258 INFO HunterPrintWriter:38 - [ hunter ] 【中奖公告】012期:程序员们,你妈催你相亲/结婚/生娃了吗? -- 慕课网官方_运营中心 -- 2019-02-12 11:26:00
168 | 16:58:46,388 INFO HunterPrintWriter:38 - [ hunter ] DBlog建站之Websocket的实际使用方式 -- 慕冬雪 -- 2018-07-05 14:50:00
169 | 16:58:46,565 INFO HunterPrintWriter:38 - [ hunter ] 大神云集——Redis命令实现源码分析 -- 慕课网官方_运营中心 -- 2019-01-30 15:21:00
170 | ```
171 |
172 | #### 停止爬虫
173 |
174 | 创建Hunter时指定`uuid`,本例使用`当前用户的id`作为`uuid`
175 | ```java
176 | HunterProcessor hunter = new BlogHunterProcessor(config, writerUtil, userId);
177 | CopyOnWriteArrayList list = hunter.execute();
178 | ```
179 |
180 | 停止爬虫
181 |
182 | ```java
183 | Hunter spider = Hunter.getHunter(userId);
184 | spider.stop();
185 | ```
186 |
187 | **注意**
188 |
189 | 部分网站没有配置`Keywords`,所以在运行单元测试时如果碰到`Keywords`内容为空,可以忽略。如果是`title`、`content`等内容为空,请检查配置文件中的`xpath`匹配规则是否正确。
190 |
191 | 更多使用方式请参考文档...
192 |
193 | ## 配置信息
194 |
195 | | 字段 | 释义 | 数据类型 | 默认 | 必填 | 备注 |
196 | | :------------ | :------------: | :------------: | :------------: | :------------: | :------------ |
197 | | resolver | 针对每个`xxRegex`提供的解析器,可以通过该配置对单个属性指定提取规则 | Object | - | x | - |
198 | | resolver.releaseDate | 针对`releaseDate`提供的解析器 | Object | - | x | - |
199 | | resolver.releaseDate.type | 解析器类型,可选:regex、xpath,默认为xpath,并且当type不等于regex时,直接取xpath | string | `xpath` | x | - |
200 | | resolver.releaseDate.clazz | 需要处理的字段类类型,一般为数字类型,比如java.lang.Long、java.lang.Integer、java.lang.Float、java.lang.Double | string | - | x | - |
201 | | resolver.releaseDate.operator | 操作符,支持简单的`=-*/`操作,如果`type=regex`并且`clazz=数字类型`,则按照`operator`进行计算 | string | - | x | - |
202 | | resolver.title | 同上 | Object | - | x | - |
203 | | resolver.content | 同上 | Object | - | x | - |
204 | | resolver.author | 同上 | Object | - | x | - |
205 | | resolver.targetLinks | 同上 | Object | - | x | - |
206 | | resolver.tag | 同上 | Object | - | x | - |
207 | | resolver.keywords | 同上 | Object | - | x | - |
208 | | resolver.description | 同上 | Object | - | x | - |
209 | | titleRegex | 标题的匹配规则(`xpath`) | string | - | √ | - |
210 | | contentRegex | 内容的匹配规则(`xpath`) | string | - | √ | - |
211 | | releaseDateRegex | 发布日期的匹配规则(`xpath`) | string | - | √ | - |
212 | | authorRegex | 作者的匹配规则(`xpath`) | string | - | √ | - |
213 | | targetLinksRegex | 待抓取的url的匹配规则(`regex`) | string | - | √ | - |
214 | | tagRegex | 标签的匹配规则(`xpath`) | string | - | × | - |
215 | | keywordsRegex | 文章关键词的匹配规则(`xpath`) | string | `//meta[@name=keywords]/@content` | × | - |
216 | | descriptionRegex | 文章描述的匹配规则(`xpath`) | string | `//meta[@name=description]/@content` | × | - |
217 | | domain | 网站根域名 | string | - | √ | - |
218 | | charset | 网站编码 | string | `UTF-8` | × | - |
219 | | single | 是否抓取的单个文章 | bool | `false` | × | - |
220 | | sleepTime | 每次抓取等待的时间 | int | `1000` | × | - |
221 | | retryTimes | 抓取失败时重试的次数 | int | `2` | × | - |
222 | | cycleRetryTimes | 循环重试次数 | int | `2` | × | 抓取失败时重试的次数用完后依然未抓取成功时,循环重试 |
223 | | threadCount | 线程个数 | int | `1` | × | - |
224 | | entryUrls | 抓取入口地址 | list | - | √ | - |
225 | | exitWay | 程序退出的方式 | string | `URL_COUNT` | × | `DEFAULT`:默认方式,直到将所有匹配到的url抓取完成才会退出
`DURATION` 按照程序持续的时间,默认`60秒`
`URL_COUNT` 按照抓取的条数,默认`10条`|
226 | | count | 对应退出方式 | int | - | × | exitWay = `DURATION` 时默认`60`
exitWay = `URL_COUNT` 时默认`10`|
227 | | cookies | 网站的Cookie | list | - | × | 当有些网站必须需要登录时,可以指定该值,用以绕过登录 |
228 | | headers | http请求的header | map | - | × | 有些网站存在防盗链时,可能需要指定header |
229 | | ua | http请求的User-agent | String | - | × | 随机生成,不建议用mobile端的ua,因为有些网站根据ua自动跳转移动端和pc端链接,可能导致抓取失败 |
230 | | uid | 博客平台的用户id | String | - | × | 一般为用户个人中心里url后的一串随机字符串 |
231 | | onlyThisAuthor | 是否只抓取指定的uid用户 | bool | - | × | 保留字段,暂时无用 |
232 | | ajaxRequest | 是否为ajax渲染的页面 | bool | - | × | 保留字段,暂时无用 |
233 | | convertImg | 是否转存图片 | bool | - | × | 当选择true时会自动过滤原文中的img链接并返回,调用端可选择将图片下载后替换掉原来的图片 |
234 | | proxyList | 代理的列表 | list | - | × | 保留字段,暂时无用 |
235 | | proxyType | 代理的类型 | enum | - | × | 保留字段,暂时无用 |
236 |
237 | ## 交流
238 |
239 | | 微信(备注:`hunter加群`) | 欢迎关注公众号 |
240 | | :------------: | :------------: |
241 | |
|
|
242 |
243 |
244 | ## 致谢
245 |
246 | - [WebMagic](https://gitee.com/flashsword20/webmagic): 一个简单而又强大的爬虫框架
247 | - [Hutool](https://gitee.com/loolly/hutool): 一个优秀的Java工具包
248 | - [OneBlog](https://gitee.com/yadong.zhang/DBlog): 一个牛逼的Java开源博客
249 | - [JustAuth](https://gitee.com/yadong.zhang/JustAuth): 小而全而美的第三方登录开源组件。目前已支持Github、Gitee、微博、钉钉、百度、Coding、腾讯云开发者平台、OSChina、支付宝、QQ、微信、淘宝、Google、Facebook、抖音、领英、小米、微软、今日头条、Teambition、StackOverflow、Pinterest、人人、华为、企业微信、酷家乐、Gitlab、美团、饿了么和推特等第三方平台的授权登录。 Login, so easy!
250 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | 4.0.0
6 |
7 | me.zhyd.hunter
8 | blog-hunter
9 | 1.0.4
10 |
11 | blog-hunter
12 | https://github.com/zhangyd-c/blog-hunter
13 | 博客猎手,基于webMagic的博客爬取工具,支持慕课、csdn、iteye、cnblogs、掘金、V2EX 和开源中国等各大主流博客平台。博客千万篇,版权第一条。狩猎不规范,亲人两行泪。
14 |
15 |
16 |
17 | MIT License
18 | https://github.com/zhangyd-c/blog-hunter/blob/master/LICENSE
19 |
20 |
21 |
22 |
23 | scm:git:https://github.com/zhangyd-c/blog-hunter.git
24 | scm:git:https://github.com/zhangyd-c/blog-hunter.git
25 | https://github.com/zhangyd-c/blog-hunter
26 |
27 |
28 |
29 |
30 | yadong.zhang
31 | yadong.zhang0415@gmail.com
32 |
33 |
34 |
35 |
36 | UTF-8
37 | 1.8
38 | 1.8
39 | 1.8
40 | 2.2.1
41 | 3.7.0
42 | true
43 | 5.5.7
44 | 1.18.20
45 | 4.11
46 | [1.2.76,)
47 | 0.7.3
48 | 1.10.2
49 | 6.1.5.Final
50 | 8.5.24
51 | 1.2.17
52 |
53 |
54 |
55 |
56 | org.projectlombok
57 | lombok
58 | ${lombok-version}
59 |
60 |
61 | cn.hutool
62 | hutool-http
63 | ${hutool-version}
64 |
65 |
66 | junit
67 | junit
68 | ${junit-version}
69 | test
70 |
71 |
72 | com.alibaba
73 | fastjson
74 | ${fastjson-version}
75 |
76 |
77 | us.codecraft
78 | webmagic-core
79 | ${webmagic.version}
80 |
81 |
82 | us.codecraft
83 | webmagic-extension
84 | ${webmagic.version}
85 |
86 |
87 | org.jsoup
88 | jsoup
89 | ${jsoup.version}
90 |
91 |
92 | org.hibernate.validator
93 | hibernate-validator
94 | ${hibernate.validator.version}
95 |
96 |
97 | org.apache.tomcat
98 | tomcat-el-api
99 | ${tomcat.version}
100 | provided
101 |
102 |
103 | org.apache.tomcat
104 | tomcat-jasper-el
105 | ${tomcat.version}
106 | provided
107 |
108 |
109 | log4j
110 | log4j
111 | ${log4j.version}
112 |
113 |
114 |
115 |
116 | ${project.artifactId}-${project.version}
117 |
118 |
119 | org.apache.maven.plugins
120 | maven-compiler-plugin
121 | ${maven-compiler.version}
122 |
123 | ${project.build.sourceEncoding}
124 | ${java.version}
125 | ${java.version}
126 |
127 |
128 |
129 | maven-source-plugin
130 | ${maven-source.version}
131 | true
132 |
133 |
134 | package
135 |
136 | jar-no-fork
137 |
138 |
139 |
140 |
141 |
142 |
143 | org.apache.maven.plugins
144 | maven-javadoc-plugin
145 |
146 |
147 |
148 | org.apache.maven.plugins
149 | maven-gpg-plugin
150 |
151 |
152 |
153 |
154 |
155 | release
156 |
157 |
158 |
159 |
160 | org.apache.maven.plugins
161 | maven-source-plugin
162 | ${maven-source.version}
163 | true
164 |
165 |
166 | package
167 |
168 | jar-no-fork
169 |
170 |
171 |
172 |
173 |
174 |
175 | org.apache.maven.plugins
176 | maven-javadoc-plugin
177 |
178 |
179 | package
180 |
181 | jar
182 |
183 |
184 |
185 |
186 |
187 |
188 | org.apache.maven.plugins
189 | maven-gpg-plugin
190 |
191 |
192 | verify
193 |
194 | sign
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 | sonatype-nexus-snapshots
204 | https://oss.sonatype.org/content/repositories/snapshots/
205 |
206 |
207 | sonatype-nexus-staging
208 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
209 |
210 |
211 |
212 |
213 |
214 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/Hunter.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 | import me.zhyd.hunter.enums.ExitWayEnum;
5 | import me.zhyd.hunter.exception.HunterException;
6 | import org.apache.commons.lang3.StringUtils;
7 | import us.codecraft.webmagic.Request;
8 | import us.codecraft.webmagic.Spider;
9 | import us.codecraft.webmagic.processor.PageProcessor;
10 |
11 | import java.util.concurrent.ConcurrentHashMap;
12 |
13 | /**
14 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
15 | * @version 1.0
16 | * @since 1.8
17 | */
18 | public class Hunter extends Spider {
19 |
20 | /**
21 | * 用来保存正在运行的所有Spider,key要求唯一,一般为用户ID,需要调用方生成
22 | */
23 | public static final ConcurrentHashMap SPIDER_BUCKET = new ConcurrentHashMap<>();
24 |
25 | private HunterConfig config;
26 |
27 | /**
28 | * 唯一的key,一般为用户ID,需要调用方生成
29 | */
30 | private String hunterId;
31 | private volatile long startTime = 0L;
32 |
33 | private Hunter(PageProcessor pageProcessor, HunterConfig config, String hunterId) {
34 | super(pageProcessor);
35 | this.config = config;
36 | this.hunterId = hunterId;
37 | SPIDER_BUCKET.put(hunterId, this);
38 | }
39 |
40 | public static Hunter create(PageProcessor pageProcessor, HunterConfig config, String hunterId) {
41 | return new Hunter(pageProcessor, config, hunterId);
42 | }
43 |
44 | public static Hunter getHunter(String hunterId) {
45 | if (StringUtils.isEmpty(hunterId)) {
46 | throw new HunterException("HunterId:[" + hunterId + "]为空,请指定HunterId");
47 | }
48 | Hunter hunter = SPIDER_BUCKET.get(hunterId);
49 | if (null == hunter) {
50 | throw new HunterException("当前没有正在运行的爬虫!HunterId:[" + hunterId + "]");
51 | }
52 | return hunter;
53 | }
54 |
55 | @Override
56 | protected void onSuccess(Request request) {
57 | super.onSuccess(request);
58 | if (this.getStatus() == Status.Running && ExitWayEnum.DURATION.toString().equals(config.getExitWay())) {
59 | if (startTime < System.currentTimeMillis()) {
60 | this.stop();
61 | }
62 | }
63 | }
64 |
65 | @Override
66 | public void run() {
67 | if (ExitWayEnum.DURATION.toString().equals(config.getExitWay())) {
68 | startTime = System.currentTimeMillis() + config.getCount() * 1000;
69 | }
70 | super.run();
71 | }
72 |
73 | @Override
74 | protected void onError(Request request) {
75 | super.onError(request);
76 | }
77 |
78 | @Override
79 | public void close() {
80 | super.close();
81 | SPIDER_BUCKET.remove(this.hunterId);
82 | }
83 |
84 | @Override
85 | public void stop() {
86 | Spider.Status status = this.getStatus();
87 | if (status.equals(Spider.Status.Running)) {
88 | super.stop();
89 | SPIDER_BUCKET.remove(this.hunterId);
90 | } else if (status.equals(Spider.Status.Init)) {
91 | throw new HunterException("爬虫正在初始化!HunterId:[" + this.hunterId + "]");
92 | } else {
93 | throw new HunterException("当前没有正在运行的爬虫!HunterId:[" + this.hunterId + "]");
94 | }
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/HunterConfig.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config;
2 |
3 | import lombok.Data;
4 | import me.zhyd.hunter.entity.Cookie;
5 | import me.zhyd.hunter.enums.ExitWayEnum;
6 | import me.zhyd.hunter.enums.UserAgentEnum;
7 | import org.apache.commons.collections.CollectionUtils;
8 | import org.apache.commons.lang3.StringUtils;
9 | import us.codecraft.webmagic.proxy.Proxy;
10 |
11 | import javax.validation.constraints.Max;
12 | import javax.validation.constraints.Min;
13 | import javax.validation.constraints.NotNull;
14 | import java.util.*;
15 |
16 | /**
17 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
18 | * @version 1.0
19 | */
20 | @Data
21 | public class HunterConfig {
22 | /**
23 | * 是否抓取的单个文章
24 | */
25 | public boolean single;
26 | @NotNull(message = "必须指定标题抓取规则(xpath)")
27 | private String titleRegex;
28 | @NotNull(message = "必须指定内容抓取规则(xpath)")
29 | private String contentRegex;
30 | @NotNull(message = "必须指定发布日期抓取规则(xpath)")
31 | private String releaseDateRegex;
32 | @NotNull(message = "必须指定作者抓取规则(xpath)")
33 | private String authorRegex;
34 | @NotNull(message = "必须指定待抓取的url抓取规则(regex)")
35 | private String targetLinksRegex;
36 | private String tagRegex;
37 | private String keywordsRegex = "//meta[@name=keywords]/@content";
38 | private String descriptionRegex = "//meta[@name=description]/@content";
39 | @NotNull(message = "必须指定网站根域名")
40 | private String domain;
41 | private String charset = "utf8";
42 | /**
43 | * 每次爬取页面时的等待时间
44 | */
45 | @Max(value = 10000, message = "线程等待时间不可大于10000毫秒")
46 | @Min(value = 100, message = "线程等待时间不可小于100毫秒")
47 | private int sleepTime = 1000;
48 | /**
49 | * 抓取失败时重试的次数
50 | */
51 | @Max(value = 5, message = "抓取失败时最多只能重试5次")
52 | @Min(value = 1, message = "抓取失败时最少只能重试1次")
53 | private int retryTimes = 2;
54 | /**
55 | * 抓取失败时重试的次数用完后依然未抓取成功时,循环重试
56 | */
57 | @Max(value = 5, message = "最多支持5次失败循环重试")
58 | @Min(value = 1, message = "最少支持1次失败循环重试")
59 | private int cycleRetryTimes = 2;
60 | /**
61 | * 线程个数
62 | */
63 | @Max(value = 10, message = "最多只能开启10个线程(请谨慎使用)")
64 | @Min(value = 1, message = "至少要开启1个线程")
65 | private int threadCount = 1;
66 | /**
67 | * 抓取入口地址
68 | */
69 | // @NotNull(message = "必须指定待抓取的网址")
70 | private List entryUrls;
71 | /**
72 | * 退出方式{DURATION:爬虫持续的时间,URL_COUNT:抓取到的url数量}
73 | */
74 | private String exitWay = ExitWayEnum.URL_COUNT.toString();
75 | /**
76 | * 对应退出方式,当exitWay = URL_COUNT时,该值表示url数量,当exitWay = DURATION时,该值表示爬虫持续的时间
77 | */
78 | private int count;
79 | private List cookies = new ArrayList<>();
80 | private Map headers = new HashMap<>();
81 | private String ua = UserAgentEnum.getRandomUa();
82 | private String uid;
83 | private boolean onlyThisAuthor;
84 | /**
85 | * 保留字段,针对ajax渲染的页面,暂时不支持
86 | */
87 | private Boolean ajaxRequest = false;
88 | /**
89 | * 是否转存图片,当选择true时会自动过滤原文中的img链接,调用端可选择将图片下载后替换掉原来的图片
90 | */
91 | private boolean convertImg = false;
92 | private List proxyList = new ArrayList<>();
93 | /**
94 | * 是否开启自动代理,开启时将会自动获取代理ip
95 | */
96 | private ProxyType proxyType = ProxyType.CUSTOM;
97 |
98 | /**
99 | * 解析器配置,针对每个字段,都可以配置单独的解析器,参考{@link HunterResolver}
100 | */
101 | private HunterResolverConfig resolver = new HunterResolverConfig();
102 |
103 | public HunterConfig() {
104 | }
105 |
106 | public HunterConfig setUid(String uid) {
107 | this.uid = uid;
108 | return this;
109 | }
110 |
111 | public HunterConfig setOnlyThisAuthor(boolean onlyThisAuthor) {
112 | this.onlyThisAuthor = onlyThisAuthor;
113 | return this;
114 | }
115 |
116 | public HunterConfig setTitleRegex(String titleRegex) {
117 | this.titleRegex = titleRegex;
118 | return this;
119 | }
120 |
121 | public HunterConfig setContentRegex(String contentRegex) {
122 | this.contentRegex = contentRegex;
123 | return this;
124 | }
125 |
126 | public HunterConfig setReleaseDateRegex(String releaseDateRegex) {
127 | this.releaseDateRegex = releaseDateRegex;
128 | return this;
129 | }
130 |
131 | public HunterConfig setAuthorRegex(String authorRegex) {
132 | this.authorRegex = authorRegex;
133 | return this;
134 | }
135 |
136 | public HunterConfig setTargetLinksRegex(String targetLinksRegex) {
137 | this.targetLinksRegex = targetLinksRegex;
138 | return this;
139 | }
140 |
141 | public HunterConfig setTagRegex(String tagRegex) {
142 | this.tagRegex = tagRegex;
143 | return this;
144 | }
145 |
146 | public HunterConfig setKeywordsRegex(String keywordsRegex) {
147 | this.keywordsRegex = keywordsRegex;
148 | return this;
149 | }
150 |
151 | public HunterConfig setDescriptionRegex(String descriptionRegex) {
152 | this.descriptionRegex = descriptionRegex;
153 | return this;
154 | }
155 |
156 | public HunterConfig setDomain(String domain) {
157 | this.domain = domain;
158 | return this;
159 | }
160 |
161 | public HunterConfig setCharset(String charset) {
162 | this.charset = charset;
163 | return this;
164 | }
165 |
166 | public HunterConfig setSleepTime(int sleepTime) {
167 | this.sleepTime = sleepTime;
168 | return this;
169 | }
170 |
171 | public HunterConfig setRetryTimes(int retryTimes) {
172 | this.retryTimes = retryTimes;
173 | return this;
174 | }
175 |
176 | public HunterConfig setCycleRetryTimes(int cycleRetryTimes) {
177 | this.cycleRetryTimes = cycleRetryTimes;
178 | return this;
179 | }
180 |
181 | public HunterConfig setThreadCount(int threadCount) {
182 | this.threadCount = threadCount;
183 | return this;
184 | }
185 |
186 | public HunterConfig setEntryUrls(List entryUrls) {
187 | this.entryUrls = entryUrls;
188 | return this;
189 | }
190 |
191 | public HunterConfig setEntryUrls(String entryUrls) {
192 | if (StringUtils.isNotEmpty(entryUrls)) {
193 | if (entryUrls.startsWith("[")) {
194 | entryUrls = entryUrls.substring(1);
195 | }
196 | if (entryUrls.endsWith("]")) {
197 | entryUrls = entryUrls.substring(0, entryUrls.length() - 1);
198 | }
199 | List list = Arrays.asList(entryUrls.split("\r\n"));
200 | this.entryUrls = new LinkedList<>();
201 | this.entryUrls.addAll(list);
202 | }
203 | return this;
204 | }
205 |
206 | public HunterConfig addEntryUrl(String url) {
207 | if (CollectionUtils.isEmpty(this.entryUrls)) {
208 | this.entryUrls = new LinkedList<>();
209 | }
210 | this.entryUrls.add(url);
211 | return this;
212 | }
213 |
214 | public HunterConfig setExitWay(String exitWay) {
215 | this.exitWay = exitWay;
216 | return this;
217 | }
218 |
219 | public HunterConfig setExitWay(ExitWayEnum exitWay) {
220 | this.exitWay = exitWay.toString();
221 | this.count = exitWay.getDefaultCount();
222 | return this;
223 | }
224 |
225 | public HunterConfig setCount(int count) {
226 | this.count = count;
227 | return this;
228 | }
229 |
230 | public HunterConfig setHeader(String key, String value) {
231 | Map headers = this.getHeaders();
232 | headers.put(key, value);
233 | return this;
234 | }
235 |
236 | public HunterConfig setHeader(String headersStr) {
237 | if (StringUtils.isNotEmpty(headersStr)) {
238 | String[] headerArr = headersStr.split("\r\n");
239 | for (String s : headerArr) {
240 | String[] header = s.split("=");
241 | setHeader(header[0], header[1]);
242 | }
243 | }
244 | return this;
245 | }
246 |
247 | public HunterConfig setCookie(String domain, String key, String value) {
248 | List cookies = this.getCookies();
249 | cookies.add(new Cookie(domain, key, value));
250 | return this;
251 | }
252 |
253 | public HunterConfig setCookie(String cookiesStr) {
254 | if (StringUtils.isNotEmpty(cookiesStr)) {
255 | List cookies = this.getCookies();
256 | String[] cookieArr = cookiesStr.split(";");
257 | for (String aCookieArr : cookieArr) {
258 | String[] cookieNode = aCookieArr.split("=");
259 | if (cookieNode.length <= 1) {
260 | continue;
261 | }
262 | cookies.add(new Cookie(cookieNode[0].trim(), cookieNode[1].trim()));
263 | }
264 | }
265 | return this;
266 | }
267 |
268 | public HunterConfig setAjaxRequest(boolean ajaxRequest) {
269 | this.ajaxRequest = ajaxRequest;
270 | return this;
271 | }
272 |
273 | private void addProxy(Proxy proxy) {
274 | if (this.proxyType == ProxyType.CUSTOM || null == proxy) {
275 | return;
276 | }
277 | proxyList.add(proxy);
278 | }
279 |
280 | public HunterConfig setProxy(String proxyStr) {
281 | if (this.proxyType != ProxyType.CUSTOM || proxyStr == null) {
282 | return this;
283 | }
284 | String[] proxyArr = proxyStr.split("\r\n");
285 | for (String s : proxyArr) {
286 | String[] proxy = s.split("|");
287 | if (proxy.length == 2) {
288 | this.addProxy(new Proxy(proxy[0], Integer.parseInt(proxy[1])));
289 | } else if (proxy.length == 4) {
290 | this.addProxy(new Proxy(proxy[0], Integer.parseInt(proxy[1]), proxy[2], proxy[3]));
291 | }
292 | }
293 | return this;
294 | }
295 |
296 | public HunterConfig setConvertImg(boolean convertImg) {
297 | this.convertImg = convertImg;
298 | return this;
299 | }
300 |
301 | public HunterConfig setSingle(boolean single) {
302 | this.single = single;
303 | return this;
304 | }
305 |
306 | public HunterConfig setResolver(HunterResolverConfig resolver) {
307 | this.resolver = resolver;
308 | return this;
309 | }
310 |
311 | enum ProxyType {
312 | /**
313 | * 自动获取IP代理池
314 | */
315 | AUTO,
316 | /**
317 | * 自定义
318 | */
319 | CUSTOM,
320 | /**
321 | * 禁用代理
322 | */
323 | DISABLE
324 | }
325 | }
326 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/HunterConfigContext.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config;
2 |
3 | import com.alibaba.fastjson.JSONArray;
4 | import com.alibaba.fastjson.JSONObject;
5 | import me.zhyd.hunter.config.platform.InnerPlatform;
6 | import me.zhyd.hunter.config.platform.Platform;
7 | import me.zhyd.hunter.util.PlatformUtil;
8 | import org.apache.commons.collections.CollectionUtils;
9 | import org.apache.commons.collections.MapUtils;
10 | import org.apache.commons.lang3.StringUtils;
11 |
12 | import java.util.*;
13 |
14 | /**
15 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
16 | * @version 1.0
17 | * @since 1.8
18 | */
19 | public class HunterConfigContext {
20 |
21 | /**
22 | * 抓取单个文章时可用;
23 | *
24 | * @param url 待抓取的文章连接
25 | * @return HunterConfig
26 | */
27 | public static HunterConfig getHunterConfig(String url) {
28 | InnerPlatform platform = PlatformUtil.getPlarform(url);
29 | return platform.process(url);
30 | }
31 |
32 | /**
33 | * 抓取单个文章时可用;
34 | *
35 | * @param platform 博客平台
36 | * @return HunterConfig
37 | */
38 | public static HunterConfig getHunterConfig(Platform platform) {
39 | String platformConfig = HunterConfigTemplate.getConfig(platform.getPlatform());
40 | JSONObject platformObj = JSONObject.parseObject(platformConfig);
41 | String br = "\r\n";
42 | Set> entries = platformObj.entrySet();
43 | for (Map.Entry entry : entries) {
44 | if ("header".equals(entry.getKey())) {
45 | List headers = JSONArray.parseArray(String.valueOf(entry.getValue()), String.class);
46 | entry.setValue(String.join(br, headers));
47 | }
48 | }
49 | return JSONObject.toJavaObject(platformObj, HunterConfig.class);
50 | }
51 |
52 | /**
53 | * 重新解析配置模板, 将用户id替换为真实的id
54 | *
55 | * @param config config
56 | * @return config
57 | */
58 | public static HunterConfig parseConfig(HunterConfig config) {
59 | if (null == config) {
60 | return null;
61 | }
62 | String uid = config.getUid();
63 | if (StringUtils.isEmpty(uid)) {
64 | return config;
65 | }
66 | String domain = config.getDomain();
67 | if (StringUtils.isNotEmpty(domain)) {
68 | config.setDomain(domain.replace("{uid}", uid));
69 | }
70 | String targetLinksRegex = config.getTargetLinksRegex();
71 | if (StringUtils.isNotEmpty(targetLinksRegex)) {
72 | config.setTargetLinksRegex(targetLinksRegex.replace("{uid}", uid));
73 | }
74 | List entryUrls = config.getEntryUrls();
75 | if (CollectionUtils.isNotEmpty(entryUrls)) {
76 | List newEntryUrls = new ArrayList<>();
77 | for (String entryUrl : entryUrls) {
78 | newEntryUrls.add(entryUrl.replace("{uid}", uid));
79 | }
80 | config.setEntryUrls(newEntryUrls);
81 | }
82 | Map header = config.getHeaders();
83 | if (MapUtils.isNotEmpty(header)) {
84 | Set> entries = header.entrySet();
85 | for (Map.Entry entry : entries) {
86 | String key = entry.getKey();
87 | String value = entry.getValue();
88 | header.put(key, value.replace("{uid}", uid));
89 | }
90 | }
91 | return config;
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/HunterConfigTemplate.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config;
2 |
3 | import cn.hutool.core.io.IoUtil;
4 | import com.alibaba.fastjson.JSONObject;
5 | import me.zhyd.hunter.consts.HunterConsts;
6 | import me.zhyd.hunter.exception.HunterException;
7 | import org.apache.commons.lang3.StringUtils;
8 |
9 | import java.io.InputStream;
10 | import java.nio.charset.Charset;
11 |
12 | /**
13 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
14 | * @version 1.0
15 | * @since 1.8
16 | */
17 | public class HunterConfigTemplate {
18 |
19 | public static JSONObject configTemplate;
20 |
21 | static {
22 | HunterConfigTemplate configTemplate = new HunterConfigTemplate();
23 | configTemplate.init();
24 | }
25 |
26 | public static String getConfig(String platform) {
27 | if (configTemplate.containsKey(platform)) {
28 | return configTemplate.getString(platform);
29 | }
30 | throw new HunterException("暂不支持该平台[" + platform + "]");
31 | }
32 |
33 | private void init() {
34 | String configFileName = HunterConsts.CONFIG_FILE_NAME;
35 | String config = null;
36 | try {
37 | InputStream inputStream = this.getClass().getResourceAsStream(configFileName);
38 | if (null == inputStream) {
39 | throw new HunterException("请检查`src/main/resources`下是否存在" + configFileName);
40 | }
41 | config = IoUtil.read(inputStream, Charset.forName("UTF-8"));
42 | if (StringUtils.isEmpty(config)) {
43 | throw new HunterException("HunterConfig内容为空:" + configFileName);
44 | }
45 | } catch (Exception e) {
46 | e.printStackTrace();
47 | }
48 |
49 | try {
50 | configTemplate = JSONObject.parseObject(config);
51 | } catch (Exception e) {
52 | throw new HunterException("HunterConfig配置文件格式错误");
53 | }
54 |
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/HunterDateDeserializer.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config;
2 |
3 | import com.alibaba.fastjson.JSONObject;
4 | import com.alibaba.fastjson.parser.DefaultJSONParser;
5 | import com.alibaba.fastjson.parser.deserializer.ObjectDeserializer;
6 | import me.zhyd.hunter.util.DateUtil;
7 |
8 | import java.lang.reflect.Type;
9 |
10 | /**
11 | * fastjson 的日期反序列化组件,适配大部分日期格式
12 | *
13 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
14 | * @version 1.0
15 | * @since 1.8
16 | */
17 | public class HunterDateDeserializer implements ObjectDeserializer {
18 | public static final HunterDateDeserializer instance = new HunterDateDeserializer();
19 |
20 | public HunterDateDeserializer() {
21 | }
22 |
23 | protected T cast(DefaultJSONParser parser, Type clazz, Object fieldName, Object val) {
24 | return (T) DateUtil.parse(val);
25 | }
26 |
27 | @Override
28 | public T deserialze(DefaultJSONParser defaultJSONParser, Type type, Object o) {
29 | JSONObject object = JSONObject.parseObject(defaultJSONParser.getInput());
30 | if (null != o && o.equals("releaseDate")) {
31 | return (T) DateUtil.parse(object.get(o));
32 | }
33 | return (T) object.get(o);
34 | }
35 |
36 | @Override
37 | public int getFastMatchToken() {
38 | return 0;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/HunterResolver.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config;
2 |
3 | import lombok.Getter;
4 | import lombok.Setter;
5 | import org.apache.commons.lang3.StringUtils;
6 |
7 | import java.util.HashMap;
8 | import java.util.Map;
9 |
10 | /**
11 | * 解析器,针对每个平台,可以单独定制解析器,因为部分平台的部分内容,不是常规的html结构,可能为html中嵌套json结构
12 | *
13 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
14 | * @version 1.0.0
15 | * @since 1.0.0
16 | */
17 | @Getter
18 | @Setter
19 | public class HunterResolver {
20 | /**
21 | * 解析器类型,可选:regex、xpath,默认为xpath,并且当type不等于regex时,直接取xpath
22 | */
23 | private String type;
24 | /**
25 | * 需要处理的字段类类型,一般为数字类型,比如java.lang.Long、java.lang.Integer、java.lang.Float、java.lang.Double
26 | */
27 | private String clazz;
28 | /**
29 | * 操作符,如果type=regex并且clazz=数字类型,则按照operator进行计算
30 | */
31 | private String operator;
32 |
33 | /**
34 | * 解析operator,转换为 计算符 和 数字
35 | *
36 | * @return map
37 | */
38 | public Map getOperatorMap() {
39 | String operator = this.operator;
40 | if (StringUtils.isEmpty(operator)) {
41 | return null;
42 | }
43 | String[] operatorArr = operator.split(" ");
44 | if (operatorArr.length < 2) {
45 | return null;
46 | }
47 | Map res = new HashMap<>();
48 | res.put("operator", operatorArr[0]);
49 | res.put("num", operatorArr[1]);
50 | return res;
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/HunterResolverConfig.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config;
2 |
3 | import com.alibaba.fastjson.JSON;
4 | import com.alibaba.fastjson.JSONObject;
5 | import lombok.Getter;
6 | import lombok.Setter;
7 |
8 | import java.util.HashMap;
9 | import java.util.Map;
10 | import java.util.Set;
11 |
12 | /**
13 | * 解析器配置,针对每个字段,都可以配置单独的解析器,参考{@link HunterResolver}
14 | *
15 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
16 | * @version 1.0.0
17 | * @since 1.0.0
18 | */
19 | @Getter
20 | @Setter
21 | public class HunterResolverConfig {
22 | private HunterResolver title;
23 | private HunterResolver content;
24 | private HunterResolver releaseDate;
25 | private HunterResolver author;
26 | private HunterResolver targetLinks;
27 | private HunterResolver tag;
28 | private HunterResolver keywords;
29 | private HunterResolver description;
30 |
31 | /**
32 | * 将HunterResolverConfig转换为map,方便根据字段名进行操作
33 | *
34 | * @return map
35 | */
36 | public Map toMap() {
37 | Map map = JSON.parseObject(JSON.toJSONString(this), Map.class);
38 | Set> entries = map.entrySet();
39 | Map res = new HashMap<>();
40 | HunterResolver resolver = null;
41 | for (Map.Entry entry : entries) {
42 | if (null != entry.getValue()) {
43 | resolver = entry.getValue().toJavaObject(HunterResolver.class);
44 | }
45 | res.put(entry.getKey(), resolver);
46 | }
47 | return res;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/BasePlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import com.alibaba.fastjson.JSONObject;
4 | import me.zhyd.hunter.config.HunterConfig;
5 | import me.zhyd.hunter.config.HunterConfigTemplate;
6 | import me.zhyd.hunter.util.PlatformUtil;
7 |
8 | import java.util.Collections;
9 | import java.util.Map;
10 | import java.util.Set;
11 |
12 | /**
13 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
14 | * @version 1.0
15 | * @since 1.8
16 | */
17 | public abstract class BasePlatform implements InnerPlatform {
18 | String platform;
19 |
20 | public BasePlatform(String platform) {
21 | this.platform = platform;
22 | }
23 |
24 | protected final HunterConfig get(String url) {
25 |
26 | String host = PlatformUtil.getHost(url);
27 | String domain = PlatformUtil.getDomain(url);
28 |
29 | String platformConfig = HunterConfigTemplate.getConfig(platform);
30 | JSONObject platformObj = JSONObject.parseObject(platformConfig);
31 | String br = "\r\n", header = null;
32 | Set> entries = platformObj.entrySet();
33 | for (Map.Entry entry : entries) {
34 | if ("header".equals(entry.getKey())) {
35 | header = "Host=" + host + br + "Referer=" + domain;
36 | entry.setValue(header);
37 | } else if ("entryUrls".equals(entry.getKey())) {
38 | entry.setValue(Collections.singletonList(url));
39 | } else {
40 | if (platform.equals(Platform.ITEYE.getPlatform()) && "domain".equals(entry.getKey())) {
41 | entry.setValue(host);
42 | }
43 | }
44 | }
45 | HunterConfig config = JSONObject.toJavaObject(platformObj, HunterConfig.class);
46 | config.setSingle(true);
47 | return config;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/CnblogsPlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public class CnblogsPlatform extends BasePlatform {
11 |
12 | public CnblogsPlatform() {
13 | super(Platform.CNBLOGS.getPlatform());
14 | }
15 |
16 | @Override
17 | public HunterConfig process(String url) {
18 | return this.get(url);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/CsdnPlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public class CsdnPlatform extends BasePlatform {
11 |
12 | public CsdnPlatform() {
13 | super(Platform.CSDN.getPlatform());
14 | }
15 |
16 | @Override
17 | public HunterConfig process(String url) {
18 | return this.get(url);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/ImoocPlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public class ImoocPlatform extends BasePlatform {
11 |
12 | public ImoocPlatform() {
13 | super(Platform.IMOOC.getPlatform());
14 | }
15 |
16 | @Override
17 | public HunterConfig process(String url) {
18 | return this.get(url);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/InnerPlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
/**
 * Contract implemented by each supported platform: turns a URL into a {@link HunterConfig}.
 *
 * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
 * @version 1.0
 * @since 1.8
 */
public interface InnerPlatform {

    /**
     * Builds the configuration for the given URL.
     *
     * @param url URL to build a configuration for
     * @return the resulting configuration
     */
    HunterConfig process(String url);
}
14 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/IteyePlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public class IteyePlatform extends BasePlatform {
11 |
12 | public IteyePlatform() {
13 | super(Platform.ITEYE.getPlatform());
14 | }
15 |
16 | @Override
17 | public HunterConfig process(String url) {
18 | HunterConfig config = this.get(url);
19 | String domain = config.getDomain();
20 | String uid = domain.split("\\.")[0];
21 | config.setUid(uid);
22 | return config;
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/JianshuPlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public class JianshuPlatform extends BasePlatform {
11 |
12 | public JianshuPlatform() {
13 | super(Platform.JIANSHU.getPlatform());
14 | }
15 |
16 | @Override
17 | public HunterConfig process(String url) {
18 | return this.get(url);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/JuejinPlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public class JuejinPlatform extends BasePlatform {
11 |
12 | public JuejinPlatform() {
13 | super(Platform.JUEJIN.getPlatform());
14 | }
15 |
16 | @Override
17 | public HunterConfig process(String url) {
18 | return this.get(url);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/OschinaPlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
5 | /**
6 | * @author huht
7 | * @version 1.01
8 | * @since 1.8
9 | */
10 | public class OschinaPlatform extends BasePlatform {
11 |
12 | public OschinaPlatform() {
13 | super(Platform.OSCHINA.getPlatform());
14 | }
15 |
16 | @Override
17 | public HunterConfig process(String url) {
18 | return this.get(url);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/Platform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.util.PlatformUtil;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public enum Platform {
11 | CSDN("csdn", "csdn.net", CsdnPlatform.class),
12 | ITEYE("iteye", "iteye.com", IteyePlatform.class),
13 | IMOOC("imooc", "imooc.com", ImoocPlatform.class),
14 | CNBLOGS("cnblogs", "cnblogs.com", CnblogsPlatform.class),
15 | JUEJIN("juejin", "juejin.im", JuejinPlatform.class),
16 | V2EX("v2ex", "v2ex.com", V2exPlatform.class),
17 | OSCHINA("oschina", "oschina.net", OschinaPlatform.class),
18 | JIANSHU("jianshu","jianshu.com",JianshuPlatform.class)
19 | ;
20 |
21 | private String platform;
22 | private String host;
23 | private Class clazz;
24 |
25 | Platform(String platform, String host, Class clazz) {
26 | this.platform = platform;
27 | this.host = host;
28 | this.clazz = clazz;
29 | }
30 |
31 | public static Platform getPlatformByUrl(String url) {
32 | if (null == url) {
33 | return null;
34 | }
35 | String host = PlatformUtil.getHost(url);
36 | if (host == null) {
37 | return null;
38 | }
39 | for (Platform value : Platform.values()) {
40 | if (host.contains(value.getHost())) {
41 | return value;
42 | }
43 | }
44 | return null;
45 | }
46 |
47 | public String getPlatform() {
48 | return platform;
49 | }
50 |
51 | public String getHost() {
52 | return host;
53 | }
54 |
55 | public Class getClazz() {
56 | return clazz;
57 | }
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/config/platform/V2exPlatform.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.config.platform;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public class V2exPlatform extends BasePlatform {
11 |
12 | public V2exPlatform() {
13 | super(Platform.V2EX.getPlatform());
14 | }
15 |
16 | @Override
17 | public HunterConfig process(String url) {
18 | return this.get(url);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/consts/HunterConsts.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.consts;
2 |
/**
 * Shared string constants.
 *
 * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
 * @version 1.0
 * @since 1.8
 */
public class HunterConsts {

    /**
     * Prefix text; presumably prepended to log messages (usages are not visible here).
     */
    public static final String LOG_PREFIX = "[ hunter ] ";

    /**
     * Classpath resource name of the configuration template, as loaded by HunterConfigTemplate.
     */
    public static final String CONFIG_FILE_NAME = "/HunterConfig.json";
}
14 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/downloader/HttpClientDownloader.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.downloader;
2 |
3 | import org.apache.commons.io.IOUtils;
4 | import org.apache.http.HttpResponse;
5 | import org.apache.http.client.methods.CloseableHttpResponse;
6 | import org.apache.http.impl.client.CloseableHttpClient;
7 | import org.apache.http.util.EntityUtils;
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 | import us.codecraft.webmagic.Page;
11 | import us.codecraft.webmagic.Request;
12 | import us.codecraft.webmagic.Site;
13 | import us.codecraft.webmagic.Task;
14 | import us.codecraft.webmagic.downloader.AbstractDownloader;
15 | import us.codecraft.webmagic.downloader.HttpClientRequestContext;
16 | import us.codecraft.webmagic.downloader.HttpUriRequestConverter;
17 | import us.codecraft.webmagic.proxy.Proxy;
18 | import us.codecraft.webmagic.proxy.ProxyProvider;
19 | import us.codecraft.webmagic.selector.PlainText;
20 | import us.codecraft.webmagic.utils.CharsetUtils;
21 | import us.codecraft.webmagic.utils.HttpClientUtils;
22 |
23 | import java.io.IOException;
24 | import java.nio.charset.Charset;
25 | import java.util.HashMap;
26 | import java.util.Map;
27 |
28 |
29 | /**
30 | * The http downloader based on HttpClient.
31 | *
32 | * @author code4crafter@gmail.com
33 | * @since 0.1.0
34 | */
35 | public class HttpClientDownloader extends AbstractDownloader {
36 |
37 | private Logger logger = LoggerFactory.getLogger(getClass());
38 |
39 | private final Map httpClients = new HashMap();
40 |
41 | private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
42 |
43 | private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
44 |
45 | private ProxyProvider proxyProvider;
46 |
47 | private boolean responseHeader = true;
48 |
49 | public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
50 | this.httpUriRequestConverter = httpUriRequestConverter;
51 | }
52 |
53 | public void setProxyProvider(ProxyProvider proxyProvider) {
54 | this.proxyProvider = proxyProvider;
55 | }
56 |
57 | private CloseableHttpClient getHttpClient(Site site) {
58 | if (site == null) {
59 | return httpClientGenerator.getClient(null);
60 | }
61 | String domain = site.getDomain();
62 | CloseableHttpClient httpClient = httpClients.get(domain);
63 | if (httpClient == null) {
64 | synchronized (this) {
65 | httpClient = httpClients.get(domain);
66 | if (httpClient == null) {
67 | httpClient = httpClientGenerator.getClient(site);
68 | httpClients.put(domain, httpClient);
69 | }
70 | }
71 | }
72 | return httpClient;
73 | }
74 |
75 | @Override
76 | public Page download(Request request, Task task) {
77 | if (task == null || task.getSite() == null) {
78 | throw new NullPointerException("task or site can not be null");
79 | }
80 | CloseableHttpResponse httpResponse = null;
81 | CloseableHttpClient httpClient = getHttpClient(task.getSite());
82 | Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
83 | HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
84 | Page page = Page.fail();
85 | try {
86 | httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
87 | page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
88 | onSuccess(request);
89 | logger.debug("downloading page success {}", request.getUrl());
90 | return page;
91 | } catch (IOException e) {
92 | logger.warn("download page {} error", request.getUrl(), e);
93 | onError(request);
94 | return page;
95 | } finally {
96 | if (httpResponse != null) {
97 | //ensure the connection is released back to pool
98 | EntityUtils.consumeQuietly(httpResponse.getEntity());
99 | }
100 | if (proxyProvider != null && proxy != null) {
101 | proxyProvider.returnProxy(proxy, page, task);
102 | }
103 | }
104 | }
105 |
106 | @Override
107 | public void setThread(int thread) {
108 | httpClientGenerator.setPoolSize(thread);
109 | }
110 |
111 | protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
112 | byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
113 | String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
114 | Page page = new Page();
115 | page.setBytes(bytes);
116 | if (!request.isBinaryContent()){
117 | if (charset == null) {
118 | charset = getHtmlCharset(contentType, bytes);
119 | }
120 | page.setCharset(charset);
121 | page.setRawText(new String(bytes, charset));
122 | }
123 | page.setUrl(new PlainText(request.getUrl()));
124 | page.setRequest(request);
125 | page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
126 | page.setDownloadSuccess(true);
127 | if (responseHeader) {
128 | page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
129 | }
130 | return page;
131 | }
132 |
133 | private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
134 | String charset = CharsetUtils.detectCharset(contentType, contentBytes);
135 | if (charset == null) {
136 | charset = Charset.defaultCharset().name();
137 | logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
138 | }
139 | return charset;
140 | }
141 | }
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/downloader/HttpClientGenerator.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.downloader;
2 |
3 | import org.apache.http.HttpException;
4 | import org.apache.http.HttpRequest;
5 | import org.apache.http.HttpRequestInterceptor;
6 | import org.apache.http.client.CookieStore;
7 | import org.apache.http.config.Registry;
8 | import org.apache.http.config.RegistryBuilder;
9 | import org.apache.http.config.SocketConfig;
10 | import org.apache.http.conn.socket.ConnectionSocketFactory;
11 | import org.apache.http.conn.socket.PlainConnectionSocketFactory;
12 | import org.apache.http.conn.ssl.DefaultHostnameVerifier;
13 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
14 | import org.apache.http.impl.client.*;
15 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
16 | import org.apache.http.impl.cookie.BasicClientCookie;
17 | import org.apache.http.protocol.HttpContext;
18 | import org.slf4j.Logger;
19 | import org.slf4j.LoggerFactory;
20 | import us.codecraft.webmagic.Site;
21 | import us.codecraft.webmagic.downloader.CustomRedirectStrategy;
22 |
23 | import javax.net.ssl.SSLContext;
24 | import javax.net.ssl.TrustManager;
25 | import javax.net.ssl.X509TrustManager;
26 | import java.io.IOException;
27 | import java.security.KeyManagementException;
28 | import java.security.NoSuchAlgorithmException;
29 | import java.security.cert.CertificateException;
30 | import java.security.cert.X509Certificate;
31 | import java.util.Map;
32 |
33 | /**
34 | * @author code4crafter@gmail.com
35 | * @since 0.4.0
36 | */
37 | public class HttpClientGenerator {
38 |
39 | private transient Logger logger = LoggerFactory.getLogger(getClass());
40 |
41 | private PoolingHttpClientConnectionManager connectionManager;
42 |
43 | public HttpClientGenerator() {
44 | Registry reg = RegistryBuilder.create()
45 | .register("http", PlainConnectionSocketFactory.INSTANCE)
46 | .register("https", buildSSLConnectionSocketFactory())
47 | .build();
48 | connectionManager = new PoolingHttpClientConnectionManager(reg);
49 | connectionManager.setDefaultMaxPerRoute(100);
50 | }
51 |
52 | private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
53 | try {
54 | return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, null, new DefaultHostnameVerifier()); // 优先绕过安全证书
55 | } catch (KeyManagementException e) {
56 | logger.error("ssl connection fail", e);
57 | } catch (NoSuchAlgorithmException e) {
58 | logger.error("ssl connection fail", e);
59 | }
60 | return SSLConnectionSocketFactory.getSocketFactory();
61 | }
62 |
63 | private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
64 | // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
65 | X509TrustManager trustManager = new X509TrustManager() {
66 |
67 | @Override
68 | public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
69 | }
70 |
71 | @Override
72 | public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
73 | }
74 |
75 | @Override
76 | public X509Certificate[] getAcceptedIssuers() {
77 | return null;
78 | }
79 |
80 | };
81 |
82 | SSLContext sc = SSLContext.getInstance("SSLv3");
83 | sc.init(null, new TrustManager[]{trustManager}, null);
84 | return sc;
85 | }
86 |
87 | public HttpClientGenerator setPoolSize(int poolSize) {
88 | connectionManager.setMaxTotal(poolSize);
89 | return this;
90 | }
91 |
92 | public CloseableHttpClient getClient(Site site) {
93 | return generateClient(site);
94 | }
95 |
96 | private CloseableHttpClient generateClient(Site site) {
97 | HttpClientBuilder httpClientBuilder = HttpClients.custom();
98 |
99 | httpClientBuilder.setConnectionManager(connectionManager);
100 | if (site.getUserAgent() != null) {
101 | httpClientBuilder.setUserAgent(site.getUserAgent());
102 | } else {
103 | httpClientBuilder.setUserAgent("");
104 | }
105 | if (site.isUseGzip()) {
106 | httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
107 |
108 | public void process(
109 | final HttpRequest request,
110 | final HttpContext context) throws HttpException, IOException {
111 | if (!request.containsHeader("Accept-Encoding")) {
112 | request.addHeader("Accept-Encoding", "gzip");
113 | }
114 | }
115 | });
116 | }
117 | //解决post/redirect/post 302跳转问题
118 | httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());
119 |
120 | SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
121 | socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
122 | socketConfigBuilder.setSoTimeout(site.getTimeOut());
123 | SocketConfig socketConfig = socketConfigBuilder.build();
124 | httpClientBuilder.setDefaultSocketConfig(socketConfig);
125 | connectionManager.setDefaultSocketConfig(socketConfig);
126 | httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
127 | generateCookie(httpClientBuilder, site);
128 | return httpClientBuilder.build();
129 | }
130 |
131 | private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
132 | if (site.isDisableCookieManagement()) {
133 | httpClientBuilder.disableCookieManagement();
134 | return;
135 | }
136 | CookieStore cookieStore = new BasicCookieStore();
137 | for (Map.Entry cookieEntry : site.getCookies().entrySet()) {
138 | BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
139 | cookie.setDomain(site.getDomain());
140 | cookieStore.addCookie(cookie);
141 | }
142 | for (Map.Entry> domainEntry : site.getAllCookies().entrySet()) {
143 | for (Map.Entry cookieEntry : domainEntry.getValue().entrySet()) {
144 | BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
145 | cookie.setDomain(domainEntry.getKey());
146 | cookieStore.addCookie(cookie);
147 | }
148 | }
149 | httpClientBuilder.setDefaultCookieStore(cookieStore);
150 | }
151 | }
152 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/entity/Cookie.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.entity;
2 |
3 | import lombok.Data;
4 | import lombok.EqualsAndHashCode;
5 |
6 | /**
7 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
8 | * @version 1.0
9 | */
10 | @Data
11 | @EqualsAndHashCode(callSuper = false)
12 | public class Cookie {
13 |
14 | String domain;
15 | String name;
16 | String value;
17 |
18 | public Cookie(String domain, String name, String value) {
19 | this.domain = domain;
20 | this.name = name;
21 | this.value = value;
22 | }
23 |
24 | public Cookie(String name, String value) {
25 | this.name = name;
26 | this.value = value;
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/entity/ImageLink.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.entity;
2 |
3 | import lombok.Data;
4 |
/**
 * Holder for the source link of an image.
 *
 * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
 * @version 1.0
 * @since 1.8
 */
@Data
public class ImageLink {

    /**
     * Source link of the image.
     */
    private String srcLink;

    /**
     * @param srcLink source link of the image
     */
    public ImageLink(String srcLink) {
        this.srcLink = srcLink;
    }
}
19 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/entity/VirtualArticle.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.entity;
2 |
3 | import lombok.Data;
4 | import lombok.EqualsAndHashCode;
5 |
6 | import java.util.Date;
7 | import java.util.List;
8 | import java.util.Set;
9 |
10 | /**
11 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
12 | * @version 1.0
13 | */
14 | @Data
15 | @EqualsAndHashCode(callSuper = false)
16 | public class VirtualArticle {
17 |
18 | private String title;
19 | private String content;
20 | private String author;
21 | private Date releaseDate;
22 | private String source;
23 | private String description;
24 | private String keywords;
25 | private List tags;
26 |
27 | private Set imageLinks;
28 |
29 | public VirtualArticle setTitle(String title) {
30 | this.title = title;
31 | return this;
32 | }
33 |
34 | public VirtualArticle setContent(String content) {
35 | this.content = content;
36 | return this;
37 | }
38 |
39 | public VirtualArticle setAuthor(String author) {
40 | this.author = author;
41 | return this;
42 | }
43 |
44 | public VirtualArticle setReleaseDate(Date releaseDate) {
45 | this.releaseDate = releaseDate;
46 | return this;
47 | }
48 |
49 | public VirtualArticle setSource(String source) {
50 | this.source = source;
51 | return this;
52 | }
53 |
54 | public VirtualArticle setTags(List tags) {
55 | this.tags = tags;
56 | return this;
57 | }
58 |
59 | public VirtualArticle setDescription(String description) {
60 | this.description = description;
61 | return this;
62 | }
63 |
64 | public VirtualArticle setKeywords(String keywords) {
65 | this.keywords = keywords;
66 | return this;
67 | }
68 |
69 | public VirtualArticle setImageLinks(Set imageLinks) {
70 | this.imageLinks = imageLinks;
71 | return this;
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/enums/ExitWayEnum.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.enums;
2 |
3 | /**
4 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
5 | * @version 1.0
6 | */
7 | public enum ExitWayEnum {
8 | /**
9 | * 默认方式,直到将所有匹配到的url抓取完成才会退出
10 | */
11 | DEFAULT("默认", 0),
12 | /*
13 | * 持续时间
14 | */
15 | DURATION("持续时间(s)", 60),
16 | /**
17 | * 抓取的条数
18 | */
19 | URL_COUNT("链接条数", 10);
20 |
21 | private String desc;
22 | private int defaultCount;
23 |
24 | ExitWayEnum(String desc, int defaultCount) {
25 | this.desc = desc;
26 | this.defaultCount = defaultCount;
27 | }
28 |
29 | public String getDesc() {
30 | return desc;
31 | }
32 |
33 | public int getDefaultCount() {
34 | return defaultCount;
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/enums/UserAgentEnum.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.enums;
2 |
3 | import cn.hutool.core.util.RandomUtil;
4 |
/**
 * Browser User-Agent strings the crawler can present.
 *
 * <p>More UA strings: http://www.useragentstring.com/pages/useragentstring.php
 *
 * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
 * @version 1.0
 * @since 1.8
 */
public enum UserAgentEnum {

    PC("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36"),
    PC_WIN10("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36"),
    PC_FIREFOX64_WIN("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0"),
    PC_FIREFOX64_LINUX("Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0");

    // Enum constants are shared singletons, so their state is immutable.
    private final String ua;

    UserAgentEnum(String ua) {
        this.ua = ua;
    }

    /**
     * Picks one of the declared User-Agent strings uniformly at random.
     *
     * @return the UA string of a randomly chosen constant
     */
    public static String getRandomUa() {
        UserAgentEnum[] candidates = UserAgentEnum.values();
        // nextInt(bound) yields 0 (inclusive) to bound (exclusive), i.e. a valid index.
        int index = java.util.concurrent.ThreadLocalRandom.current().nextInt(candidates.length);
        return candidates[index].getUa();
    }

    /**
     * @return the User-Agent header value of this constant
     */
    public String getUa() {
        return ua;
    }
}
34 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/exception/HunterException.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.exception;
2 |
3 | import me.zhyd.hunter.consts.HunterConsts;
4 |
5 | /**
6 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
7 | * @version 1.0
8 | * @since 1.8
9 | */
10 | public class HunterException extends RuntimeException {
11 |
12 | public HunterException(String message) {
13 | super(HunterConsts.LOG_PREFIX + message);
14 | }
15 |
16 | public HunterException(String message, Throwable cause) {
17 | super(HunterConsts.LOG_PREFIX + message, cause);
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/processor/BlogHunterProcessor.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.processor;
2 |
3 | import me.zhyd.hunter.Hunter;
4 | import me.zhyd.hunter.config.HunterConfig;
5 | import me.zhyd.hunter.entity.VirtualArticle;
6 | import me.zhyd.hunter.scheduler.BlockingQueueScheduler;
7 | import me.zhyd.hunter.util.HunterPrintWriter;
8 | import me.zhyd.hunter.downloader.HttpClientDownloader;
9 | import org.apache.commons.collections.CollectionUtils;
10 | import us.codecraft.webmagic.proxy.Proxy;
11 | import us.codecraft.webmagic.proxy.SimpleProxyProvider;
12 |
13 | import java.util.List;
14 | import java.util.concurrent.CopyOnWriteArrayList;
15 |
16 | /**
17 | * 爬虫入口
18 | *
19 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
20 | * @version 1.0
21 | */
22 | public class BlogHunterProcessor extends HunterProcessor {
23 |
24 | public BlogHunterProcessor(String url, boolean convertImage) {
25 | super(url, convertImage);
26 | }
27 |
28 | public BlogHunterProcessor(String url, boolean convertImage, HunterPrintWriter writer) {
29 | super(url, convertImage, writer);
30 | }
31 |
32 | public BlogHunterProcessor(HunterConfig config) {
33 | super(config);
34 | }
35 |
36 | public BlogHunterProcessor(HunterConfig config, String uuid) {
37 | super(config, uuid);
38 | }
39 |
40 | /**
41 | * @param config Hunter Config
42 | * @param writer
43 | * @param uuid
44 | */
45 | public BlogHunterProcessor(HunterConfig config, HunterPrintWriter writer, String uuid) {
46 | super(config, writer, uuid);
47 | }
48 |
49 | /**
50 | * 运行爬虫并返回结果
51 | *
52 | * @return
53 | */
54 | @Override
55 | public CopyOnWriteArrayList execute() {
56 | List errors = this.validateModel(config);
57 | if (CollectionUtils.isNotEmpty(errors)) {
58 | writer.print("校验不通过!请依据下方提示,检查输入参数是否正确......");
59 | for (String error : errors) {
60 | writer.print(">> " + error);
61 | }
62 | return null;
63 | }
64 |
65 | CopyOnWriteArrayList virtualArticles = new CopyOnWriteArrayList<>();
66 | Hunter spider = Hunter.create(this, config, uuid);
67 |
68 | spider.addUrl(config.getEntryUrls().toArray(new String[0]))
69 | .setScheduler(new BlockingQueueScheduler(config))
70 | .addPipeline((resultItems, task) -> this.process(resultItems, virtualArticles, spider))
71 | .setDownloader(new HttpClientDownloader())
72 | .thread(config.getThreadCount());
73 |
74 | //设置抓取代理IP
75 | if (!CollectionUtils.isEmpty(config.getProxyList())) {
76 | HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
77 | SimpleProxyProvider provider = SimpleProxyProvider.from(config.getProxyList().toArray(new Proxy[0]));
78 | httpClientDownloader.setProxyProvider(provider);
79 | spider.setDownloader(httpClientDownloader);
80 | }
81 | // 测试代理
82 | /*HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
83 | SimpleProxyProvider provider = SimpleProxyProvider.from(
84 | new Proxy("61.135.217.7", 80)
85 | );
86 | httpClientDownloader.setProxyProvider(provider);
87 | spider.setDownloader(httpClientDownloader);*/
88 |
89 | // 启动爬虫
90 | spider.run();
91 | return virtualArticles;
92 | }
93 |
94 |
95 | }
96 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/processor/HunterProcessor.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.processor;
2 |
3 | import cn.hutool.core.collection.CollectionUtil;
4 | import com.alibaba.fastjson.JSON;
5 | import com.alibaba.fastjson.parser.ParserConfig;
6 | import lombok.extern.slf4j.Slf4j;
7 | import me.zhyd.hunter.Hunter;
8 | import me.zhyd.hunter.config.HunterConfig;
9 | import me.zhyd.hunter.config.HunterConfigContext;
10 | import me.zhyd.hunter.config.HunterDateDeserializer;
11 | import me.zhyd.hunter.entity.Cookie;
12 | import me.zhyd.hunter.entity.VirtualArticle;
13 | import me.zhyd.hunter.resolver.HtmlResolver;
14 | import me.zhyd.hunter.resolver.JsonResolver;
15 | import me.zhyd.hunter.resolver.Resolver;
16 | import me.zhyd.hunter.util.CommonUtil;
17 | import me.zhyd.hunter.util.HunterPrintWriter;
18 | import org.apache.commons.collections.CollectionUtils;
19 | import org.apache.commons.collections.MapUtils;
20 | import org.apache.commons.lang3.StringUtils;
21 | import us.codecraft.webmagic.Page;
22 | import us.codecraft.webmagic.ResultItems;
23 | import us.codecraft.webmagic.Site;
24 | import us.codecraft.webmagic.processor.PageProcessor;
25 |
26 | import javax.validation.ConstraintViolation;
27 | import javax.validation.Validation;
28 | import javax.validation.Validator;
29 | import java.util.*;
30 | import java.util.concurrent.CopyOnWriteArrayList;
31 |
32 | /**
33 | * 统一对页面进行解析处理
34 | *
35 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
36 | * @version 1.0
37 | */
38 | @Slf4j
39 | public abstract class HunterProcessor implements PageProcessor {
40 | protected HunterConfig config;
41 | protected HunterPrintWriter writer = new HunterPrintWriter();
42 | protected String uuid;
43 | private Validator validator = Validation.buildDefaultValidatorFactory().getValidator();
44 |
45 | HunterProcessor() {
46 | }
47 |
48 | HunterProcessor(HunterConfig m) {
49 | this(m, UUID.randomUUID().toString());
50 | }
51 |
52 | HunterProcessor(HunterConfig m, String uuid) {
53 | this(m, null, uuid);
54 | }
55 |
56 | HunterProcessor(HunterConfig config, HunterPrintWriter writer, String uuid) {
57 | this.config = HunterConfigContext.parseConfig(config);
58 | this.uuid = uuid;
59 | if (null != writer) {
60 | this.writer = writer;
61 | }
62 | }
63 |
64 | HunterProcessor(String url, boolean convertImage) {
65 | this(HunterConfigContext.getHunterConfig(url).setConvertImg(convertImage));
66 | }
67 |
68 | HunterProcessor(String url, boolean convertImage, HunterPrintWriter writer) {
69 | this(HunterConfigContext.getHunterConfig(url).setConvertImg(convertImage));
70 | if (writer != null) {
71 | this.writer = writer;
72 | }
73 | }
74 |
75 | /**
76 | * 程序入口方法
77 | *
78 | * @return 返回VirtualArticle列表
79 | */
80 | public abstract CopyOnWriteArrayList execute();
81 |
82 | @Override
83 | public void process(Page page) {
84 | Resolver resolver = new HtmlResolver();
85 | if (config.getAjaxRequest()) {
86 | resolver = new JsonResolver();
87 | }
88 | resolver.process(page, config);
89 |
90 | }
91 |
92 | @Override
93 | public Site getSite() {
94 | Site site = Site.me()
95 | .setCharset(config.getCharset())
96 | .setDomain(config.getDomain())
97 | .setUserAgent(config.getUa())
98 | .setSleepTime(config.getSleepTime())
99 | .setRetryTimes(config.getRetryTimes())
100 | .setCycleRetryTimes(config.getCycleRetryTimes());
101 |
102 | //添加抓包获取的cookie信息
103 | List cookies = config.getCookies();
104 | if (CollectionUtils.isNotEmpty(cookies)) {
105 | for (Cookie cookie : cookies) {
106 | if (StringUtils.isEmpty(cookie.getDomain())) {
107 | site.addCookie(cookie.getName(), cookie.getValue());
108 | continue;
109 | }
110 | site.addCookie(cookie.getDomain(), cookie.getName(), cookie.getValue());
111 | }
112 | }
113 | //添加请求头,有些网站会根据请求头判断该请求是由浏览器发起还是由爬虫发起的
114 | Map headers = config.getHeaders();
115 | if (MapUtils.isNotEmpty(headers)) {
116 | Set> entrySet = headers.entrySet();
117 | for (Map.Entry entry : entrySet) {
118 | site.addHeader(entry.getKey(), entry.getValue());
119 | }
120 | }
121 | return site;
122 | }
123 |
124 | /**
125 | * 校验参数
126 | *
127 | * @param t 待校验的参数
128 | */
129 | final List validateModel(T t) {
130 | Set> constraintViolations = validator.validate(t);
131 |
132 | List messageList = new ArrayList<>();
133 | for (ConstraintViolation constraintViolation : constraintViolations) {
134 | messageList.add(constraintViolation.getMessage());
135 | }
136 | return messageList;
137 | }
138 |
139 | /**
140 | * 自定义管道的处理方法
141 | *
142 | * @param resultItems 自定义Processor处理完后的所有参数
143 | * @param virtualArticles 爬虫文章集合
144 | */
145 | final void process(ResultItems resultItems, List virtualArticles, Hunter spider) {
146 | if (null == spider) {
147 | return;
148 | }
149 | Map map = resultItems.getAll();
150 | if (CollectionUtil.isEmpty(map)) {
151 | return;
152 | }
153 | String title = String.valueOf(map.get("title"));
154 | ParserConfig jcParserConfig = new ParserConfig();
155 | jcParserConfig.putDeserializer(Date.class, HunterDateDeserializer.instance);
156 | VirtualArticle virtualArticle = JSON.parseObject(JSON.toJSONString(map), VirtualArticle.class, jcParserConfig, JSON.DEFAULT_PARSER_FEATURE);
157 | virtualArticle.setDescription(CommonUtil.getRealDescription(virtualArticle.getDescription(), virtualArticle.getContent()))
158 | .setKeywords(CommonUtil.getRealKeywords(virtualArticle.getKeywords()));
159 | if (this.config.isConvertImg()) {
160 | virtualArticle.setContent(CommonUtil.formatHtml(virtualArticle.getContent()));
161 | virtualArticle.setImageLinks(CommonUtil.getAllImageLink(virtualArticle.getContent()));
162 | }
163 | if (CollectionUtils.isEmpty(virtualArticle.getTags())) {
164 | virtualArticle.setTags(Collections.singletonList("其他"));
165 | }
166 | virtualArticles.add(virtualArticle);
167 | writer.print(String.format("%s -- %s -- %s", virtualArticle.getSource(), title, virtualArticle.getAuthor(), virtualArticle.getReleaseDate()));
168 | }
169 |
170 | public HunterConfig getConfig() {
171 | return config;
172 | }
173 | }
174 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/resolver/HtmlResolver.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.resolver;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 | import me.zhyd.hunter.config.HunterResolver;
5 | import me.zhyd.hunter.config.HunterResolverConfig;
6 | import org.apache.commons.lang3.StringUtils;
7 | import us.codecraft.webmagic.Page;
8 | import us.codecraft.webmagic.selector.Html;
9 | import us.codecraft.webmagic.selector.RegexSelector;
10 |
11 | import java.util.Arrays;
12 | import java.util.Map;
13 |
14 | /**
15 | * 解析处理普通的Html网页
16 | *
17 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
18 | * @version 1.0
19 | */
20 | public class HtmlResolver implements Resolver {
21 |
22 | @Override
23 | public void process(Page page, HunterConfig model) {
24 | Html pageHtml = page.getHtml();
25 | String title = StringUtils.trim(pageHtml.xpath(model.getTitleRegex()).get());
26 | String source = page.getRequest().getUrl();
27 | if (model.isSingle() || (!StringUtils.isEmpty(title) && (!"null".equals(title) && !model.getEntryUrls().contains(source)))) {
28 | page.putField("title", title);
29 | page.putField("source", source);
30 | this.put(page, pageHtml, "releaseDate", model.getReleaseDateRegex(), model);
31 | this.put(page, pageHtml, "author", model.getAuthorRegex(), model);
32 | this.put(page, pageHtml, "content", model.getContentRegex(), model);
33 | this.put(page, pageHtml, "tags", model.getTagRegex(), model);
34 | this.put(page, pageHtml, "description", model.getDescriptionRegex(), model);
35 | this.put(page, pageHtml, "keywords", model.getKeywordsRegex(), model);
36 | }
37 | if (!model.isSingle()) {
38 | if (StringUtils.isNotEmpty(model.getTargetLinksRegex())) {
39 | page.addTargetRequests(page.getHtml().links().regex(model.getTargetLinksRegex()).all());
40 | }
41 | }
42 | }
43 |
44 | private void put(Page page, Html pageHtml, String key, String regex, HunterConfig model) {
45 | if (StringUtils.isNotEmpty(regex)) {
46 | HunterResolverConfig resolverConfig = model.getResolver();
47 | Map resolverMap = resolverConfig.toMap();
48 | HunterResolver resolver = null;
49 | if (resolverMap.containsKey(key)) {
50 | resolver = resolverMap.get(key);
51 | }
52 | Object res = null;
53 | if (null != resolver && "regex".equals(resolver.getType())) {
54 | String text = new RegexSelector(regex).select(pageHtml.get());
55 | if (Arrays.asList("java.lang.Long", "java.lang.Integer", "java.lang.Float", "java.lang.Double").contains(resolver.getClazz())) {
56 | Map operatorMap = resolver.getOperatorMap();
57 | if (operatorMap == null || operatorMap.isEmpty()) {
58 | res = text;
59 | } else {
60 | String operator = String.valueOf(operatorMap.get("operator"));
61 | if (!StringUtils.isEmpty(operator)) {
62 | long num = Long.parseLong(String.valueOf(operatorMap.get("num")));
63 | switch (operator) {
64 | case "+":
65 | res = Long.parseLong(text) + num;
66 | break;
67 | case "-":
68 | res = Long.parseLong(text) - num;
69 | break;
70 | case "*":
71 | res = Long.parseLong(text) * num;
72 | break;
73 | case "/":
74 | res = Long.parseLong(text) / num;
75 | break;
76 | default:
77 | break;
78 | }
79 | }
80 | }
81 | }
82 | } else {
83 | if ("tags".equals(key)) {
84 | res = pageHtml.xpath(regex).all();
85 | } else {
86 | res = StringUtils.trim(pageHtml.xpath(regex).get());
87 | }
88 | }
89 | page.putField(key, res);
90 | }
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/resolver/JsonResolver.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.resolver;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 | import org.apache.commons.lang3.StringUtils;
5 | import us.codecraft.webmagic.Page;
6 | import us.codecraft.webmagic.selector.JsonPathSelector;
7 |
8 | /**
9 | * 解析处理Ajax渲染的页面(待完善)
10 | *
11 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
12 | * @version 1.0
13 | */
14 | public class JsonResolver implements Resolver {
15 |
16 | @Override
17 | public void process(Page page, HunterConfig model) {
18 | String rawText = page.getRawText();
19 | String title = new JsonPathSelector(model.getTitleRegex()).select(rawText);
20 | if (!StringUtils.isEmpty(title) && !"null".equals(title)) {
21 | page.putField("title", title);
22 | page.putField("releaseDate", new JsonPathSelector(model.getReleaseDateRegex()).select(rawText));
23 | page.putField("author", new JsonPathSelector(model.getAuthorRegex()).select(rawText));
24 | page.putField("content", new JsonPathSelector(model.getContentRegex()).select(rawText));
25 | page.putField("source", page.getRequest().getUrl());
26 | }
27 | page.addTargetRequests(page.getHtml().links().regex(model.getTargetLinksRegex()).all());
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/resolver/Resolver.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.resolver;
2 |
3 | import me.zhyd.hunter.config.HunterConfig;
4 | import us.codecraft.webmagic.Page;
5 |
6 | /**
7 | * 页面解析器
8 | *
9 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
10 | * @version 1.0
11 | */
12 | public interface Resolver {
13 | void process(Page page, HunterConfig model);
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/scheduler/BlockingQueueScheduler.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.scheduler;
2 |
3 | import me.zhyd.hunter.enums.ExitWayEnum;
4 | import me.zhyd.hunter.config.HunterConfig;
5 | import us.codecraft.webmagic.Request;
6 | import us.codecraft.webmagic.Task;
7 | import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
8 | import us.codecraft.webmagic.scheduler.MonitorableScheduler;
9 |
10 | import java.util.concurrent.BlockingQueue;
11 | import java.util.concurrent.LinkedBlockingQueue;
12 |
13 | /**
14 | * 自定义的调度器,主要用来处理url
15 | *
16 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
17 | * @version 1.0
18 | */
19 | public class BlockingQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
20 | private BlockingQueue queue = new LinkedBlockingQueue<>();
21 | private int realUrlCount = -1;
22 |
23 | public BlockingQueueScheduler(HunterConfig model) {
24 | if (ExitWayEnum.URL_COUNT.toString().equals(model.getExitWay())) {
25 | // 实际抓取的url数量包括入口页面
26 | this.realUrlCount = model.getCount() + model.getEntryUrls().size();
27 | }
28 | }
29 |
30 | @Override
31 | public void pushWhenNoDuplicate(Request request, Task task) {
32 | // 当程序退出方式非URL_COUNT时按照正常逻辑处理
33 | if (realUrlCount == -1) {
34 | this.queue.add(request);
35 | return;
36 | }
37 | // 在有效期内(realUrlCount > 0),每次push url时realUrlCount - 1, 当 realUrlCount <= 0 时,当前Scheduler将不再收录新的url
38 | if (realUrlCount <= 0) {
39 | return;
40 | }
41 | realUrlCount--;
42 | this.queue.add(request);
43 | }
44 |
45 | @Override
46 | public Request poll(Task task) {
47 | return (Request) this.queue.poll();
48 | }
49 |
50 | @Override
51 | public int getLeftRequestsCount(Task task) {
52 | return this.queue.size();
53 | }
54 |
55 | @Override
56 | public int getTotalRequestsCount(Task task) {
57 | return this.getDuplicateRemover().getTotalRequestsCount(task);
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/util/CommonUtil.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.util;
2 |
3 | import me.zhyd.hunter.entity.ImageLink;
4 | import org.apache.commons.lang3.StringUtils;
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.safety.Whitelist;
7 | import us.codecraft.webmagic.Page;
8 | import us.codecraft.webmagic.Request;
9 | import us.codecraft.webmagic.selector.Html;
10 | import us.codecraft.webmagic.selector.Selectable;
11 |
12 | import java.util.HashSet;
13 | import java.util.List;
14 | import java.util.Set;
15 | import java.util.regex.Pattern;
16 |
17 | /**
18 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
19 | * @version 1.0
20 | * @since 1.8
21 | */
22 | public class CommonUtil {
23 |
24 | private static final Pattern PATTERN = Pattern.compile("
]+src\\s*=\\s*['\"]([^'\"]+)['\"][^'\"]+data-original\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>|
]+data-original\\s*=\\s*['\"]([^'\"]+)['\"][^'\"]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>|
]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
25 |
26 | /**
27 | * 获取真实的网站介绍,最多只保留100个字符
28 | *
29 | * @param description 原博客的description
30 | * @param content 原博客的正文内容
31 | */
32 | public static String getRealDescription(String description, String content) {
33 | if (StringUtils.isNotEmpty(description)) {
34 | return description.replaceAll("\r\n| ", "");
35 | }
36 | if (StringUtils.isNotEmpty(content)) {
37 | content = Jsoup.clean(content.trim(), Whitelist.simpleText());
38 | return content.length() > 100 ? content.substring(0, 100) : content;
39 | }
40 | return null;
41 | }
42 |
43 | /**
44 | * 获取真实的无特殊标签的网站关键字
45 | *
46 | * @param keywords 原博客的keywords
47 | */
48 | public static String getRealKeywords(String keywords) {
49 | String keys = StringUtils.isNotEmpty(keywords) && !"null".equals(keywords) ? keywords.trim().replaceAll(" +|,", ",").replaceAll(",,", ",") : null;
50 | return StringUtils.isEmpty(keys) ? null : Jsoup.clean(keys, Whitelist.simpleText());
51 | }
52 |
53 | /**
54 | * 获取所有图片标签的src连接
55 | *
56 | * @param html 原博客内容
57 | */
58 | public static String formatHtml(String html) {
59 | if (StringUtils.isEmpty(html)) {
60 | return null;
61 | }
62 | String lazyloadFormat = "
";
63 |
64 | Html pageHtml = getHtml(html);
65 | List imgSelectables = pageHtml.$("img").nodes();
66 | for (Selectable imgSelectable : imgSelectables) {
67 | String oldImg = imgSelectable.get();
68 | String newImg = String.format(lazyloadFormat, getRealImgUrl(imgSelectable), imgSelectable.xpath("//img/@title").get(), imgSelectable.xpath("//img/@alt").get());
69 | html = html.replace(oldImg, newImg);
70 | }
71 | return html;
72 | }
73 |
74 | private static String getRealImgUrl(Selectable selectable) {
75 | String realImgUrl = selectable.xpath("//img/@data-original").get();
76 | if (StringUtils.isEmpty(realImgUrl)) {
77 | realImgUrl = selectable.xpath("//img/@data-src").get();
78 | if (StringUtils.isEmpty(realImgUrl)) {
79 | realImgUrl = selectable.xpath("//img/@src").get();
80 | }
81 | if (StringUtils.isEmpty(realImgUrl)){
82 | realImgUrl = selectable.xpath("//img/@data-original-src").get();
83 | }
84 | }
85 | if (StringUtils.isNotEmpty(realImgUrl)) {
86 | if (realImgUrl.contains("?")) {
87 | realImgUrl = realImgUrl.substring(0, realImgUrl.indexOf("?"));
88 | }
89 | }
90 | return realImgUrl;
91 | }
92 |
93 | /**
94 | * 获取所有图片标签的src连接
95 | *
96 | * @param html 原博客内容
97 | */
98 | public static Set getAllImageLink(String html) {
99 | if (StringUtils.isEmpty(html)) {
100 | return null;
101 | }
102 | Set imageLinks = new HashSet<>();
103 | ImageLink imageLink = null;
104 |
105 | Html pageHtml = getHtml(html);
106 | List imgSelectables = pageHtml.$("img").nodes();
107 | for (Selectable imgSelectable : imgSelectables) {
108 | String newImgSrc = getRealImgUrl(imgSelectable);
109 | imageLink = new ImageLink(newImgSrc);
110 | imageLinks.add(imageLink);
111 | }
112 | return imageLinks;
113 | }
114 |
115 | private static Html getHtml(String html) {
116 | Page page = new Page();
117 | page.setRequest(new Request(""));
118 | page.setRawText(html);
119 | return page.getHtml();
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/util/DateUtil.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.util;
2 |
3 | import java.util.Calendar;
4 | import java.util.Date;
5 |
6 | /**
7 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
8 | * @version 1.0
9 | * @since 1.8
10 | */
11 | public class DateUtil extends cn.hutool.core.date.DateUtil {
12 | private static final String PATTERN1 = "yyyy/MM/dd HH:mm:ss";
13 | private static final String PATTERN16 = "yyyy/MM/dd HH:mm";
14 | private static final String PATTERN15 = "MM/dd HH:mm";
15 | private static final String PATTERN2 = "yyyy/MM/dd";
16 | private static final String PATTERN8 = "dd/MM/yyyy HH:mm:ss";
17 | private static final String PATTERN7 = "dd/MM/yyyy";
18 |
19 | private static final String PATTERN3 = "yyyy-MM-dd HH:mm:ss";
20 | private static final String PATTERN4 = "yyyy-MM-dd";
21 | private static final String PATTERN9 = "dd-MM-yyyy HH:mm:ss";
22 | private static final String PATTERN10 = "dd-MM-yyyy";
23 |
24 | private static final String PATTERN11 = "yyyy年MM月dd日";
25 | private static final String PATTERN12 = "yyyy年MM月dd日 00:00:00";
26 | private static final String PATTERN17 = "yyyy年MM月dd日 HH:mm:ss";
27 |
28 | private static final String PATTERN5 = "yyyyMMddHHmmssSSS";
29 | private static final String PATTERN6 = "HH:mm";
30 |
31 | private static final String PATTERN13 = "yyyy-MM-dd HH:mm";
32 | private static final String PATTERN14 = "yyyy.MM.dd HH:mm";
33 |
34 | private static final String FULL_TIME_PATTERN0 = "yyyy-MM-dd HH:mm:ss.SSS";
35 |
36 | public static Date parse(Object originalDateObj) {
37 | if (null == originalDateObj) {
38 | return null;
39 | }
40 | if (originalDateObj instanceof Long || originalDateObj instanceof Integer) {
41 | return new Date(Long.parseLong(String.valueOf(originalDateObj)));
42 | }
43 | String originalDateStr = String.valueOf(originalDateObj);
44 | originalDateStr = originalDateStr.replace("T", " ").replace("Z", "");
45 | String pattern = null;
46 | boolean containsSemicolon = originalDateStr.contains(":");
47 | if (originalDateStr.length() > 20) {
48 | pattern = FULL_TIME_PATTERN0;
49 | } else if (originalDateStr.contains("/")) {
50 | if (originalDateStr.split("/")[0].length() == 2) {
51 | pattern = containsSemicolon ? (originalDateStr.split("/").length == 2 ? PATTERN15 : PATTERN8) : PATTERN7;
52 | } else {
53 | if (containsSemicolon) {
54 | if (originalDateStr.split(":").length == 2) {
55 | pattern = PATTERN16;
56 | } else {
57 | pattern = PATTERN1;
58 | }
59 | } else {
60 | pattern = PATTERN2;
61 | }
62 | }
63 | } else if (originalDateStr.contains("-")) {
64 | if (originalDateStr.split("-")[0].length() == 2) {
65 | pattern = containsSemicolon ? PATTERN9 : PATTERN10;
66 | } else {
67 | pattern = containsSemicolon ? (originalDateStr.split(":").length == 2 ? PATTERN13 : PATTERN3) : PATTERN4;
68 | }
69 | } else if (originalDateStr.contains("年") || originalDateStr.contains("月")) {
70 | pattern = containsSemicolon ? (originalDateStr.contains("00:00") ? PATTERN12 : PATTERN17) : PATTERN11;
71 | } else if (originalDateStr.contains(".")) {
72 | pattern = PATTERN14;
73 | } else {
74 | if (originalDateStr.length() <= 5) {
75 | pattern = PATTERN6;
76 | } else {
77 | pattern = PATTERN5;
78 | }
79 | }
80 | Date date = null;
81 | try {
82 | if (pattern.equals(PATTERN15)) {
83 | pattern = PATTERN16;
84 | Calendar calendar = Calendar.getInstance();
85 | originalDateStr = calendar.get(Calendar.YEAR) + originalDateStr;
86 | }
87 | date = parse(originalDateStr, pattern);
88 | } catch (Exception ignored) {
89 |
90 | }
91 | return date;
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/util/HunterPrintWriter.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.util;
2 |
3 | import lombok.extern.slf4j.Slf4j;
4 | import me.zhyd.hunter.consts.HunterConsts;
5 |
6 | import java.io.PrintWriter;
7 |
8 | /**
9 | * 系统输出工具类,当传入PrintWriter时可以将字符流输出到页面, 默认为log日志输出
10 | *
11 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
12 | * @version 1.0
13 | * @since 1.0
14 | */
15 | @Slf4j
16 | public class HunterPrintWriter {
17 |
18 | private String jsoupCallback = "";
19 | private PrintWriter writer;
20 |
21 | public HunterPrintWriter() {
22 | }
23 |
24 | /**
25 | * @param writer 输出流
26 | * @param jsoupCallback 用于页面打印日志的jsoup回调函数,默认为使用iframe方式打开,回调函数为‘parent.printMessage’。具体使用方法,可参考帮助文档
27 | */
28 | public HunterPrintWriter(PrintWriter writer, String jsoupCallback) {
29 | this.writer = writer;
30 | if (null != jsoupCallback) {
31 | this.jsoupCallback = jsoupCallback;
32 | }
33 | }
34 |
35 | /**
36 | * @param writer 输出流
37 | */
38 | public HunterPrintWriter(PrintWriter writer) {
39 | this(writer, null);
40 | }
41 |
42 | public HunterPrintWriter print(String... msgs) {
43 | for (String msg : msgs) {
44 | if (!msg.equals("shutdown")) {
45 | msg = HunterConsts.LOG_PREFIX + msg;
46 | }
47 |
48 | log.info(msg);
49 | if (null != writer) {
50 | writer.print(String.format(this.jsoupCallback, msg));
51 | writer.flush();
52 | }
53 | }
54 |
55 | return this;
56 | }
57 |
58 | public void shutdown() {
59 | print("bye~~", "shutdown");
60 | if (null != writer) {
61 | writer.close();
62 | writer = null;
63 | }
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/java/me/zhyd/hunter/util/PlatformUtil.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.util;
2 |
3 | import me.zhyd.hunter.config.platform.InnerPlatform;
4 | import me.zhyd.hunter.config.platform.Platform;
5 | import me.zhyd.hunter.exception.HunterException;
6 |
7 | import java.util.regex.Matcher;
8 | import java.util.regex.Pattern;
9 |
10 | /**
11 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
12 | * @version 1.0
13 | * @since 1.8
14 | */
15 | public class PlatformUtil {
16 |
17 | public static String getHost(String url) {
18 | String res = getDomain(url);
19 | if (null == res) {
20 | return null;
21 | }
22 | return res.replace("https://", "").replace("http://", "");
23 | }
24 |
25 | public static InnerPlatform getPlarform(String url) {
26 | Platform platform = Platform.getPlatformByUrl(url);
27 | if (null == platform) {
28 | throw new HunterException("暂时不支持该平台:" + url);
29 | }
30 | return getPlarform(platform);
31 | }
32 |
33 | public static InnerPlatform getPlarform(Platform platform) {
34 | if (null == platform) {
35 | throw new HunterException("无效的博客平台");
36 | }
37 | Class clazz = platform.getClazz();
38 | try {
39 | return (InnerPlatform) clazz.newInstance();
40 | } catch (InstantiationException | IllegalAccessException e) {
41 | throw new HunterException(String.format("无法获取InnerPlatform实例,url: %s", platform.getHost()), e);
42 | }
43 | }
44 |
45 | public static String getDomain(String url) {
46 | String regex = "(http|https)://(www.)?([\\w-_]+(\\.)?)+";
47 | Pattern pattern = Pattern.compile(regex);
48 | Matcher matcher = pattern.matcher(url);
49 | return matcher.find() ? matcher.group() : null;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/resources/HunterConfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "imooc": {
3 | "domain": "www.imooc.com",
4 | "titleRegex": "//span[@class=js-title]/html()",
5 | "authorRegex": "//div[@class=name_con]/p[@class=name]/a[@class=nick]/html()",
6 | "releaseDateRegex": "//div[@class='dc-profile']/div[@class='l']/span[@class='spacer']/text()",
7 | "contentRegex": "//div[@class=detail-content]/html()",
8 | "tagRegex": "//div[@class=cat-box]/div[@class=cat-wrap]/a[@class=cat]/html()",
9 | "descriptionRegex": "//meta[@name=Description]/@content",
10 | "targetLinksRegex": "/article/[0-9]{1,10}",
11 | "header": [
12 | "Host=www.imooc.com",
13 | "Referer=https://www.imooc.com"
14 | ],
15 | "entryUrls": [
16 | "https://www.imooc.com/u/{uid}/articles?page=1"
17 | ]
18 | },
19 | "csdn": {
20 | "domain": "blog.csdn.net",
21 | "titleRegex": "//h1[@class=title-article]/html()",
22 | "authorRegex": "//a[@class=follow-nickName]/html()",
23 | "releaseDateRegex": "//div[@class=article-bar-top]/div[@class='bar-content']/span[@class=time]/html()",
24 | "contentRegex": "//div[@id=content_views]/html()",
25 | "tagRegex": "//span[@class=artic-tag-box]/a[@class=tag-link]/html()",
26 | "targetLinksRegex": "(((http|ftp|https):\\/\\/[0-9a-zA-Z]{1,15}.blog.csdn.net/article/details/[0-9a-zA-Z]{1,15})|((http|ftp|https):\\/\\/blog.csdn.net/{uid}/article/details/[0-9a-zA-Z]{1,15}))",
27 | "header": [
28 | "Host=blog.csdn.net",
29 | "Referer=https://blog.csdn.net/{uid}/article/list/1"
30 | ],
31 | "entryUrls": [
32 | "https://blog.csdn.net/{uid}/article/list/1"
33 | ]
34 | },
35 | "iteye": {
36 | "domain": "{uid}.iteye.com",
37 | "titleRegex": "//div[@class=blog_title]/h3/text()",
38 | "authorRegex": "//div[@id=blog_owner_name]/html()",
39 | "releaseDateRegex": "//div[@class=blog_bottom]/ul/li[1]/html()",
40 | "contentRegex": "//div[@class=iteye-blog-content-contain]/html()",
41 | "tagRegex": "//div[@class=news_tag]/a/html()",
42 | "targetLinksRegex": ".*{uid}\\.iteye\\.com/blog/[0-9]+",
43 | "header": [
44 | "Host={uid}.iteye.com",
45 | "Referer=http://{uid}.iteye.com/"
46 | ],
47 | "entryUrls": [
48 | "http://{uid}.iteye.com/?page=1"
49 | ]
50 | },
51 | "cnblogs": {
52 | "domain": "www.cnblogs.com",
53 | "titleRegex": "//a[@id=cb_post_title_url]/html()",
54 | "authorRegex": "//div[@class=postDesc]/a[1]/html()",
55 | "releaseDateRegex": "//span[@id=post-date]/html()",
56 | "contentRegex": "//div[@id=cnblogs_post_body]/html()",
57 | "tagRegex": "//div[@id=EntryTag]/a/html()",
58 | "descriptionRegex": "//meta[@property=\"og:description\"]/@content",
59 | "targetLinksRegex": ".*www\\.cnblogs\\.com/{uid}/p/[\\w\\d]+\\.html",
60 | "header": [
61 | "Host=www.cnblogs.com",
62 | "Referer=https://www.cnblogs.com/"
63 | ],
64 | "entryUrls": [
65 | "https://www.cnblogs.com/{uid}/default.html?page=1"
66 | ]
67 | },
68 | "juejin": {
69 | "domain": "juejin.im",
70 | "titleRegex": "//h1[@class=article-title]/html()",
71 | "authorRegex": "//div[@itemprop=author]/meta[@itemprop=\"name\"]/@content",
72 | "releaseDateRegex": "//meta[@itemprop=\"datePublished\"]/@content",
73 | "contentRegex": "//div[@class=article-content]/html()",
74 | "tagRegex": "//div[@class=tag-title]/html()",
75 | "targetLinksRegex": ".*juejin\\.im/post/[\\w\\d]+",
76 | "header": [
77 | "Host=juejin.im",
78 | "Referer=https://juejin.im"
79 | ],
80 | "entryUrls": [
81 | "https://juejin.im/user/{uid}/posts"
82 | ]
83 | },
84 | "v2ex": {
85 | "domain": "v2ex.com",
86 | "titleRegex": "//*[@id=Main]/div[@class=box]/div[@class=header]/h1/html()",
87 | "authorRegex": "//*[@id=Main]/div[@class=box]/div[@class=header]/small/a/html()",
88 | "releaseDateRegex": "//meta[@property=\"article:published_time\"]/@content",
89 | "contentRegex": "//div[@class=markdown_body]/html()",
90 | "tagRegex": "//*[@id=\"Main\"]/div[6]/div/a/html()",
91 | "descriptionRegex": "//meta[@property=\"og:description\"]/@content",
92 | "targetLinksRegex": ".*www\\.v2ex\\.com/t/[\\w\\d]+",
93 | "header": [
94 | "Host=www.v2ex.com",
95 | "Referer=https://www.v2ex.com"
96 | ],
97 | "entryUrls": [
98 | "https://www.v2ex.com/member/{uid}"
99 | ]
100 | },
101 | "oschina": {
102 | "domain": "oschina.net",
103 | "titleRegex": "//h1[@class=article-box__title]/a/text()",
104 | "authorRegex": "//div[@class=article-box__meta]/div[@class=item-list]/div[2]/a/html()",
105 | "releaseDateRegex": "//div[@class=article-box__meta]/div[@class=item-list]/div[4]/html()",
106 | "contentRegex": "//div[@class=content]/html()",
107 | "tagRegex": "//div[@class=tags-box]/div[@class=tags-box__inner]/a/html()",
108 | "targetLinksRegex": "https://my.oschina.net/.*/blog/[0-9]{1,10}",
109 | "header": [
110 | "Host=my.oschina.net",
111 | "Referer=https://my.oschina.net"
112 | ],
113 | "entryUrls": [
114 | "https://my.oschina.net/{uid}",
115 | "https://my.oschina.net/u/{uid}"
116 | ]
117 | },
118 | "jianshu": {
119 | "resolver": {
120 | "releaseDate": {
121 | "type": "regex",
122 | "clazz": "java.lang.Long",
123 | "operator": "* 1000"
124 | }
125 | },
126 | "domain": "jianshu.com",
127 | "titleRegex": "//h1[@class=_1RuRku]/text()",
128 | "authorRegex": "//span[@class=_22gUMi]/html()",
129 | "releaseDateRegex": ".*\"first_shared_at\":([0-9]+),.*",
130 | "contentRegex": "//article[@class=_2rhmJa]/html()",
131 | "tagRegex": "//div",
132 | "targetLinksRegex": "/p/[0-9a-zA-Z]{1,15}",
133 | "header": [
134 | "Host=www.jianshu.com",
135 | "Referer=https://www.jianshu.com/p/{uid}"
136 | ],
137 | "entryUrls": [
138 | "https://www.jianshu.com/p/{uid}",
139 | "https://www.jianshu.com/u/{uid}"
140 | ]
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | ### set log levels ###
2 | log4j.rootLogger=WARN,Console
3 | log4j.logger.me.zhyd.hunter.util=INFO
4 | ### \u8F93\u51FA\u5230\u63A7\u5236\u53F0 ###
5 | log4j.appender.Console=org.apache.log4j.ConsoleAppender
6 | log4j.appender.Console.Target=System.out
7 | log4j.appender.Console.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.Console.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/src/test/java/me/zhyd/hunter/test/CommonUtilTest.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.test;
2 |
3 | import me.zhyd.hunter.util.CommonUtil;
4 | import org.junit.Test;
5 |
6 | import java.util.Arrays;
7 | import java.util.List;
8 |
9 | /**
10 | * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
11 | * @version 1.0
12 | * @website https://www.zhyd.me
13 | * @date 2019/3/11 9:56
14 | * @since 1.8
15 | */
16 | public class CommonUtilTest {
17 |
18 | @Test
19 | public void formatHtmlTest() {
20 | List htmls = Arrays.asList(
21 | "
",
22 | "
",
23 | "
"
24 | );
25 |
26 | for (String html : htmls) {
27 | System.out.println(html = CommonUtil.formatHtml(html));
28 | System.out.println(CommonUtil.getAllImageLink(html));
29 | System.out.println();
30 | }
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/test/java/me/zhyd/hunter/test/QuickStartTest.java:
--------------------------------------------------------------------------------
1 | package me.zhyd.hunter.test;
2 |
3 | import com.alibaba.fastjson.JSONArray;
4 | import lombok.extern.slf4j.Slf4j;
5 | import me.zhyd.hunter.config.HunterConfig;
6 | import me.zhyd.hunter.config.HunterConfigContext;
7 | import me.zhyd.hunter.config.platform.Platform;
8 | import me.zhyd.hunter.consts.HunterConsts;
9 | import me.zhyd.hunter.entity.VirtualArticle;
10 | import me.zhyd.hunter.enums.ExitWayEnum;
11 | import me.zhyd.hunter.processor.BlogHunterProcessor;
12 | import me.zhyd.hunter.processor.HunterProcessor;
13 | import me.zhyd.hunter.util.PlatformUtil;
14 | import org.apache.commons.collections.CollectionUtils;
15 | import org.apache.commons.lang3.StringUtils;
16 | import org.junit.Test;
17 |
18 | import java.util.Date;
19 | import java.util.concurrent.CopyOnWriteArrayList;
20 |
21 | /**
22 | * 快速开始-测试工具使用方法
23 | */
24 | @Slf4j
25 | public class QuickStartTest {
26 |
27 | /**
28 | * 抓取单个文章
29 | *
30 | * @param url 文件地址
31 | * @param convertImage 是否转存图片,当选择true时会在结果中返回该文中的所有图片链接
32 | */
33 | private void single(String url, boolean convertImage) {
34 | log.info(HunterConsts.LOG_PREFIX + url + " | " + PlatformUtil.getDomain(url) + " | " + PlatformUtil.getHost(url));
35 | HunterProcessor hunter = new BlogHunterProcessor(url, convertImage);
36 | CopyOnWriteArrayList list = hunter.execute();
37 | if (null == list || list.isEmpty()) {
38 | log.info("没获取到数据: {}", url);
39 | } else {
40 | this.check(list);
41 | }
42 | }
43 |
44 | private void check(CopyOnWriteArrayList list) {
45 | for (VirtualArticle virtualArticle : list) {
46 | log.info(HunterConsts.LOG_PREFIX + JSONArray.toJSONString(virtualArticle.getImageLinks()));
47 | if (StringUtils.isEmpty(virtualArticle.getContent())) {
48 | log.error(HunterConsts.LOG_PREFIX + "内容为空");
49 | }
50 | if (StringUtils.isEmpty(virtualArticle.getAuthor())) {
51 | log.error(HunterConsts.LOG_PREFIX + "作者为空");
52 | }
53 | if (StringUtils.isEmpty(virtualArticle.getSource())) {
54 | log.error(HunterConsts.LOG_PREFIX + "源站为空");
55 | }
56 | if (StringUtils.isEmpty(virtualArticle.getDescription())) {
57 | log.error(HunterConsts.LOG_PREFIX + "Description为空");
58 | }
59 | if (StringUtils.isEmpty(virtualArticle.getKeywords())) {
60 | log.error(HunterConsts.LOG_PREFIX + "Keywords内容为空");
61 | }
62 | if (StringUtils.isEmpty(virtualArticle.getTitle())) {
63 | log.error(HunterConsts.LOG_PREFIX + "标题为空");
64 | }
65 | if (null == virtualArticle.getReleaseDate()) {
66 | log.error(HunterConsts.LOG_PREFIX + "发布日期为空");
67 | }
68 | if (CollectionUtils.isEmpty(virtualArticle.getTags())) {
69 | log.error(HunterConsts.LOG_PREFIX + "标签为空");
70 | }
71 | }
72 | }
73 |
74 | /**
75 | * 测试抓取单篇文章
76 | */
77 | @Test
78 | public void singleTest() {
79 | this.single("https://www.imooc.com/article/259921", true);
80 | this.single("https://blog.csdn.net/u011197448/article/details/83901306", true);
81 | this.single("https://www.iteye.com/blog/843977358-2317810", true);
82 | this.single("https://www.cnblogs.com/zhangyadong/p/oneblog.html", true);
83 | this.single("https://juejin.im/post/5c75d34851882564965edb23", true);
84 | this.single("https://www.v2ex.com/t/519648", true);
85 | this.single("https://my.oschina.net/u/4007037/blog/3075219", true);
86 | this.single("https://www.jianshu.com/p/f33b0b5fa80c", true);
87 |
88 | }
89 |
90 | /**
91 | * 测试抓取imooc的文章列表。按照抓取的文章条数控制程序停止,并且手动指定待抓取的连接条数
92 | */
93 | @Test
94 | public void imoocTest() {
95 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.IMOOC);
96 | // 设置用户的id
97 | config.setUid("1175248")
98 | // 设置程序退出的方式
99 | .setExitWay(ExitWayEnum.URL_COUNT)
100 | // 根据ExitWay设置,当ExitWay = URL_COUNT时, count表示待抓取的链接个数;当ExitWay = DURATION时, count表示爬虫运行的时间,理想状态时1s抓取一条,受实际网速影响;当ExitWay = default时,程序不做限制,抓取所有匹配到的文章,“慎用”
101 | // 如果不手动设置该值, 则取ExitWayEnum中默认的数量,URL_COUNT(10),DURATION(60)
102 | .setCount(2);
103 | HunterProcessor hunter = new BlogHunterProcessor(config);
104 | CopyOnWriteArrayList list = hunter.execute();
105 | if (null == list || list.isEmpty()) {
106 | System.out.println("没获取到数据");
107 | } else {
108 | this.check(list);
109 | }
110 | }
111 |
112 | /**
113 | * 测试抓取csdn的文章列表。按照程序运行的时间(s)控制程序停止,并且手动指定程序运行的时间
114 | */
115 | @Test
116 | public void csdnTest() {
117 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.CSDN);
118 | // 设置用户的id
119 | config.setUid("u011197448")
120 | // 设置程序退出的方式
121 | .setExitWay(ExitWayEnum.URL_COUNT)
122 | // 根据ExitWay设置,当ExitWay = URL_COUNT时, count表示待抓取的链接个数;当ExitWay = DURATION时, count表示爬虫运行的时间,理想状态时1s抓取一条,受实际网速影响;当ExitWay = default时,程序不做限制,抓取所有匹配到的文章,“慎用”
123 | // 如果不手动设置该值, 则取ExitWayEnum中默认的数量,URL_COUNT(10),DURATION(60)
124 | .setCount(10);
125 | HunterProcessor hunter = new BlogHunterProcessor(config);
126 | System.out.println("程序开始执行:" + new Date());
127 | CopyOnWriteArrayList list = hunter.execute();
128 | System.out.println("程序执行完毕:" + new Date());
129 | if (null == list || list.isEmpty()) {
130 | System.out.println("没获取到数据");
131 | } else {
132 | this.check(list);
133 | }
134 | }
135 |
136 | /**
137 | * 测试抓取iteye的文章列表。按照抓取的文章条数控制程序停止,并使用默认的条数(10条)
138 | */
139 | @Test
140 | public void iteyeTest() {
141 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.ITEYE);
142 | // 设置用户的id
143 | config.setUid("843977358")
144 | // 设置程序退出的方式
145 | .setExitWay(ExitWayEnum.URL_COUNT);
146 | HunterProcessor hunter = new BlogHunterProcessor(config);
147 | CopyOnWriteArrayList list = hunter.execute();
148 | if (null == list || list.isEmpty()) {
149 | System.out.println("没获取到数据");
150 | } else {
151 | this.check(list);
152 | }
153 | }
154 |
155 | /**
156 | * 测试抓取cnblogs的文章列表。按照程序运行的时间(s)控制程序停止,并使用默认的时间(60s)
157 | */
158 | @Test
159 | public void cnblogsTest() {
160 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.CNBLOGS);
161 | // 设置用户的id
162 | config.setUid("zhangyadong")
163 | // 设置程序退出的方式
164 | .setExitWay(ExitWayEnum.DURATION);
165 | HunterProcessor hunter = new BlogHunterProcessor(config);
166 | CopyOnWriteArrayList list = hunter.execute();
167 | if (null == list || list.isEmpty()) {
168 | System.out.println("没获取到数据");
169 | } else {
170 | this.check(list);
171 | }
172 | }
173 |
174 | /**
175 | * 测试抓取掘金的文章列表
176 | */
177 | @Test
178 | public void juejinTest() {
179 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.JUEJIN);
180 | // 设置用户的id
181 | config.setUid("5b90662de51d450e8b1370f6")
182 | // 设置程序退出的方式
183 | .setExitWay(ExitWayEnum.URL_COUNT)
184 | .setCount(5);
185 | HunterProcessor hunter = new BlogHunterProcessor(config);
186 | CopyOnWriteArrayList list = hunter.execute();
187 | if (null == list || list.isEmpty()) {
188 | System.out.println("没获取到数据");
189 | } else {
190 | this.check(list);
191 | }
192 | }
193 |
194 | /**
195 | * 测试抓取v2ex的文章列表
196 | */
197 | @Test
198 | public void v2exTest() {
199 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.V2EX);
200 | // 设置用户的id
201 | config.setUid("AlibabaSS")
202 | // 设置程序退出的方式
203 | .setExitWay(ExitWayEnum.DURATION)
204 | // 设定抓取120秒, 如果所有文章都被抓取过了,则会提前停止
205 | .setCount(120);
206 | HunterProcessor hunter = new BlogHunterProcessor(config);
207 | CopyOnWriteArrayList list = hunter.execute();
208 | if (null == list || list.isEmpty()) {
209 | System.out.println("没获取到数据");
210 | } else {
211 | this.check(list);
212 | }
213 | }
214 |
215 | /**
216 | * 测试抓取v2ex的文章列表,自定义抓取规则
217 | */
218 | @Test
219 | public void v2exTest2() {
220 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.V2EX);
221 | config.setEntryUrls("https://www.v2ex.com/member/Evernote")
222 | .addEntryUrl("https://www.v2ex.com/member/ityouknow")
223 | // 设置程序退出的方式
224 | .setExitWay(ExitWayEnum.DURATION)
225 | // 设定抓取120秒, 如果所有文章都被抓取过了,则会提前停止
226 | .setCount(120);
227 | HunterProcessor hunter = new BlogHunterProcessor(config);
228 | CopyOnWriteArrayList list = hunter.execute();
229 | if (null == list || list.isEmpty()) {
230 | System.out.println("没获取到数据");
231 | } else {
232 | this.check(list);
233 | }
234 | }
235 | /**
236 | * 测试抓取oschina的文章列表
237 | */
238 | @Test
239 | public void oschinaTest() {
240 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.OSCHINA);
241 | config.setUid("haitaohu")
242 | // 设置程序退出的方式
243 | .setExitWay(ExitWayEnum.URL_COUNT)
244 | .setCount(5);
245 | HunterProcessor hunter = new BlogHunterProcessor(config);
246 | CopyOnWriteArrayList list = hunter.execute();
247 | if (null == list || list.isEmpty()) {
248 | System.out.println("没获取到数据");
249 | } else {
250 | this.check(list);
251 | }
252 | }
253 | /**
254 | * 测试抓取oschina的文章列表,自定义抓取规则
255 | */
256 | @Test
257 | public void oschinaTest2() {
258 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.V2EX);
259 | config.setEntryUrls("https://my.oschina.net/haitaohu")
260 | // 设置程序退出的方式
261 | .setExitWay(ExitWayEnum.DURATION)
262 | // 设定抓取120秒, 如果所有文章都被抓取过了,则会提前停止
263 | .setCount(120);
264 | HunterProcessor hunter = new BlogHunterProcessor(config);
265 | CopyOnWriteArrayList list = hunter.execute();
266 | if (null == list || list.isEmpty()) {
267 | System.out.println("没获取到数据");
268 | } else {
269 | this.check(list);
270 | }
271 | }
272 |
273 | /**
274 | * 测试抓取jianshu的文章列表。按照程序运行的时间(s)控制程序停止,并且手动指定程序运行的时间
275 | */
276 | @Test
277 | public void jianshuTest() {
278 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.JIANSHU);
279 | // 设置用户的id
280 | config.setUid("c790f6e8eba5")
281 | // 设置程序退出的方式
282 | .setExitWay(ExitWayEnum.DURATION)
283 | // 根据ExitWay设置,当ExitWay = URL_COUNT时, count表示待抓取的链接个数;当ExitWay = DURATION时, count表示爬虫运行的时间,理想状态时1s抓取一条,受实际网速影响;当ExitWay = default时,程序不做限制,抓取所有匹配到的文章,“慎用”
284 | // 如果不手动设置该值, 则取ExitWayEnum中默认的数量,URL_COUNT(10),DURATION(60)
285 | .setCount(10);
286 | HunterProcessor hunter = new BlogHunterProcessor(config);
287 | System.out.println("程序开始执行:" + new Date());
288 | CopyOnWriteArrayList list = hunter.execute();
289 | System.out.println("程序执行完毕:" + new Date());
290 | if (null == list || list.isEmpty()) {
291 | System.out.println("没获取到数据");
292 | } else {
293 | this.check(list);
294 | }
295 | }
296 |
297 | /**
298 | * 高级使用
299 | */
300 | @Test
301 | public void other() {
302 | HunterConfig config = HunterConfigContext.getHunterConfig(Platform.IMOOC);
303 | // set会重置,add会追加
304 | config.setEntryUrls("https://www.imooc.com/u/1175248/articles")
305 | .addEntryUrl("https://www.imooc.com/u/4321686/articles")
306 | // 设置程序退出的方式
307 | .setExitWay(ExitWayEnum.URL_COUNT)
308 | // 设定抓取120秒, 如果所有文章都被抓取过了,则会提前停止
309 | .setCount(20)
310 | // 每次抓取间隔的时间
311 | .setSleepTime(100)
312 | // 失败重试次数
313 | .setRetryTimes(3)
314 | // 针对抓取失败的链接 循环重试次数
315 | .setCycleRetryTimes(3)
316 | // 开启的线程数
317 | .setThreadCount(5)
318 | // 开启图片转存
319 | .setConvertImg(true);
320 | HunterProcessor hunter = new BlogHunterProcessor(config);
321 | CopyOnWriteArrayList list = hunter.execute();
322 | if (null == list || list.isEmpty()) {
323 | System.out.println("没获取到数据");
324 | } else {
325 | this.check(list);
326 | }
327 | }
328 | }
329 |
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | ### set log levels ###
2 | log4j.rootLogger=WARN,Console
3 | log4j.logger.me.zhyd.hunter.test=INFO
4 | log4j.logger.me.zhyd.hunter.util=INFO
5 | ### \u8F93\u51FA\u5230\u63A7\u5236\u53F0 ###
6 | log4j.appender.Console=org.apache.log4j.ConsoleAppender
7 | log4j.appender.Console.Target=System.out
8 | log4j.appender.Console.layout=org.apache.log4j.PatternLayout
9 | log4j.appender.Console.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
10 |
--------------------------------------------------------------------------------