├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── cetty-core
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── jibug
│ │ │ └── cetty
│ │ │ └── core
│ │ │ ├── AnnotationBootstrap.java
│ │ │ ├── Bootstrap.java
│ │ │ ├── Cetty.java
│ │ │ ├── Page.java
│ │ │ ├── Payload.java
│ │ │ ├── Result.java
│ │ │ ├── Seed.java
│ │ │ ├── annotation
│ │ │ ├── Query.java
│ │ │ └── TargetUrl.java
│ │ │ ├── concurrent
│ │ │ ├── CettyAbortPolicy.java
│ │ │ ├── CettyThreadPool.java
│ │ │ ├── CountableThreadPool.java
│ │ │ ├── NamedThreadFactory.java
│ │ │ └── ThreadPerTaskExecutor.java
│ │ │ ├── constants
│ │ │ └── HttpConstants.java
│ │ │ ├── context
│ │ │ └── CettyContext.java
│ │ │ ├── handler
│ │ │ ├── AbstractHandlerContext.java
│ │ │ ├── ConsoleReduceHandler.java
│ │ │ ├── DefaultHandlerContext.java
│ │ │ ├── Handler.java
│ │ │ ├── HandlerContext.java
│ │ │ ├── HandlerInitializer.java
│ │ │ ├── HandlerPipeline.java
│ │ │ ├── HttpDownloadHandler.java
│ │ │ ├── PageProcessHandler.java
│ │ │ ├── ProcessHandler.java
│ │ │ ├── ProcessHandlerAdapter.java
│ │ │ ├── ReduceHandler.java
│ │ │ └── ReduceHandlerAdapter.java
│ │ │ ├── model
│ │ │ ├── AnnotationConfig.java
│ │ │ └── RequestBody.java
│ │ │ ├── net
│ │ │ ├── AbstractHttpClientGenerator.java
│ │ │ ├── AsyncHttpClientGenerator.java
│ │ │ ├── HttpClientFactory.java
│ │ │ ├── HttpClientGenerator.java
│ │ │ ├── Proxy.java
│ │ │ ├── ProxyStrategy.java
│ │ │ └── SyncHttpClientGenerator.java
│ │ │ ├── scheduler
│ │ │ ├── DuplicateStrategy.java
│ │ │ ├── QueueScheduler.java
│ │ │ └── Scheduler.java
│ │ │ └── utils
│ │ │ └── UrlUtils.java
│ └── resources
│ │ └── logback.xml
│ └── test
│ └── java
│ └── com
│ └── jibug
│ └── cetty
│ ├── core
│ ├── bootstrap
│ │ └── BootstrapTest.java
│ ├── handler
│ │ └── HandlerTest.java
│ └── net
│ │ ├── AsyncHttpClientTest.java
│ │ └── SyncHttpClientTest.java
│ └── sample
│ ├── kuaidaili
│ └── Kuaidaili.java
│ └── tianya
│ └── Tianya.java
├── cetty-samples
├── .gitignore
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── com
│ │ └── jibug
│ │ └── cetty
│ │ └── sample
│ │ ├── SampleApplication.java
│ │ ├── entity
│ │ └── Article.java
│ │ ├── handler
│ │ ├── BasePageHandler.java
│ │ ├── CifnewsPageHandler.java
│ │ ├── GuxiaobeiPageHandler.java
│ │ └── Waimaob2cPageHandler.java
│ │ ├── reduce
│ │ └── ArticleReducer.java
│ │ ├── runner
│ │ └── CrawlerRunner.java
│ │ └── service
│ │ ├── CrawlerService.java
│ │ └── TaskService.java
│ └── resources
│ ├── application.properties
│ └── crawler.json
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | #package files
4 |
5 | *.war
6 | *.ear
7 |
8 | #kdiff3 ignore
9 | target/
10 |
11 | #eclipse ignore
12 | .settings/
13 | .project
14 | .classpath
15 |
16 | #idea
17 | .idea/
18 | /idea/
19 | *.ipr
20 | *.iml
21 | *.iws
22 |
23 | # temp file
24 |
25 | *.log
26 | *.cache
27 | *.diff
28 | *.patch
29 | *.tmp
30 |
31 | #system ignore
32 | .DS_Store
33 | Thumbs.db
34 | /.idea/
35 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 |
3 | jdk:
4 | - openjdk8
5 |
6 | notifications:
7 |   email: false
8 |
9 | sudo: false
10 |
11 |
12 | cache:
13 |   directories:
14 |     - $HOME/.m2
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2018 heyingcai
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cetty
2 |
3 | 一个轻量级的基于事件分发的爬虫框架。
4 |
5 | [Build Status](https://travis-ci.org/heyingcai/cetty)
6 | [License](https://github.com/dempeZheng/forest/blob/master/LICENSE)
7 | [Language: Java](https://img.shields.io/badge/language-java-yellowgreen.svg)
8 |
9 |
10 | >An event dispatch crawler framework.
11 |
12 | 
13 |
14 | ## 功能介绍
15 | * 基于完全自定义事件处理机制的爬虫框架。
16 | * 模块化的设计,提供强大的可扩展性。
17 | * 基于HttpClient支持同步和异步数据抓取。
18 | * 支持多线程。
19 | * 基于Jsoup页面解析框架提供强大的网页解析处理能力。
20 |
21 | ## 快速开始
22 | ### 使用Maven
23 | ```xml
24 | <dependency>
25 |     <groupId>com.jibug.cetty</groupId>
26 |     <artifactId>cetty-core</artifactId>
27 |     <version>0.1.8</version>
28 | </dependency>
29 | ```
30 |
31 | ## 帮助
32 | 1.详细文档:[http://cetty.jibug.com/](http://cetty.jibug.com/)
33 | 2.QQ群
34 | 
35 | 3.bug反馈:[issues](https://github.com/heyingcai/cetty/issues)
36 |
37 | ## 让我们来写第一个demo
38 |
39 | ```java
40 | /**
41 | * 抓取天涯论坛文章列表标题
42 | * http://bbs.tianya.cn/list-333-1.shtml
43 | *
44 | * @author heyingcai
45 | */
46 | public class Tianya extends ProcessHandlerAdapter {
47 |
48 | @Override
49 | public void process(HandlerContext ctx, Page page) {
50 | //获取 Document
51 | Document document = page.getDocument();
52 | //dom解析
53 | Elements itemElements = document.
54 | select("div#bbsdoc>div#bd>div#main>div.mt5>table>tbody").
55 | get(2).
56 | select("tr");
57 | List<String> titles = Lists.newArrayList();
58 | for (Element item : itemElements) {
59 | String title = item.select("td.td-title").text();
60 | titles.add(title);
61 | }
62 |
63 | //获取Result对象,将我们解析出来的结果向下一个handler传递
64 | Result result = page.getResult();
65 | result.addResults(titles);
66 |
67 | //通过fireXXX 方法将本handler 处理的结果向下传递
68 | //本教程直接将结果传递给ConsoleHandler,将结果直接输出控制台
69 | ctx.fireReduce(page);
70 | }
71 |
72 | public static void main(String[] args) {
73 | //启动引导类
74 | Bootstrap.
75 | me()
76 | //使用同步抓取
77 | .isAsync(false)
78 | //开启一个线程
79 | .setThreadNum(1)
80 | //抓取入口url
81 | .startUrl("http://bbs.tianya.cn/list-333-1.shtml")
82 | //通用请求信息
83 | .setPayload(Payload.custom())
84 | //添加自定处理器
85 | .addHandler(new Tianya())
86 | //添加默认结果处理器,输出至控制台
87 | .addHandler(new ConsoleReduceHandler())
88 | //是否启用实时抓取模式,如果启用非实时抓取模式则当任务队列中没有任务的一段时间后爬虫会自动处于close状态
89 | .isDuration(false)
90 | .start();
91 | }
92 | }
93 | ```
94 |
95 | ## 历史版本
96 |
97 | | 版本 | 说明 |
98 | | :----: | :----: |
99 | | 0.1.0 | 支持基本爬虫功能|
100 | | 0.1.5 | 1.支持xpath 2.修复添加cookie失效问题 3.优化底层逻辑 |
101 | | 0.1.7 | 修复底层bug |
102 |
103 |
104 | ## TODO
105 |
106 | * 支持注解方式
107 | * 支持代理池
108 | * 支持Berkeley 内存数据作为url管理器,提供海量url存储并提高存取效率
109 | * 支持热更新
110 | * 支持爬虫治理
111 |
112 |
--------------------------------------------------------------------------------
/cetty-core/pom.xml:
--------------------------------------------------------------------------------
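1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <artifactId>cetty-parent</artifactId>
7 |         <groupId>com.jibug.cetty</groupId>
8 |         <version>0.1.8</version>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 |
12 |     <artifactId>cetty-core</artifactId>
13 |     <build>
14 |         <plugins>
15 |             <plugin>
16 |                 <groupId>org.apache.maven.plugins</groupId>
17 |                 <artifactId>maven-compiler-plugin</artifactId>
18 |                 <configuration>
19 |                     <source>1.8</source>
20 |                     <target>1.8</target>
21 |                 </configuration>
22 |             </plugin>
23 |         </plugins>
24 |     </build>
25 |
26 |     <dependencies>
27 |         <dependency>
28 |             <groupId>org.slf4j</groupId>
29 |             <artifactId>slf4j-api</artifactId>
30 |         </dependency>
31 |         <dependency>
32 |             <groupId>ch.qos.logback</groupId>
33 |             <artifactId>logback-classic</artifactId>
34 |         </dependency>
35 |         <dependency>
36 |             <groupId>org.apache.httpcomponents</groupId>
37 |             <artifactId>httpclient</artifactId>
38 |         </dependency>
39 |         <dependency>
40 |             <groupId>org.apache.httpcomponents</groupId>
41 |             <artifactId>httpasyncclient</artifactId>
42 |         </dependency>
43 |         <dependency>
44 |             <groupId>junit</groupId>
45 |             <artifactId>junit</artifactId>
46 |         </dependency>
47 |
48 |         <dependency>
49 |             <groupId>org.apache.commons</groupId>
50 |             <artifactId>commons-lang3</artifactId>
51 |         </dependency>
52 |         <dependency>
53 |             <groupId>org.apache.commons</groupId>
54 |             <artifactId>commons-io</artifactId>
55 |         </dependency>
56 |         <dependency>
57 |             <groupId>org.jsoup</groupId>
58 |             <artifactId>jsoup</artifactId>
59 |         </dependency>
60 |         <dependency>
61 |             <groupId>cn.wanghaomiao</groupId>
62 |             <artifactId>JsoupXpath</artifactId>
63 |         </dependency>
64 |         <dependency>
65 |             <groupId>com.alibaba</groupId>
66 |             <artifactId>fastjson</artifactId>
67 |         </dependency>
68 |         <dependency>
69 |             <groupId>com.google.guava</groupId>
70 |             <artifactId>guava</artifactId>
71 |         </dependency>
72 |     </dependencies>
73 | </project>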
--------------------------------------------------------------------------------
/cetty-core/src/main/java/com/jibug/cetty/core/AnnotationBootstrap.java:
--------------------------------------------------------------------------------
1 | package com.jibug.cetty.core;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | /**
7 |  * {@link Bootstrap} variant that is constructed from one or more crawler classes.
8 |  *
9 |  * <p>The supplied classes are only collected here: {@link #start()} delegates to
10 |  * {@link Bootstrap#start()} unchanged, and {@link #execute()} merely validates that at
11 |  * least one class was registered.
12 |  *
13 |  * @author heyingcai
14 |  */
15 | public class AnnotationBootstrap extends Bootstrap {
16 |
17 |     /** Crawler classes registered through the constructors. */
18 |     private final List<Class<?>> executorClasses = new ArrayList<>();
19 |
20 |     /**
21 |      * Creates a bootstrap for a single crawler class.
22 |      *
23 |      * @param clazz the crawler class to register
24 |      */
25 |     public AnnotationBootstrap(Class<?> clazz) {
26 |         this.executorClasses.add(clazz);
27 |     }
28 |
29 |     /**
30 |      * Creates a bootstrap for several crawler classes.
31 |      *
32 |      * @param classes the crawler classes to register; must not be {@code null}
33 |      */
34 |     public AnnotationBootstrap(List<Class<?>> classes) {
35 |         this.executorClasses.addAll(classes);
36 |     }
37 |
38 |     /** Starts the crawler exactly as {@link Bootstrap#start()} does. */
39 |     @Override
40 |     public void start() {
41 |         super.start();
42 |     }
43 |
44 |     /**
45 |      * Verifies that at least one crawler class has been registered.
46 |      *
47 |      * @throws IllegalArgumentException if no class was registered
48 |      */
49 |     public void execute() {
50 |         if (this.executorClasses.isEmpty()) {
51 |             throw new IllegalArgumentException("The Crawler Annotation class not found!");
52 |         }
53 |     }
54 | }
55 |
--------------------------------------------------------------------------------
/cetty-core/src/main/java/com/jibug/cetty/core/Bootstrap.java:
--------------------------------------------------------------------------------
1 | package com.jibug.cetty.core;
2 |
3 | import com.jibug.cetty.core.handler.Handler;
4 | import com.jibug.cetty.core.handler.ProcessHandlerAdapter;
5 | import com.jibug.cetty.core.scheduler.Scheduler;
6 | import com.google.common.base.Preconditions;
7 |
8 | import java.util.List;
9 | import java.util.concurrent.ThreadPoolExecutor;
10 |
11 | /**
12 |  * Fluent entry point for configuring and launching a {@link Cetty} crawler.
13 |  *
14 |  * <p>Every configuration method forwards to the wrapped {@link Cetty} instance (or to its
15 |  * handler pipeline) and returns {@code this} so that calls can be chained.
16 |  *
17 |  * @author heyingcai
18 |  */
19 | public class Bootstrap {
20 |
21 |     /** The crawler being configured; one instance per bootstrap. */
22 |     private final Cetty cetty;
23 |
24 |     /** Creates a bootstrap around a fresh {@link Cetty} instance. */
25 |     public Bootstrap() {
26 |         cetty = new Cetty();
27 |     }
28 |
29 |     /**
30 |      * Static factory, equivalent to {@code new Bootstrap()}.
31 |      *
32 |      * @return a new bootstrap
33 |      */
34 |     public static Bootstrap me() {
35 |         return new Bootstrap();
36 |     }
37 |
38 |     /**
39 |      * Appends a handler to the end of the crawler's pipeline.
40 |      *
41 |      * @param handler the handler to append
42 |      * @return this bootstrap
43 |      * @throws NullPointerException if {@code handler} is {@code null}
44 |      */
45 |     public Bootstrap addHandler(Handler handler) {
46 |         cetty.pipeline().addLast(Preconditions.checkNotNull(handler, "handler can not be null"));
47 |         return this;
48 |     }
49 |
50 |     /**
51 |      * Appends a handler to the end of the crawler's pipeline under the given name.
52 |      *
53 |      * @param handler the handler to append
54 |      * @param name    the name passed to the pipeline together with the handler
55 |      * @return this bootstrap
56 |      * @throws NullPointerException if {@code handler} is {@code null}
57 |      */
58 |     public Bootstrap addHandler(Handler handler, String name) {
59 |         cetty.pipeline().addLast(Preconditions.checkNotNull(handler, "handler can not be null"), name);
60 |         return this;
61 |     }
62 |
63 |     /**
64 |      * Sets the number of worker threads.
65 |      *
66 |      * @param threadNum the worker thread count
67 |      * @return this bootstrap
68 |      */
69 |     public Bootstrap setThreadNum(int threadNum) {
70 |         cetty.setThreadNum(threadNum);
71 |         return this;
72 |     }
73 |
74 |     /**
75 |      * Chooses between the asynchronous and the synchronous HTTP client.
76 |      *
77 |      * @param async {@code true} for asynchronous fetching
78 |      * @return this bootstrap
79 |      */
80 |     public Bootstrap isAsync(boolean async) {
81 |         cetty.setAsync(async);
82 |         return this;
83 |     }
84 |
85 |     /**
86 |      * Sets the crawler's duration flag (see {@link Cetty#setDuration(boolean)}).
87 |      *
88 |      * @param duration {@code true} to keep the crawler waiting for new tasks
89 |      * @return this bootstrap
90 |      */
91 |     public Bootstrap isDuration(boolean duration) {
92 |         cetty.setDuration(duration);
93 |         return this;
94 |     }
95 |
96 |     /**
97 |      * Uses a single URL as the crawl entry point.
98 |      *
99 |      * @param url the start URL
100 |      * @return this bootstrap
101 |      */
102 |     public Bootstrap startUrl(String url) {
103 |         cetty.setStartUrl(url);
104 |         return this;
105 |     }
106 |
107 |     /**
108 |      * Uses several URLs as crawl entry points.
109 |      *
110 |      * @param urls the start URLs
111 |      * @return this bootstrap
112 |      */
113 |     public Bootstrap startUrls(List<String> urls) {
114 |         cetty.setStartUrls(urls);
115 |         return this;
116 |     }
117 |
118 |     /**
119 |      * Uses a single seed as the crawl entry point.
120 |      *
121 |      * @param seed the start seed
122 |      * @return this bootstrap
123 |      */
124 |     public Bootstrap startSeed(Seed seed) {
125 |         cetty.setStartSeed(seed);
126 |         return this;
127 |     }
128 |
129 |     /**
130 |      * Uses several seeds as crawl entry points.
131 |      *
132 |      * @param seeds the start seeds
133 |      * @return this bootstrap
134 |      */
135 |     public Bootstrap startSeeds(List<Seed> seeds) {
136 |         cetty.setStartSeeds(seeds);
137 |         return this;
138 |     }
139 |
140 |     /**
141 |      * Replaces the crawler's scheduler.
142 |      *
143 |      * @param scheduler the scheduler to use
144 |      * @return this bootstrap
145 |      */
146 |     public Bootstrap setScheduler(Scheduler scheduler) {
147 |         cetty.setScheduler(scheduler);
148 |         return this;
149 |     }
150 |
151 |     /**
152 |      * Supplies the executor the crawler should run its worker tasks on.
153 |      *
154 |      * @param threadPoolExecutor the executor to use
155 |      * @return this bootstrap
156 |      */
157 |     public Bootstrap setThreadPoolExecutor(ThreadPoolExecutor threadPoolExecutor) {
158 |         cetty.setThreadPoolExecutor(threadPoolExecutor);
159 |         return this;
160 |     }
161 |
162 |     /**
163 |      * Appends the given handler to the end of the pipeline.
164 |      *
165 |      * <p>NOTE(review): this only calls {@code addLast}; whether the handler supersedes the
166 |      * "downloader" handler that {@link Cetty} registers in its constructor depends on
167 |      * {@code HandlerPipeline} - confirm before relying on it.
168 |      *
169 |      * @param handlerAdapter the download handler to append
170 |      * @return this bootstrap
171 |      * @throws NullPointerException if {@code handlerAdapter} is {@code null}
172 |      */
173 |     public Bootstrap setDownloader(ProcessHandlerAdapter handlerAdapter) {
174 |         cetty.pipeline().addLast(Preconditions.checkNotNull(handlerAdapter, "handler can not be null"));
175 |         return this;
176 |     }
177 |
178 |     /**
179 |      * Sets the request payload shared by the crawler's requests.
180 |      *
181 |      * @param payload the payload
182 |      * @return this bootstrap
183 |      */
184 |     public Bootstrap setPayload(Payload payload) {
185 |         cetty.setPayload(payload);
186 |         return this;
187 |     }
188 |
189 |     /**
190 |      * @return the crawler configured by this bootstrap
191 |      */
192 |     public Cetty getCetty() {
193 |         return cetty;
194 |     }
195 |
196 |     /** Launches the crawler via {@link Cetty#startCrawler()}. */
197 |     public void start() {
198 |         cetty.startCrawler();
199 |     }
200 |
201 |     /** Stops the crawler via {@link Cetty#close()}. */
202 |     public void stop() {
203 |         cetty.close();
204 |     }
205 | }
206 |
--------------------------------------------------------------------------------
/cetty-core/src/main/java/com/jibug/cetty/core/Cetty.java:
--------------------------------------------------------------------------------
1 | package com.jibug.cetty.core;
2 |
3 | import com.jibug.cetty.core.concurrent.CountableThreadPool;
4 | import com.jibug.cetty.core.concurrent.NamedThreadFactory;
5 | import com.jibug.cetty.core.handler.HandlerPipeline;
6 | import com.jibug.cetty.core.handler.HttpDownloadHandler;
7 | import com.jibug.cetty.core.net.AsyncHttpClientGenerator;
8 | import com.jibug.cetty.core.net.HttpClientGenerator;
9 | import com.jibug.cetty.core.net.SyncHttpClientGenerator;
10 | import com.jibug.cetty.core.scheduler.QueueScheduler;
11 | import com.jibug.cetty.core.scheduler.Scheduler;
12 | import com.google.common.collect.Lists;
13 | import com.jibug.cetty.core.utils.UrlUtils;
14 | import org.apache.http.impl.client.CloseableHttpClient;
15 | import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
16 | import org.slf4j.Logger;
17 | import org.slf4j.LoggerFactory;
18 |
19 | import java.io.IOException;
20 | import java.util.Arrays;
21 | import java.util.List;
22 | import java.util.UUID;
23 | import java.util.concurrent.LinkedBlockingQueue;
24 | import java.util.concurrent.ThreadPoolExecutor;
25 | import java.util.concurrent.TimeUnit;
26 | import java.util.concurrent.atomic.AtomicInteger;
27 | import java.util.concurrent.locks.Condition;
28 | import java.util.concurrent.locks.ReentrantLock;
29 |
30 | /**
31 |  * Crawler engine: polls {@link Seed}s from the {@link Scheduler} and hands each one to a
32 |  * worker pool, where it is pushed through the {@link HandlerPipeline}.
33 |  *
34 |  * <p>The lifecycle is {@code INIT -> RUNNING -> STOPPED}. Setters that call
35 |  * {@link #checkRunningStat()} throw {@link IllegalStateException} while the crawler runs.
36 |  *
37 |  * @author heyingcai
38 |  * @date 2018/7/3
39 |  */
40 | public class Cetty implements Runnable {
41 |
42 |     private final static int STAT_INIT = 0;
43 |
44 |     private final static int STAT_RUNNING = 1;
45 |
46 |     private final static int STAT_STOPPED = 2;
47 |
48 |     protected Logger logger = LoggerFactory.getLogger(getClass());
49 |
50 |     /** Current lifecycle state; always one of the {@code STAT_*} constants. */
51 |     private final AtomicInteger stat = new AtomicInteger(STAT_INIT);
52 |
53 |     private String name;
54 |
55 |     /** Worker pool that runs {@link SeedTask}s; created in {@link #initComponent()}. */
56 |     private CountableThreadPool countableThreadPool;
57 |
58 |     /** Optional caller-supplied executor handed to the worker pool. */
59 |     private ThreadPoolExecutor threadPoolExecutor;
60 |
61 |     private int threadNum = 1;
62 |
63 |     /** Lock/condition pair used to park the crawl loop until a task finishes. */
64 |     private final ReentrantLock newTask = new ReentrantLock();
65 |
66 |     private final Condition newTaskCondition = newTask.newCondition();
67 |
68 |     /** Seconds to wait for the worker pool to terminate once the crawl loop ends. */
69 |     private long stopAwaitTime = 20;
70 |
71 |     /** Milliseconds the crawl loop waits for a signal before polling the scheduler again. */
72 |     private long newTaskWaitTime = 30000;
73 |
74 |     private List<Seed> startSeeds;
75 |
76 |     /**
77 |      * crawler duration grab
78 |      * default value is not duration
79 |      * when there is no task, the crawler stops after a period of time.
80 |      */
81 |     private boolean duration = false;
82 |
83 |     /**
84 |      * crawler is support async
85 |      * default value is sync
86 |      */
87 |     private boolean async = false;
88 |
89 |     private HttpClientGenerator<CloseableHttpAsyncClient> asyncHttpClientGenerator;
90 |
91 |     private HttpClientGenerator<CloseableHttpClient> httpClientHttpClientGenerator;
92 |
93 |     private CloseableHttpAsyncClient httpAsyncClient;
94 |
95 |     private CloseableHttpClient httpClient;
96 |
97 |     /**
98 |      * crawler request payload
99 |      */
100 |     private Payload payload;
101 |
102 |     /**
103 |      * the crawler global handler
104 |      * these handler all in the pipeline
105 |      */
106 |     private final HandlerPipeline pipeline;
107 |
108 |     /**
109 |      * url scheduler
110 |      */
111 |     private Scheduler scheduler = new QueueScheduler();
112 |
113 |     /**
114 |      * Creates the crawler with its pipeline and registers an {@link HttpDownloadHandler}
115 |      * named "downloader" when the pipeline reports that it has no download handler.
116 |      */
117 |     public Cetty() {
118 |         this.pipeline = new HandlerPipeline(this);
119 |         // downloader handler must have one
120 |         if (!pipeline.checkDownloadHandler()) {
121 |             pipeline.addLast(new HttpDownloadHandler(), "downloader");
122 |         }
123 |     }
124 |
125 |     public Cetty setPayload(Payload payload) {
126 |         this.payload = payload;
127 |         return this;
128 |     }
129 |
130 |     public Payload getPayload() {
131 |         return payload;
132 |     }
133 |
134 |     /**
135 |      * Uses a single URL as the crawl entry point.
136 |      *
137 |      * @throws IllegalStateException if the crawler is running
138 |      */
139 |     public Cetty setStartUrl(String url) {
140 |         checkRunningStat();
141 |         this.startSeeds = Arrays.asList(new Seed(url));
142 |         return this;
143 |     }
144 |
145 |     /**
146 |      * Uses several URLs as crawl entry points; each URL is wrapped in a {@link Seed}.
147 |      *
148 |      * @throws IllegalStateException if the crawler is running
149 |      */
150 |     public Cetty setStartUrls(List<String> urls) {
151 |         checkRunningStat();
152 |         this.startSeeds = convertSeed(urls);
153 |         return this;
154 |     }
155 |
156 |     /**
157 |      * Uses a single seed as the crawl entry point.
158 |      *
159 |      * @throws IllegalStateException if the crawler is running
160 |      */
161 |     public Cetty setStartSeed(Seed seed) {
162 |         checkRunningStat();
163 |         this.startSeeds = Arrays.asList(seed);
164 |         return this;
165 |     }
166 |
167 |     /**
168 |      * Uses several seeds as crawl entry points.
169 |      *
170 |      * @throws IllegalStateException if the crawler is running
171 |      */
172 |     public Cetty setStartSeeds(List<Seed> seeds) {
173 |         checkRunningStat();
174 |         this.startSeeds = seeds;
175 |         return this;
176 |     }
177 |
178 |     /** Wraps every URL in a {@link Seed}, preserving order. */
179 |     private List<Seed> convertSeed(List<String> urls) {
180 |         List<Seed> seeds = Lists.newArrayListWithCapacity(urls.size());
181 |         for (String url : urls) {
182 |             seeds.add(new Seed(url));
183 |         }
184 |         return seeds;
185 |     }
186 |
187 |     /**
188 |      * Replaces the scheduler and moves every seed still queued in the previous scheduler
189 |      * into the new one.
190 |      *
191 |      * @throws IllegalStateException if the crawler is running
192 |      */
193 |     public Cetty setScheduler(Scheduler scheduler) {
194 |         checkRunningStat();
195 |         Scheduler oldScheduler = this.scheduler;
196 |         this.scheduler = scheduler;
197 |         // Draining a scheduler into itself would poll and re-push the same seeds forever.
198 |         if (oldScheduler != null && oldScheduler != scheduler) {
199 |             Seed seed;
200 |             while ((seed = oldScheduler.poll()) != null) {
201 |                 scheduler.push(seed);
202 |             }
203 |         }
204 |         return this;
205 |     }
206 |
207 |     /**
208 |      * Supplies the executor the worker pool should be built on.
209 |      *
210 |      * @throws IllegalStateException if the crawler is running
211 |      */
212 |     public Cetty setThreadPoolExecutor(ThreadPoolExecutor threadPoolExecutor) {
213 |         checkRunningStat();
214 |         this.threadPoolExecutor = threadPoolExecutor;
215 |         return this;
216 |     }
217 |
218 |     public Scheduler getScheduler() {
219 |         return scheduler;
220 |     }
221 |
222 |     public HandlerPipeline pipeline() {
223 |         return pipeline;
224 |     }
225 |
226 |     public boolean isAsync() {
227 |         return async;
228 |     }
229 |
230 |     public void setAsync(boolean async) {
231 |         this.async = async;
232 |     }
233 |
234 |     public void setDuration(boolean duration) {
235 |         this.duration = duration;
236 |     }
237 |
238 |     public HttpClientGenerator<CloseableHttpAsyncClient> getAsyncHttpClientGenerator() {
239 |         return asyncHttpClientGenerator;
240 |     }
241 |
242 |     public HttpClientGenerator<CloseableHttpClient> getHttpClientHttpClientGenerator() {
243 |         return httpClientHttpClientGenerator;
244 |     }
245 |
246 |     public CloseableHttpClient getHttpClient() {
247 |         return httpClient;
248 |     }
249 |
250 |     public CloseableHttpAsyncClient getHttpAsyncClient() {
251 |         return httpAsyncClient;
252 |     }
253 |
254 |     public Cetty setThreadNum(int threadNum) {
255 |         this.threadNum = threadNum;
256 |         return this;
257 |     }
258 |
259 |     /**
260 |      * Resolves a display name for this crawler: the explicit name if one is set, otherwise
261 |      * the payload's domain, otherwise the domain of the first start seed, otherwise a random
262 |      * UUID that is remembered for later calls.
263 |      *
264 |      * <p>A missing payload or missing start seeds are skipped instead of causing a
265 |      * {@link NullPointerException}.
266 |      *
267 |      * @return the crawler name, never {@code null} once the UUID fallback is reached
268 |      */
269 |     public String getName() {
270 |         if (name != null) {
271 |             return name;
272 |         }
273 |         if (payload != null && payload.getDomain() != null) {
274 |             return payload.getDomain();
275 |         }
276 |         if (startSeeds != null && !startSeeds.isEmpty() && startSeeds.get(0) != null) {
277 |             return UrlUtils.getDomain(startSeeds.get(0).getUrl());
278 |         }
279 |         name = UUID.randomUUID().toString();
280 |         return name;
281 |     }
282 |
283 |     /**
284 |      * Crawl loop. Initializes the components, then keeps polling the scheduler: a seed is
285 |      * submitted to the worker pool, an empty poll makes the loop wait for a task to finish.
286 |      * When {@code duration} is off, the loop ends once the poll is empty and no worker is
287 |      * alive. Afterwards the worker pool is shut down and the crawler is stopped.
288 |      *
289 |      * @throws IllegalStateException if the crawler is already running
290 |      */
291 |     @Override
292 |     public void run() {
293 |         checkRunningStat();
294 |         initComponent();
295 |         logger.info("Crawler {} started!", getName());
296 |         while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
297 |             final Seed seed = scheduler.poll();
298 |             if (seed != null) {
299 |                 countableThreadPool.execute(new SeedTask(seed));
300 |                 continue;
301 |             }
302 |             boolean idle = countableThreadPool.getThreadAliveCount() == 0 || stat.get() == STAT_STOPPED;
303 |             if (!duration && idle) {
304 |                 break;
305 |             }
306 |             waitTask();
307 |         }
308 |         if (!countableThreadPool.isShutdown()) {
309 |             // Must be shutdown(), not isShutdown(): awaitTermination can only succeed
310 |             // after the pool has actually been asked to stop.
311 |             countableThreadPool.shutdown();
312 |             try {
313 |                 countableThreadPool.getThreadPoolExecutor().awaitTermination(stopAwaitTime, TimeUnit.SECONDS);
314 |             } catch (InterruptedException e) {
315 |                 logger.error("Cetty {} crawler wait failed !", getName());
316 |             }
317 |         }
318 |         stopCrawler();
319 |     }
320 |
321 |     /** Worker task: downloads one seed through the pipeline, then wakes the crawl loop. */
322 |     private class SeedTask implements Runnable {
323 |
324 |         private final Seed seed;
325 |
326 |         SeedTask(Seed seed) {
327 |             this.seed = seed;
328 |         }
329 |
330 |         @Override
331 |         public void run() {
332 |             try {
333 |                 pipeline.download(seed);
334 |             } catch (Exception e) {
335 |                 // The throwable goes last without a placeholder so SLF4J logs its stack trace.
336 |                 logger.error("Cetty crawler run error", e);
337 |             } finally {
338 |                 signalTask();
339 |             }
340 |         }
341 |     }
342 |
343 |     /**
344 |      * Marks the crawler as stopped, closes the HTTP clients and sets the interrupt flag of
345 |      * the calling thread if it is not already set.
346 |      */
347 |     public void stopCrawler() {
348 |         if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) {
349 |             logger.info("Cetty {} crawler closed!", getName());
350 |         }
351 |
352 |         releaseObject();
353 |
354 |         if (!Thread.currentThread().isInterrupted()) {
355 |             Thread.currentThread().interrupt();
356 |         }
357 |     }
358 |
359 |     /** Runs the crawl loop on a new non-daemon thread. */
360 |     public void startCrawler() {
361 |         Thread thread = new Thread(this);
362 |         thread.setDaemon(false);
363 |         thread.start();
364 |     }
365 |
366 |     /** Closes whichever HTTP clients were created; close failures are logged, not thrown. */
367 |     private void releaseObject() {
368 |         if (httpAsyncClient != null) {
369 |             try {
370 |                 httpAsyncClient.close();
371 |             } catch (IOException e) {
372 |                 logger.warn("close httpAsyncClient error", e);
373 |             }
374 |         }
375 |         if (httpClient != null) {
376 |             try {
377 |                 httpClient.close();
378 |             } catch (IOException e) {
379 |                 logger.warn("close httpClient error", e);
380 |             }
381 |         }
382 |     }
383 |
384 |     /**
385 |      * Stops the crawler from the outside: marks it stopped and wakes the crawl loop so it
386 |      * can exit, closes the HTTP clients and shuts the worker pool down. Safe to call before
387 |      * the crawler was started, in which case no worker pool exists yet.
388 |      */
389 |     public void close() {
390 |         stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
391 |         signalTask();
392 |         releaseObject();
393 |         if (countableThreadPool != null) {
394 |             countableThreadPool.shutdown();
395 |         }
396 |     }
397 |
398 |     /** Parks the crawl loop until a task signals completion or the wait time elapses. */
399 |     private void waitTask() {
400 |         newTask.lock();
401 |         try {
402 |             newTaskCondition.await(newTaskWaitTime, TimeUnit.MILLISECONDS);
403 |         } catch (InterruptedException e) {
404 |             logger.warn("waitNewTask interrupted", e);
405 |             // Restore the flag so the crawl loop's isInterrupted() check can see it.
406 |             Thread.currentThread().interrupt();
407 |         } finally {
408 |             newTask.unlock();
409 |         }
410 |     }
411 |
412 |     /** Wakes every thread parked in {@link #waitTask()}. */
413 |     private void signalTask() {
414 |         // Acquire outside the try block so unlock() only runs when the lock is held.
415 |         newTask.lock();
416 |         try {
417 |             newTaskCondition.signalAll();
418 |         } finally {
419 |             newTask.unlock();
420 |         }
421 |     }
422 |
423 |     /**
424 |      * Guards configuration changes.
425 |      *
426 |      * @throws IllegalStateException if the crawler is running
427 |      */
428 |     protected void checkRunningStat() {
429 |         if (stat.get() == STAT_RUNNING) {
430 |             throw new IllegalStateException("Crawler is already running!");
431 |         }
432 |     }
433 |
434 |     /** Pushes the seed to the scheduler unless the seed or its URL is {@code null}. */
435 |     private void pushSeed(Seed seed) {
436 |         if (seed != null && seed.getUrl() != null) {
437 |             scheduler.push(seed);
438 |         }
439 |     }
440 |
441 |     /**
442 |      * Prepares everything the crawl loop needs: worker pool, HTTP client, start seeds and
443 |      * pipeline, and finally switches the state to running.
444 |      */
445 |     private void initComponent() {
446 |         // The pool is validated first so a bad threadNum fails before a client is opened.
447 |         initThreadPool();
448 |         initHttpClient();
449 |
450 |         if (startSeeds != null) {
451 |             startSeeds.forEach(this::pushSeed);
452 |         }
453 |
454 |         pipeline.start();
455 |
456 |         stat.set(STAT_RUNNING);
457 |     }
458 |
459 |     /** Creates the worker pool when none exists yet or the existing one was shut down. */
460 |     private void initThreadPool() {
461 |         if (countableThreadPool != null && !countableThreadPool.isShutdown()) {
462 |             return;
463 |         }
464 |         if (threadNum <= 0) {
465 |             throw new IllegalStateException("threadNum must be greater than 0, but was " + threadNum);
466 |         }
467 |         if (threadPoolExecutor != null && !threadPoolExecutor.isShutdown()) {
468 |             countableThreadPool = new CountableThreadPool(threadNum, threadPoolExecutor);
469 |         } else {
470 |             countableThreadPool = new CountableThreadPool(threadNum);
471 |         }
472 |     }
473 |
474 |     /** Creates the asynchronous or the synchronous HTTP client, depending on {@code async}. */
475 |     private void initHttpClient() {
476 |         if (async) {
477 |             asyncHttpClientGenerator = new AsyncHttpClientGenerator();
478 |             httpAsyncClient = asyncHttpClientGenerator.getClient(getPayload());
479 |             httpAsyncClient.start();
480 |         } else {
481 |             httpClientHttpClientGenerator = new SyncHttpClientGenerator();
482 |             httpClient = httpClientHttpClientGenerator.getClient(getPayload());
483 |         }
484 |     }
485 |
486 | }
487 |
--------------------------------------------------------------------------------
/cetty-core/src/main/java/com/jibug/cetty/core/Page.java:
--------------------------------------------------------------------------------
1 | package com.jibug.cetty.core;
2 |
3 | import com.google.common.collect.Lists;
4 | import org.jsoup.Jsoup;
5 | import org.jsoup.nodes.Document;
6 | import org.seimicrawler.xpath.JXDocument;
7 | import org.seimicrawler.xpath.JXNode;
8 |
9 | import java.util.List;
10 | import java.util.Map;
11 |
12 | /**
13 | * @author heyingcai
14 | */
15 | public class Page {
16 |
17 | private String url;
18 |
19 | private Seed seed;
20 |
21 | private Result result = new Result();
22 |
23 | private String rawData;
24 |
25 | private byte[] bytes;
26 |
27 | private List nextSeeds = Lists.newLinkedList();
28 |
29 | private Map> headers;
30 |
31 | private Html html;
32 |
33 | public String getUrl() {
34 | return url;
35 | }
36 |
37 | public void setUrl(String url) {
38 | this.url = url;
39 | }
40 |
41 | public Seed getSeed() {
42 | return seed;
43 | }
44 |
45 | public void setSeed(Seed seed) {
46 | this.seed = seed;
47 | }
48 |
49 | public Result getResult() {
50 | result.setSeed(seed);
51 | return result;
52 | }
53 |
54 | public void setResult(Result result) {
55 | this.result = result;
56 | }
57 |
58 | public void addNextSeed(Seed seed) {
59 | nextSeeds.add(seed);
60 | }
61 |
62 | public void addNextSeed(String url) {
63 | nextSeeds.add(new Seed(url));
64 | }
65 |
66 | public void addNextSeed(List seeds) {
67 | nextSeeds.addAll(seeds);
68 | }
69 |
70 | public List getNextSeeds() {
71 | return nextSeeds;
72 | }
73 |
74 | public String getRawData() {
75 | return rawData;
76 | }
77 |
78 | public void setRawData(String rawData) {
79 | this.rawData = rawData;
80 | }
81 |
82 | public byte[] getBytes() {
83 | return bytes;
84 | }
85 |
86 | public void setBytes(byte[] bytes) {
87 | this.bytes = bytes;
88 | }
89 |
90 | public void setNextSeeds(List nextSeeds) {
91 | this.nextSeeds = nextSeeds;
92 | }
93 |
94 | public Map> getHeaders() {
95 | return headers;
96 | }
97 |
98 | public void setHeaders(Map> headers) {
99 | this.headers = headers;
100 | }
101 |
102 | public void setDocument(String text, String url) {
103 | try {
104 | this.html = new Html(Jsoup.parse(text, url));
105 | } catch (Exception e) {
106 | this.html = new Html(null);
107 | }
108 | }
109 |
110 | public Document getDocument() {
111 | return html.document;
112 | }
113 |
114 | public Html getHtml() {
115 | return html;
116 | }
117 |
118 | public class Html {
119 |
120 | private JXDocument jxDocument;
121 |
122 | private Document document;
123 |
124 | Html(Document document) {
125 | this.document = document;
126 | this.jxDocument = JXDocument.create(document);
127 | }
128 |
129 | public List