├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── cetty-core ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── jibug │ │ │ └── cetty │ │ │ └── core │ │ │ ├── AnnotationBootstrap.java │ │ │ ├── Bootstrap.java │ │ │ ├── Cetty.java │ │ │ ├── Page.java │ │ │ ├── Payload.java │ │ │ ├── Result.java │ │ │ ├── Seed.java │ │ │ ├── annotation │ │ │ ├── Query.java │ │ │ └── TargetUrl.java │ │ │ ├── concurrent │ │ │ ├── CettyAbortPolicy.java │ │ │ ├── CettyThreadPool.java │ │ │ ├── CountableThreadPool.java │ │ │ ├── NamedThreadFactory.java │ │ │ └── ThreadPerTaskExecutor.java │ │ │ ├── constants │ │ │ └── HttpConstants.java │ │ │ ├── context │ │ │ └── CettyContext.java │ │ │ ├── handler │ │ │ ├── AbstractHandlerContext.java │ │ │ ├── ConsoleReduceHandler.java │ │ │ ├── DefaultHandlerContext.java │ │ │ ├── Handler.java │ │ │ ├── HandlerContext.java │ │ │ ├── HandlerInitializer.java │ │ │ ├── HandlerPipeline.java │ │ │ ├── HttpDownloadHandler.java │ │ │ ├── PageProcessHandler.java │ │ │ ├── ProcessHandler.java │ │ │ ├── ProcessHandlerAdapter.java │ │ │ ├── ReduceHandler.java │ │ │ └── ReduceHandlerAdapter.java │ │ │ ├── model │ │ │ ├── AnnotationConfig.java │ │ │ └── RequestBody.java │ │ │ ├── net │ │ │ ├── AbstractHttpClientGenerator.java │ │ │ ├── AsyncHttpClientGenerator.java │ │ │ ├── HttpClientFactory.java │ │ │ ├── HttpClientGenerator.java │ │ │ ├── Proxy.java │ │ │ ├── ProxyStrategy.java │ │ │ └── SyncHttpClientGenerator.java │ │ │ ├── scheduler │ │ │ ├── DuplicateStrategy.java │ │ │ ├── QueueScheduler.java │ │ │ └── Scheduler.java │ │ │ └── utils │ │ │ └── UrlUtils.java │ └── resources │ │ └── logback.xml │ └── test │ └── java │ └── com │ └── jibug │ └── cetty │ ├── core │ ├── bootstrap │ │ └── BootstrapTest.java │ ├── handler │ │ └── HandlerTest.java │ └── net │ │ ├── AsyncHttpClientTest.java │ │ └── SyncHttpClientTest.java │ └── sample │ ├── kuaidaili │ └── Kuaidaili.java │ └── tianya │ └── Tianya.java ├── cetty-samples ├── .gitignore ├── 
pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── jibug │ │ └── cetty │ │ └── sample │ │ ├── SampleApplication.java │ │ ├── entity │ │ └── Article.java │ │ ├── handler │ │ ├── BasePageHandler.java │ │ ├── CifnewsPageHandler.java │ │ ├── GuxiaobeiPageHandler.java │ │ └── Waimaob2cPageHandler.java │ │ ├── reduce │ │ └── ArticleReducer.java │ │ ├── runner │ │ └── CrawlerRunner.java │ │ └── service │ │ ├── CrawlerService.java │ │ └── TaskService.java │ └── resources │ ├── application.properties │ └── crawler.json └── pom.xml /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | #package files 4 | 5 | *.war 6 | *.ear 7 | 8 | #kdiff3 ignore 9 | target/ 10 | 11 | #eclipse ignore 12 | .settings/ 13 | .project 14 | .classpath 15 | 16 | #idea 17 | .idea/ 18 | /idea/ 19 | *.ipr 20 | *.iml 21 | *.iws 22 | 23 | # temp file 24 | 25 | *.log 26 | *.cache 27 | *.diff 28 | *.patch 29 | *.tmp 30 | 31 | #system ignore 32 | .DS_Store 33 | Thumbs.db 34 | /.idea/ 35 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | jdk: 4 | - openjdk8 5 | 6 | notifications: 7 | email: false 8 | 9 | sudo: false 10 | 11 | 12 | cache: 13 | directories: 14 | - $HOME/.m2 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 heyingcai 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cetty 2 | 3 | 一个轻量级的基于事件分发的爬虫框架。 4 | 5 | [![Build Status](https://www.travis-ci.org/heyingcai/cetty.svg?branch=master)](https://travis-ci.org/heyingcai/cetty) 6 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/heyingcai/cetty/blob/master/LICENSE) 7 | [![](https://img.shields.io/badge/language-java-yellowgreen.svg)](https://img.shields.io/badge/language-java-yellowgreen.svg) 8 | 9 | 10 | >An event dispatch crawler framework. 11 | 12 | ![](https://s1.ax1x.com/2018/11/12/iOAjG8.png) 13 | 14 | ## 功能介绍 15 | * 基于完全自定义事件处理机制的爬虫框架。 16 | * 模块化的设计,提供强大的可扩展性。 17 | * 基于HttpClient支持同步和异步数据抓取。 18 | * 支持多线程。 19 | * 基于Jsoup页面解析框架提供强大的网页解析处理能力。 20 | 21 | ## 快速开始 22 | ### 使用Maven 23 | ```xml 24 | <dependency> 25 | <groupId>com.jibug.cetty</groupId> 26 | <artifactId>cetty-core</artifactId> 27 | <version>0.1.8</version> 28 | </dependency> 29 | ``` 30 | 31 | ## 帮助 32 | 1.详细文档:[http://cetty.jibug.com/](http://cetty.jibug.com/)
33 | 2.QQ群
34 | ![](https://s1.ax1x.com/2018/11/20/F9GsFs.png)
35 | 3.bug反馈:[issues](https://github.com/heyingcai/cetty/issues) 36 | 37 | ## 让我们来写第一个demo 38 | 39 | ```java 40 | /** 41 | * 抓取天涯论坛文章列表标题 42 | * http://bbs.tianya.cn/list-333-1.shtml 43 | * 44 | * @author heyingcai 45 | */ 46 | public class Tianya extends ProcessHandlerAdapter { 47 | 48 | @Override 49 | public void process(HandlerContext ctx, Page page) { 50 | //获取 Document 51 | Document document = page.getDocument(); 52 | //dom解析 53 | Elements itemElements = document. 54 | select("div#bbsdoc>div#bd>div#main>div.mt5>table>tbody"). 55 | get(2). 56 | select("tr"); 57 | List<String> titles = Lists.newArrayList(); 58 | for (Element item : itemElements) { 59 | String title = item.select("td.td-title").text(); 60 | titles.add(title); 61 | } 62 | 63 | //获取Result对象,将我们解析出来的结果向下一个handler传递 64 | Result result = page.getResult(); 65 | result.addResults(titles); 66 | 67 | //通过fireXXX 方法将本handler 处理的结果向下传递 68 | //本教程直接将结果传递给ConsoleReduceHandler,将结果直接输出控制台 69 | ctx.fireReduce(page); 70 | } 71 | 72 | public static void main(String[] args) { 73 | //启动引导类 74 | Bootstrap. 
75 | me() 76 | //使用同步抓取 77 | .isAsync(false) 78 | //开启一个线程 79 | .setThreadNum(1) 80 | //抓取入口url 81 | .startUrl("http://bbs.tianya.cn/list-333-1.shtml") 82 | //通用请求信息 83 | .setPayload(Payload.custom()) 84 | //添加自定处理器 85 | .addHandler(new Tianya()) 86 | //添加默认结果处理器,输出至控制台 87 | .addHandler(new ConsoleReduceHandler()) 88 | //是否启用实时抓取模式,如果启用非实时抓取模式则当任务队列中没有任务的一段时间后爬虫会自动处于close状态 89 | .isDuration(false) 90 | .start(); 91 | } 92 | } 93 | ``` 94 | 95 | ## 历史版本 96 | 97 | | 版本 | 说明 | 98 | | :----: | :----: | 99 | | 0.1.0 | 支持基本爬虫功能| 100 | | 0.1.5 | 1.支持xpath 2.修复添加cookie失效问题 3.优化底层逻辑 | 101 | | 0.1.7 | 修复底层bug | 102 | 103 | 104 | ## TODO 105 | 106 | * 支持注解方式 107 | * 支持代理池 108 | * 支持Berkeley 内存数据作为url管理器,提供海量url存储并提高存取效率 109 | * 支持热更新 110 | * 支持爬虫治理 111 | 112 | -------------------------------------------------------------------------------- /cetty-core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | cetty-parent 7 | com.jibug.cetty 8 | 0.1.8 9 | 10 | 4.0.0 11 | 12 | cetty-core 13 | 14 | 15 | 16 | org.apache.maven.plugins 17 | maven-compiler-plugin 18 | 19 | 1.8 20 | 1.8 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | org.slf4j 29 | slf4j-api 30 | 31 | 32 | ch.qos.logback 33 | logback-classic 34 | 35 | 36 | org.apache.httpcomponents 37 | httpclient 38 | 39 | 40 | org.apache.httpcomponents 41 | httpasyncclient 42 | 43 | 44 | junit 45 | junit 46 | 47 | 48 | 49 | org.apache.commons 50 | commons-lang3 51 | 52 | 53 | org.apache.commons 54 | commons-io 55 | 56 | 57 | org.jsoup 58 | jsoup 59 | 60 | 61 | cn.wanghaomiao 62 | JsoupXpath 63 | 64 | 65 | com.alibaba 66 | fastjson 67 | 68 | 69 | com.google.guava 70 | guava 71 | 72 | 73 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/AnnotationBootstrap.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core; 2 | 3 | import java.util.ArrayList; 4 | 
import java.util.List; 5 | 6 | /** 7 | * @author heyingcai 8 | */ 9 | public class AnnotationBootstrap extends Bootstrap { 10 | 11 | private List> executorClasses = new ArrayList<>(); 12 | 13 | public AnnotationBootstrap(Class clazz) { 14 | this.executorClasses.add(clazz); 15 | } 16 | 17 | public AnnotationBootstrap(List> classes) { 18 | this.executorClasses.addAll(classes); 19 | } 20 | 21 | @Override 22 | public void start() { 23 | super.start(); 24 | } 25 | 26 | public void execute() { 27 | if (this.executorClasses.size() == 0) { 28 | throw new IllegalArgumentException("The Crawler Annotation class not found!"); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/Bootstrap.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core; 2 | 3 | import com.jibug.cetty.core.handler.Handler; 4 | import com.jibug.cetty.core.handler.ProcessHandlerAdapter; 5 | import com.jibug.cetty.core.scheduler.Scheduler; 6 | import com.google.common.base.Preconditions; 7 | 8 | import java.util.List; 9 | import java.util.concurrent.ThreadPoolExecutor; 10 | 11 | /** 12 | * @author heyingcai 13 | */ 14 | public class Bootstrap { 15 | 16 | private Cetty cetty; 17 | 18 | public Bootstrap() { 19 | cetty = new Cetty(); 20 | } 21 | 22 | public static Bootstrap me() { 23 | return new Bootstrap(); 24 | } 25 | 26 | public Bootstrap addHandler(Handler handler) { 27 | cetty.pipeline().addLast(Preconditions.checkNotNull(handler, "handler can not be null")); 28 | return this; 29 | } 30 | 31 | public Bootstrap addHandler(Handler handler, String name) { 32 | cetty.pipeline().addLast(Preconditions.checkNotNull(handler, "handler can not be null"), name); 33 | return this; 34 | } 35 | 36 | public Bootstrap setThreadNum(int threadNum) { 37 | cetty.setThreadNum(threadNum); 38 | return this; 39 | } 40 | 41 | public Bootstrap isAsync(boolean async) { 
42 | cetty.setAsync(async); 43 | return this; 44 | } 45 | 46 | public Bootstrap isDuration(boolean duration) { 47 | cetty.setDuration(duration); 48 | return this; 49 | } 50 | 51 | public Bootstrap startUrl(String url) { 52 | cetty.setStartUrl(url); 53 | return this; 54 | } 55 | 56 | public Bootstrap startUrls(List urls) { 57 | cetty.setStartUrls(urls); 58 | return this; 59 | } 60 | 61 | public Bootstrap startSeed(Seed seed) { 62 | cetty.setStartSeed(seed); 63 | return this; 64 | } 65 | 66 | public Bootstrap startSeeds(List seeds) { 67 | cetty.setStartSeeds(seeds); 68 | return this; 69 | } 70 | 71 | public Bootstrap setScheduler(Scheduler scheduler) { 72 | cetty.setScheduler(scheduler); 73 | return this; 74 | } 75 | 76 | public Bootstrap setThreadPoolExecutor(ThreadPoolExecutor threadPoolExecutor) { 77 | cetty.setThreadPoolExecutor(threadPoolExecutor); 78 | return this; 79 | } 80 | 81 | public Bootstrap setDownloader(ProcessHandlerAdapter handlerAdapter) { 82 | cetty.pipeline().addLast(Preconditions.checkNotNull(handlerAdapter, "handler can not be null")); 83 | return this; 84 | } 85 | 86 | public Bootstrap setPayload(Payload payload) { 87 | cetty.setPayload(payload); 88 | return this; 89 | } 90 | 91 | public Cetty getCetty() { 92 | return cetty; 93 | } 94 | 95 | public void start() { 96 | cetty.startCrawler(); 97 | } 98 | 99 | public void stop() { 100 | cetty.close(); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/Cetty.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core; 2 | 3 | import com.jibug.cetty.core.concurrent.CountableThreadPool; 4 | import com.jibug.cetty.core.concurrent.NamedThreadFactory; 5 | import com.jibug.cetty.core.handler.HandlerPipeline; 6 | import com.jibug.cetty.core.handler.HttpDownloadHandler; 7 | import com.jibug.cetty.core.net.AsyncHttpClientGenerator; 8 | import 
com.jibug.cetty.core.net.HttpClientGenerator; 9 | import com.jibug.cetty.core.net.SyncHttpClientGenerator; 10 | import com.jibug.cetty.core.scheduler.QueueScheduler; 11 | import com.jibug.cetty.core.scheduler.Scheduler; 12 | import com.google.common.collect.Lists; 13 | import com.jibug.cetty.core.utils.UrlUtils; 14 | import org.apache.http.impl.client.CloseableHttpClient; 15 | import org.apache.http.impl.nio.client.CloseableHttpAsyncClient; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | import java.util.UUID; 23 | import java.util.concurrent.LinkedBlockingQueue; 24 | import java.util.concurrent.ThreadPoolExecutor; 25 | import java.util.concurrent.TimeUnit; 26 | import java.util.concurrent.atomic.AtomicInteger; 27 | import java.util.concurrent.locks.Condition; 28 | import java.util.concurrent.locks.ReentrantLock; 29 | 30 | /** 31 | * @author heyingcai 32 | * @date 2018/7/3 33 | */ 34 | public class Cetty implements Runnable { 35 | 36 | protected Logger logger = LoggerFactory.getLogger(getClass()); 37 | 38 | private AtomicInteger stat = new AtomicInteger(STAT_INIT); 39 | 40 | private final static int STAT_INIT = 0; 41 | 42 | private final static int STAT_RUNNING = 1; 43 | 44 | private final static int STAT_STOPPED = 2; 45 | 46 | private String name; 47 | 48 | private CountableThreadPool countableThreadPool; 49 | 50 | private ThreadPoolExecutor threadPoolExecutor; 51 | 52 | private int threadNum = 1; 53 | 54 | private ReentrantLock newTask = new ReentrantLock(); 55 | 56 | private Condition newTaskCondition = newTask.newCondition(); 57 | 58 | private long stopAwaitTime = 20; 59 | 60 | private long newTaskWaitTime = 30000; 61 | 62 | private List startSeeds; 63 | 64 | /** 65 | * crawler duration grab 66 | * default value is not duration 67 | * when there is no task, the crawler stops after a period of time. 
68 | */ 69 | private boolean duration = false; 70 | 71 | /** 72 | * crawler is support async 73 | * default value is sync 74 | */ 75 | private boolean async = false; 76 | 77 | private HttpClientGenerator asyncHttpClientGenerator; 78 | 79 | private HttpClientGenerator httpClientHttpClientGenerator; 80 | 81 | private CloseableHttpAsyncClient httpAsyncClient; 82 | 83 | private CloseableHttpClient httpClient; 84 | 85 | /** 86 | * crawler request payload 87 | */ 88 | private Payload payload; 89 | 90 | /** 91 | * the crawler global handler 92 | * these handler all in the pipeline 93 | */ 94 | private HandlerPipeline pipeline; 95 | 96 | /** 97 | * url scheduler 98 | */ 99 | private Scheduler scheduler = new QueueScheduler(); 100 | 101 | public Cetty() { 102 | this.pipeline = new HandlerPipeline(this); 103 | // downloader handler must have one 104 | boolean hasDownloadHandler = pipeline.checkDownloadHandler(); 105 | if (!hasDownloadHandler) { 106 | pipeline.addLast(new HttpDownloadHandler(), "downloader"); 107 | } 108 | 109 | } 110 | 111 | public Cetty setPayload(Payload payload) { 112 | this.payload = payload; 113 | return this; 114 | } 115 | 116 | public Payload getPayload() { 117 | return payload; 118 | } 119 | 120 | public Cetty setStartUrl(String url) { 121 | checkRunningStat(); 122 | this.startSeeds = Arrays.asList(new Seed(url)); 123 | return this; 124 | } 125 | 126 | public Cetty setStartUrls(List urls) { 127 | checkRunningStat(); 128 | this.startSeeds = convertSeed(urls); 129 | return this; 130 | } 131 | 132 | public Cetty setStartSeed(Seed seed) { 133 | checkRunningStat(); 134 | this.startSeeds = Arrays.asList(seed); 135 | return this; 136 | } 137 | 138 | public Cetty setStartSeeds(List seeds) { 139 | checkRunningStat(); 140 | this.startSeeds = seeds; 141 | return this; 142 | } 143 | 144 | private List convertSeed(List urls) { 145 | List seeds = Lists.newArrayListWithCapacity(urls.size()); 146 | urls.forEach(url -> { 147 | seeds.add(new Seed(url)); 148 | }); 149 
| return seeds; 150 | } 151 | 152 | public Cetty setScheduler(Scheduler scheduler) { 153 | checkRunningStat(); 154 | Scheduler oldScheduler = this.scheduler; 155 | this.scheduler = scheduler; 156 | if (oldScheduler != null) { 157 | Seed seed; 158 | while ((seed = oldScheduler.poll()) != null) { 159 | scheduler.push(seed); 160 | } 161 | } 162 | return this; 163 | } 164 | 165 | public Cetty setThreadPoolExecutor(ThreadPoolExecutor threadPoolExecutor) { 166 | checkRunningStat(); 167 | this.threadPoolExecutor = threadPoolExecutor; 168 | return this; 169 | } 170 | 171 | public Scheduler getScheduler() { 172 | return scheduler; 173 | } 174 | 175 | public HandlerPipeline pipeline() { 176 | return pipeline; 177 | } 178 | 179 | public boolean isAsync() { 180 | return async; 181 | } 182 | 183 | public void setAsync(boolean async) { 184 | this.async = async; 185 | } 186 | 187 | public void setDuration(boolean duration) { 188 | this.duration = duration; 189 | } 190 | 191 | public HttpClientGenerator getAsyncHttpClientGenerator() { 192 | return asyncHttpClientGenerator; 193 | } 194 | 195 | public HttpClientGenerator getHttpClientHttpClientGenerator() { 196 | return httpClientHttpClientGenerator; 197 | } 198 | 199 | public CloseableHttpClient getHttpClient() { 200 | return httpClient; 201 | } 202 | 203 | public CloseableHttpAsyncClient getHttpAsyncClient() { 204 | return httpAsyncClient; 205 | } 206 | 207 | public Cetty setThreadNum(int threadNum) { 208 | this.threadNum = threadNum; 209 | return this; 210 | } 211 | 212 | public String getName() { /* resolves a display name: explicit name, then payload domain, then first seed's domain, else a random UUID */ 213 | if (name != null) { 214 | return name; 215 | } 216 | if (payload != null && payload.getDomain() != null) { /* payload is optional and has no default value */ 217 | return payload.getDomain(); 218 | } 219 | if (startSeeds != null && startSeeds.size() > 0) { /* startSeeds stays null until a start url/seed is set */ 220 | Seed seed = startSeeds.get(0); 221 | return UrlUtils.getDomain(seed.getUrl()); 222 | } 223 | name = UUID.randomUUID().toString(); 224 | return name; 225 | } 226 | 227 | @Override 228 | public void run() { 229 | checkRunningStat(); 230 | initComponent(); 231 | 
logger.info("Crawler {} started!", getName()); 232 | while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { 233 | final Seed seed = scheduler.poll(); 234 | 235 | if (seed == null) { 236 | if (!duration) { 237 | if (countableThreadPool.getThreadAliveCount() == 0 || stat.get() == STAT_STOPPED) { 238 | break; 239 | } 240 | } 241 | waitTask(); 242 | } else { 243 | countableThreadPool.execute(new SeedTask(seed)); 244 | } 245 | } 246 | if (!countableThreadPool.isShutdown()) { 247 | countableThreadPool.isShutdown(); 248 | try { 249 | countableThreadPool.getThreadPoolExecutor().awaitTermination(stopAwaitTime, TimeUnit.SECONDS); 250 | } catch (InterruptedException e) { 251 | logger.error("Cetty {} crawler wait failed !", getName()); 252 | } 253 | } 254 | stopCrawler(); 255 | } 256 | 257 | private class SeedTask implements Runnable { 258 | 259 | private Seed seed; 260 | 261 | SeedTask(Seed seed) { 262 | this.seed = seed; 263 | } 264 | 265 | @Override 266 | public void run() { 267 | try { 268 | pipeline.download(seed); 269 | } catch (Exception e) { 270 | logger.error("Cetty crawler run error {}", e); 271 | } finally { 272 | signalTask(); 273 | } 274 | } 275 | } 276 | 277 | public void stopCrawler() { 278 | if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) { 279 | logger.info("Cetty {} crawler closed!", getName()); 280 | } 281 | 282 | releaseObject(); 283 | 284 | if (!Thread.currentThread().isInterrupted()) { 285 | Thread.currentThread().interrupt(); 286 | } 287 | } 288 | 289 | public void startCrawler() { 290 | Thread thread = new Thread(this); 291 | thread.setDaemon(false); 292 | thread.start(); 293 | } 294 | 295 | private void releaseObject() { 296 | if (httpAsyncClient != null) { 297 | try { 298 | httpAsyncClient.close(); 299 | } catch (IOException e) { 300 | logger.warn("close httpAsyncClient error {}", e); 301 | } 302 | } 303 | if (httpClient != null) { 304 | try { 305 | httpClient.close(); 306 | } catch (IOException e) { 307 | 
logger.warn("close httpClient error {}", e); 308 | } 309 | } 310 | } 311 | 312 | public void close() { 313 | releaseObject(); 314 | countableThreadPool.shutdown(); 315 | } 316 | 317 | private void waitTask() { 318 | newTask.lock(); 319 | try { 320 | newTaskCondition.await(newTaskWaitTime, TimeUnit.MILLISECONDS); 321 | } catch (InterruptedException e) { 322 | logger.warn("waitNewTask interrupted, error {}", e); 323 | } finally { 324 | newTask.unlock(); 325 | } 326 | } 327 | 328 | private void signalTask() { 329 | try { 330 | newTask.lock(); 331 | newTaskCondition.signalAll(); 332 | } finally { 333 | newTask.unlock(); 334 | } 335 | } 336 | 337 | protected void checkRunningStat() { 338 | if (stat.get() == STAT_RUNNING) { 339 | throw new IllegalStateException("Crawler is already running!"); 340 | } 341 | } 342 | 343 | private void pushSeed(Seed seed) { 344 | if (seed != null && seed.getUrl() != null) { 345 | scheduler.push(seed); 346 | } 347 | } 348 | 349 | private void initComponent() { 350 | HandlerPipeline pipeline = this.pipeline(); 351 | 352 | if (async) { 353 | asyncHttpClientGenerator = new AsyncHttpClientGenerator(); 354 | httpAsyncClient = asyncHttpClientGenerator.getClient(getPayload()); 355 | httpAsyncClient.start(); 356 | } else { 357 | httpClientHttpClientGenerator = new SyncHttpClientGenerator(); 358 | httpClient = httpClientHttpClientGenerator.getClient(getPayload()); 359 | } 360 | 361 | boolean threadPoolAvailable = threadNum > 0 && countableThreadPool == null || countableThreadPool.isShutdown(); 362 | if (threadPoolAvailable) { 363 | if (threadPoolExecutor != null && !threadPoolExecutor.isShutdown()) { 364 | countableThreadPool = new CountableThreadPool(threadNum, threadPoolExecutor); 365 | } else { 366 | countableThreadPool = new CountableThreadPool(threadNum); 367 | } 368 | } 369 | 370 | if (startSeeds != null) { 371 | startSeeds.forEach(seed -> { 372 | pushSeed(seed); 373 | }); 374 | } 375 | 376 | pipeline.start(); 377 | 378 | 
stat.set(STAT_RUNNING); 379 | } 380 | 381 | } 382 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/Page.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.nodes.Document; 6 | import org.seimicrawler.xpath.JXDocument; 7 | import org.seimicrawler.xpath.JXNode; 8 | 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | /** 13 | * @author heyingcai 14 | */ 15 | public class Page { 16 | 17 | private String url; 18 | 19 | private Seed seed; 20 | 21 | private Result result = new Result(); 22 | 23 | private String rawData; 24 | 25 | private byte[] bytes; 26 | 27 | private List nextSeeds = Lists.newLinkedList(); 28 | 29 | private Map> headers; 30 | 31 | private Html html; 32 | 33 | public String getUrl() { 34 | return url; 35 | } 36 | 37 | public void setUrl(String url) { 38 | this.url = url; 39 | } 40 | 41 | public Seed getSeed() { 42 | return seed; 43 | } 44 | 45 | public void setSeed(Seed seed) { 46 | this.seed = seed; 47 | } 48 | 49 | public Result getResult() { 50 | result.setSeed(seed); 51 | return result; 52 | } 53 | 54 | public void setResult(Result result) { 55 | this.result = result; 56 | } 57 | 58 | public void addNextSeed(Seed seed) { 59 | nextSeeds.add(seed); 60 | } 61 | 62 | public void addNextSeed(String url) { 63 | nextSeeds.add(new Seed(url)); 64 | } 65 | 66 | public void addNextSeed(List seeds) { 67 | nextSeeds.addAll(seeds); 68 | } 69 | 70 | public List getNextSeeds() { 71 | return nextSeeds; 72 | } 73 | 74 | public String getRawData() { 75 | return rawData; 76 | } 77 | 78 | public void setRawData(String rawData) { 79 | this.rawData = rawData; 80 | } 81 | 82 | public byte[] getBytes() { 83 | return bytes; 84 | } 85 | 86 | public void setBytes(byte[] bytes) { 87 | this.bytes = bytes; 88 | } 89 | 90 | 
public void setNextSeeds(List nextSeeds) { 91 | this.nextSeeds = nextSeeds; 92 | } 93 | 94 | public Map> getHeaders() { 95 | return headers; 96 | } 97 | 98 | public void setHeaders(Map> headers) { 99 | this.headers = headers; 100 | } 101 | 102 | public void setDocument(String text, String url) { 103 | try { 104 | this.html = new Html(Jsoup.parse(text, url)); 105 | } catch (Exception e) { 106 | this.html = new Html(null); 107 | } 108 | } 109 | 110 | public Document getDocument() { 111 | return html.document; 112 | } 113 | 114 | public Html getHtml() { 115 | return html; 116 | } 117 | 118 | public class Html { 119 | 120 | private JXDocument jxDocument; 121 | 122 | private Document document; 123 | 124 | Html(Document document) { 125 | this.document = document; 126 | this.jxDocument = JXDocument.create(document); 127 | } 128 | 129 | public List select(String xpath) { 130 | return html.jxDocument.sel(xpath); 131 | } 132 | 133 | public Object selectOne(String xpath) { 134 | return html.jxDocument.selOne(xpath); 135 | } 136 | 137 | public List selectNode(String xpath) { 138 | return html.jxDocument.selN(xpath); 139 | } 140 | 141 | public JXNode selectOneNode(String xpath) { 142 | return html.jxDocument.selNOne(xpath); 143 | } 144 | } 145 | 146 | } 147 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/Payload.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core; 2 | 3 | import com.jibug.cetty.core.net.Proxy; 4 | import com.google.common.collect.Maps; 5 | 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | /** 10 | * http request payload 11 | * 12 | * @author heyingcai 13 | */ 14 | public class Payload { 15 | 16 | /** 17 | * set domain 18 | */ 19 | private String domain; 20 | 21 | /** 22 | * set ua 23 | */ 24 | private String userAgent; 25 | 26 | /** 27 | * set charset 28 | */ 29 | private String charset = "utf-8"; 30 
| 31 | /** 32 | * set proxy 33 | */ 34 | private Proxy proxy; 35 | 36 | /** 37 | * set origin cookies 38 | */ 39 | private Map originCookies = Maps.newHashMap(); 40 | 41 | /** 42 | * set specific cookies 43 | */ 44 | private Map> cookies = Maps.newHashMap(); 45 | 46 | /** 47 | * set http request headers 48 | */ 49 | private Map headers = Maps.newHashMap(); 50 | 51 | /** 52 | * set socket timeout 53 | */ 54 | private int socketTimeout = 5000; 55 | 56 | /** 57 | * set connect timeout 58 | */ 59 | private int connectTimeout = 2000; 60 | 61 | /** 62 | * set connection default pool capacity 63 | */ 64 | private int connectionPoolCapacity = 5; 65 | 66 | /** 67 | * set request retryTimes 68 | */ 69 | private int retryTimes = 0; 70 | 71 | /** 72 | * each seed sleep time 73 | */ 74 | private int sleepTime = 500; 75 | 76 | /** 77 | * sometime there unsupported cookie 78 | */ 79 | private boolean unsupportedCookie = false; 80 | 81 | /** 82 | * return a new payload instance 83 | * 84 | * @return 85 | */ 86 | public static Payload custom() { 87 | return new Payload(); 88 | } 89 | 90 | public Payload setDomain(String domain) { 91 | this.domain = domain; 92 | return this; 93 | } 94 | 95 | public String getDomain() { 96 | return domain; 97 | } 98 | 99 | public String getCharset() { 100 | return charset; 101 | } 102 | 103 | public Payload setCharset(String charset) { 104 | this.charset = charset; 105 | return this; 106 | } 107 | 108 | public Payload setUserAgent(String userAgent) { 109 | this.userAgent = userAgent; 110 | return this; 111 | } 112 | 113 | public String getUserAgent() { 114 | return userAgent; 115 | } 116 | 117 | public Payload setProxy(Proxy proxy) { 118 | this.proxy = proxy; 119 | return this; 120 | } 121 | 122 | public Proxy getProxy() { 123 | return proxy; 124 | } 125 | 126 | public Payload addOriginCookie(String name, String value) { 127 | originCookies.put(name, value); 128 | return this; 129 | } 130 | 131 | public Map getOriginCookies() { 132 | return 
originCookies; 133 | } 134 | 135 | public Payload addOriginCookies(Map cookies) { 136 | originCookies.putAll(cookies); 137 | return this; 138 | } 139 | 140 | public Payload addCookie(String domain, String name, String value) { 141 | if (!cookies.containsKey(domain)) { 142 | HashMap cookie = Maps.newHashMap(); 143 | cookies.put(domain, cookie); 144 | } 145 | cookies.get(domain).put(name, value); 146 | return this; 147 | } 148 | 149 | public Payload addCookies(Map> cookieMap) { 150 | for (Map.Entry> cookie : cookieMap.entrySet()) { 151 | if (!cookies.containsKey(cookie.getKey())) { 152 | cookies.putAll(cookieMap); 153 | } 154 | } 155 | return this; 156 | } 157 | 158 | public Map> getCookies() { 159 | return cookies; 160 | } 161 | 162 | public Payload addHeader(String key, String value) { 163 | headers.put(key, value); 164 | return this; 165 | } 166 | 167 | public Payload addHeaders(Map headerMap) { 168 | headers.putAll(headerMap); 169 | return this; 170 | } 171 | 172 | public Map getHeaders() { 173 | return headers; 174 | } 175 | 176 | public Payload setSocketTimeout(int socketTimeout) { 177 | this.socketTimeout = socketTimeout; 178 | return this; 179 | } 180 | 181 | public int getSocketTimeout() { 182 | return socketTimeout; 183 | } 184 | 185 | public Payload setConnectTimeout(int connectTimeout) { 186 | this.connectTimeout = connectTimeout; 187 | return this; 188 | } 189 | 190 | public int getConnectTimeout() { 191 | return connectTimeout; 192 | } 193 | 194 | public Payload setConnectionPoolCapacity(int connectionPoolCapacity) { 195 | this.connectionPoolCapacity = connectionPoolCapacity; 196 | return this; 197 | } 198 | 199 | public int getConnectionPoolCapacity() { 200 | return connectionPoolCapacity; 201 | } 202 | 203 | public Payload setRetryTimes(int retryTimes) { 204 | this.retryTimes = retryTimes; 205 | return this; 206 | } 207 | 208 | public int getSleepTime() { 209 | return sleepTime; 210 | } 211 | 212 | public Payload setSleepTime(int sleepTime) { 213 | 
this.sleepTime = sleepTime; 214 | return this; 215 | } 216 | 217 | public int getRetryTimes() { 218 | return retryTimes; 219 | } 220 | 221 | public Payload setUnsupportedCookie(boolean unsupportedCookie) { 222 | this.unsupportedCookie = unsupportedCookie; 223 | return this; 224 | } 225 | 226 | public boolean isUnsupportedCookie() { 227 | return unsupportedCookie; 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/Result.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core; 2 | 3 | import com.google.common.collect.Lists; 4 | import com.google.common.collect.Maps; 5 | 6 | import java.util.Collection; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | /** 11 | * @author heyingcai 12 | */ 13 | public class Result { 14 | 15 | private Seed seed; 16 | 17 | private List resultList = Lists.newLinkedList(); 18 | 19 | private Map fieldResult = Maps.newHashMap(); 20 | 21 | public void addResult(String result) { 22 | resultList.add(result); 23 | } 24 | 25 | public void addResult(Object result) { 26 | resultList.add(result); 27 | } 28 | 29 | public void addResults(Collection results) { 30 | resultList.addAll(results); 31 | } 32 | 33 | public void putField(String key, Object value) { 34 | fieldResult.put(key, value); 35 | } 36 | 37 | public void putFieldMap(Map resultMap) { 38 | fieldResult.putAll(resultMap); 39 | } 40 | 41 | public List getResultList() { 42 | return resultList; 43 | } 44 | 45 | public Map getFieldResult() { 46 | return fieldResult; 47 | } 48 | 49 | public void setSeed(Seed seed) { 50 | this.seed = seed; 51 | } 52 | 53 | public Seed getSeed() { 54 | return seed; 55 | } 56 | 57 | @Override 58 | public String toString() { 59 | Object result = resultList.isEmpty() ? 
fieldResult : resultList; 60 | return result.toString(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/Seed.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core; 2 | 3 | import com.jibug.cetty.core.model.RequestBody; 4 | import com.google.common.collect.Maps; 5 | 6 | import java.io.Serializable; 7 | import java.util.Map; 8 | 9 | /** 10 | * @author heyingcai 11 | */ 12 | public class Seed implements Serializable { 13 | 14 | private String url; 15 | 16 | private String method; 17 | 18 | private RequestBody requestBody; 19 | 20 | private Map cookies = Maps.newHashMap(); 21 | 22 | private Map headers = Maps.newHashMap(); 23 | 24 | private String charset; 25 | 26 | /** 27 | * in one request, there perhaps have some attach information for pass to next handler 28 | */ 29 | private Map attach; 30 | 31 | public Seed(String url) { 32 | this.url = url; 33 | } 34 | 35 | public String getUrl() { 36 | return url; 37 | } 38 | 39 | public Seed setMethod(String method) { 40 | this.method = method; 41 | return this; 42 | } 43 | 44 | public String getMethod() { 45 | return method; 46 | } 47 | 48 | public RequestBody getRequestBody() { 49 | return requestBody; 50 | } 51 | 52 | public void setRequestBody(RequestBody requestBody) { 53 | this.requestBody = requestBody; 54 | } 55 | 56 | public Seed addCookie(String key, String value) { 57 | cookies.put(key, value); 58 | return this; 59 | } 60 | 61 | public Seed addCookies(Map cookieMap) { 62 | cookies.putAll(cookieMap); 63 | return this; 64 | } 65 | 66 | public Seed addHeader(String key, String value) { 67 | headers.put(key, value); 68 | return this; 69 | } 70 | 71 | public Seed addHeaders(Map headerMap) { 72 | headers.putAll(headerMap); 73 | return this; 74 | } 75 | 76 | public String getCharset() { 77 | return charset; 78 | } 79 | 80 | public Seed setCharset(String 
charset) { 81 | this.charset = charset; 82 | return this; 83 | } 84 | 85 | public Seed putAttach(String key, String value) { 86 | if (attach == null) { 87 | attach = Maps.newHashMap(); 88 | } 89 | attach.put(key, value); 90 | return this; 91 | } 92 | 93 | public Seed putAttach(String key, Object value) { 94 | if (attach == null) { 95 | attach = Maps.newHashMap(); 96 | } 97 | attach.put(key, value); 98 | return this; 99 | } 100 | 101 | public Object getAttach(String key) { 102 | if (attach == null) { 103 | return null; 104 | } 105 | return attach.get(key); 106 | } 107 | 108 | public Map getAttach() { 109 | return attach; 110 | } 111 | 112 | public Map getCookies() { 113 | return cookies; 114 | } 115 | 116 | public Map getHeaders() { 117 | return headers; 118 | } 119 | 120 | @Override 121 | public int hashCode() { 122 | int result = url != null ? url.hashCode() : 0; 123 | result = 31 * result + (method != null ? method.hashCode() : 0); 124 | return result; 125 | } 126 | 127 | @Override 128 | public boolean equals(Object o) { 129 | if (this == o) { 130 | return true; 131 | } 132 | if (o == null || getClass() != o.getClass()) { 133 | return false; 134 | } 135 | 136 | Seed seed = (Seed) o; 137 | 138 | if (url != null ? !url.equals(seed.url) : seed.url != null) { 139 | return false; 140 | } 141 | return method != null ? 
method.equals(seed.method) : seed.method == null; 142 | } 143 | 144 | @Override 145 | public String toString() { 146 | return "Seed{" + 147 | "url='" + url + '\'' + 148 | ", method='" + method + '\'' + 149 | ", requestBody=" + requestBody + 150 | ", cookies=" + cookies + 151 | ", headers=" + headers + 152 | ", charset='" + charset + '\'' + 153 | ", attach=" + attach + 154 | '}'; 155 | } 156 | 157 | } 158 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/annotation/Query.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | @Retention(RetentionPolicy.RUNTIME) 9 | @Target(ElementType.TYPE) 10 | public @interface Query { 11 | 12 | String value(); 13 | 14 | } 15 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/annotation/TargetUrl.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.annotation; 2 | 3 | 4 | import java.lang.annotation.ElementType; 5 | import java.lang.annotation.Retention; 6 | import java.lang.annotation.RetentionPolicy; 7 | import java.lang.annotation.Target; 8 | 9 | @Retention(RetentionPolicy.RUNTIME) 10 | @Target(ElementType.FIELD) 11 | public @interface TargetUrl { 12 | 13 | String url(); 14 | } 15 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/concurrent/CettyAbortPolicy.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.concurrent; 2 | 3 | import java.util.concurrent.RejectedExecutionException; 4 | import 
java.util.concurrent.ThreadPoolExecutor; 5 | 6 | /** 7 | * Cetty Thread Reject Execution Exception Policy 8 | * 9 | * @author heyingcai 10 | */ 11 | public class CettyAbortPolicy extends ThreadPoolExecutor.AbortPolicy { 12 | 13 | private final String threadName; 14 | 15 | public CettyAbortPolicy(String threadName) { 16 | this.threadName = threadName; 17 | } 18 | 19 | @Override 20 | public void rejectedExecution(Runnable r, ThreadPoolExecutor e) { 21 | String msg = String.format("Cetty[" 22 | + " Thread Name: %s, Pool Size: %d (active: %d, core: %d, max: %d, largest: %d), Task: %d (completed: %d)," 23 | + " Executor status:(isShutdown:%s, isTerminated:%s, isTerminating:%s)]", 24 | threadName, e.getPoolSize(), e.getActiveCount(), e.getCorePoolSize(), e.getMaximumPoolSize(), e.getLargestPoolSize(), 25 | e.getTaskCount(), e.getCompletedTaskCount(), e.isShutdown(), e.isTerminated(), e.isTerminating()); 26 | System.out.println(msg); 27 | throw new RejectedExecutionException(msg); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/concurrent/CettyThreadPool.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.concurrent; 2 | 3 | import java.util.concurrent.Executor; 4 | import java.util.concurrent.LinkedBlockingQueue; 5 | import java.util.concurrent.SynchronousQueue; 6 | import java.util.concurrent.ThreadPoolExecutor; 7 | import java.util.concurrent.TimeUnit; 8 | 9 | /** 10 | * @author heyingcai 11 | */ 12 | public class CettyThreadPool { 13 | 14 | private static final String name = "CettyThreadPool"; 15 | 16 | /** 17 | * return new Executor 18 | * 19 | * @param threads corePoolSize 20 | * @param queueSize task queue size 21 | * @return 22 | */ 23 | public static Executor newExecutor(int threads, int queueSize) { 24 | return new ThreadPoolExecutor(threads, threads, 0, TimeUnit.MILLISECONDS, 25 | queueSize == 0 ? 
new SynchronousQueue<>() : (queueSize < 0 ? new LinkedBlockingQueue<>() : new LinkedBlockingQueue<>(queueSize)), 26 | new NamedThreadFactory(name, true), new CettyAbortPolicy(name)); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/concurrent/CountableThreadPool.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.concurrent; 2 | 3 | import java.util.concurrent.LinkedBlockingQueue; 4 | import java.util.concurrent.ThreadPoolExecutor; 5 | import java.util.concurrent.TimeUnit; 6 | import java.util.concurrent.atomic.AtomicInteger; 7 | import java.util.concurrent.locks.Condition; 8 | import java.util.concurrent.locks.ReentrantLock; 9 | 10 | /** 11 | * refer to webmagic 12 | * https://github.com/code4craft/webmagic/blob/master/webmagic-core/src/main/java/us/codecraft/webmagic/thread/CountableThreadPool.java 13 | * 14 | * @author heyingcai 15 | */ 16 | public class CountableThreadPool { 17 | 18 | private int threadNum; 19 | 20 | private AtomicInteger threadAlive = new AtomicInteger(); 21 | 22 | private ReentrantLock reentrantLock = new ReentrantLock(); 23 | 24 | private Condition condition = reentrantLock.newCondition(); 25 | 26 | private ThreadPoolExecutor threadPoolExecutor; 27 | 28 | public CountableThreadPool(int threadNum) { 29 | this.threadNum = threadNum; 30 | this.threadPoolExecutor = new ThreadPoolExecutor(threadNum, threadNum, 0L, TimeUnit.MILLISECONDS, 31 | new LinkedBlockingQueue<>(), new NamedThreadFactory("Cetty-crawler", false)); 32 | } 33 | 34 | public CountableThreadPool(int threadNum, ThreadPoolExecutor threadPoolExecutor) { 35 | this.threadNum = threadNum; 36 | this.threadPoolExecutor = threadPoolExecutor; 37 | } 38 | 39 | 40 | public int getThreadAliveCount() { 41 | return threadAlive.get(); 42 | } 43 | 44 | public int getThreadNum() { 45 | return threadNum; 46 | } 47 | 48 | public 
boolean isShutdown() { 49 | return threadPoolExecutor.isShutdown(); 50 | } 51 | 52 | public void shutdown() { 53 | threadPoolExecutor.shutdown(); 54 | } 55 | 56 | public ThreadPoolExecutor getThreadPoolExecutor() { 57 | return threadPoolExecutor; 58 | } 59 | 60 | public void execute(final Runnable runnable) { 61 | 62 | 63 | if (threadAlive.get() >= threadNum) { 64 | try { 65 | reentrantLock.lock(); 66 | while (threadAlive.get() >= threadNum) { 67 | try { 68 | condition.await(); 69 | } catch (InterruptedException e) { 70 | } 71 | } 72 | } finally { 73 | reentrantLock.unlock(); 74 | } 75 | } 76 | threadAlive.incrementAndGet(); 77 | threadPoolExecutor.execute(new Runnable() { 78 | @Override 79 | public void run() { 80 | try { 81 | runnable.run(); 82 | } finally { 83 | try { 84 | reentrantLock.lock(); 85 | threadAlive.decrementAndGet(); 86 | condition.signal(); 87 | } finally { 88 | reentrantLock.unlock(); 89 | } 90 | } 91 | } 92 | }); 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/concurrent/NamedThreadFactory.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.concurrent; 2 | 3 | import java.util.concurrent.ThreadFactory; 4 | import java.util.concurrent.atomic.AtomicInteger; 5 | 6 | /** 7 | * @author heyingcai 8 | */ 9 | public class NamedThreadFactory implements ThreadFactory { 10 | 11 | /** 12 | * define current thread factory number 13 | */ 14 | private static final AtomicInteger poolId = new AtomicInteger(1); 15 | 16 | /** 17 | * define current thread number 18 | */ 19 | private final AtomicInteger mThreadNumber = new AtomicInteger(1); 20 | 21 | private final String prefix; 22 | 23 | private final boolean daemon; 24 | 25 | protected final ThreadGroup threadGroup; 26 | 27 | public NamedThreadFactory() { 28 | this("cetty-threadpool-" + poolId.getAndIncrement(), false); 29 | } 30 | 31 | public 
NamedThreadFactory(String prefix, boolean daemon) { 32 | this.prefix = prefix + "-thread-"; 33 | this.daemon = daemon; 34 | SecurityManager securityManager = System.getSecurityManager(); 35 | threadGroup = (securityManager == null) ? Thread.currentThread().getThreadGroup() : securityManager.getThreadGroup(); 36 | } 37 | 38 | public NamedThreadFactory(String prefix, boolean daemon, ThreadGroup threadGroup) { 39 | this.prefix = prefix; 40 | this.daemon = daemon; 41 | this.threadGroup = threadGroup; 42 | } 43 | 44 | public NamedThreadFactory(String prefix) { 45 | this(prefix, false); 46 | } 47 | 48 | @Override 49 | public Thread newThread(Runnable runnable) { 50 | String name = prefix + mThreadNumber.getAndIncrement(); 51 | Thread thread = new Thread(threadGroup, runnable, name, 0); 52 | thread.setDaemon(daemon); 53 | return thread; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/concurrent/ThreadPerTaskExecutor.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.concurrent; 2 | 3 | import java.util.concurrent.Executor; 4 | import java.util.concurrent.ThreadFactory; 5 | 6 | /** 7 | * @author heyingcai 8 | */ 9 | public final class ThreadPerTaskExecutor implements Executor { 10 | 11 | private final ThreadFactory threadFactory; 12 | 13 | public ThreadPerTaskExecutor(ThreadFactory threadFactory) { 14 | if (threadFactory == null) { 15 | throw new NullPointerException("threadFactory"); 16 | } 17 | this.threadFactory = threadFactory; 18 | } 19 | 20 | @Override 21 | public void execute(Runnable command) { 22 | threadFactory.newThread(command).start(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/constants/HttpConstants.java: -------------------------------------------------------------------------------- 1 | 
package com.jibug.cetty.core.constants; 2 | 3 | /** 4 | * @author heyingcai 5 | */ 6 | public class HttpConstants { 7 | 8 | public static final String GET = "GET"; 9 | 10 | public static final String HEAD = "HEAD"; 11 | 12 | public static final String POST = "POST"; 13 | 14 | public static final String PUT = "PUT"; 15 | 16 | public static final String DELETE = "DELETE"; 17 | 18 | public static final String TRACE = "TRACE"; 19 | 20 | public static final String CONNECT = "CONNECT"; 21 | 22 | } 23 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/context/CettyContext.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.context; 2 | 3 | import com.google.common.collect.Maps; 4 | import com.jibug.cetty.core.Bootstrap; 5 | 6 | import java.util.Map; 7 | 8 | /** 9 | * @author heyingcai 10 | */ 11 | public final class CettyContext { 12 | 13 | private static Map contextMap = Maps.newConcurrentMap(); 14 | 15 | public static Map getContextMap() { 16 | return contextMap; 17 | } 18 | 19 | public static void addContext(String key, Bootstrap bootstrap) { 20 | if (contextMap.get(key) == null) { 21 | contextMap.put(key, bootstrap); 22 | } 23 | } 24 | 25 | public static Bootstrap getContext(String key) { 26 | return contextMap.get(key); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/AbstractHandlerContext.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Cetty; 4 | import com.jibug.cetty.core.Page; 5 | import com.jibug.cetty.core.Seed; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | /** 10 | * @author heyingcai 11 | */ 12 | public abstract class AbstractHandlerContext implements HandlerContext { 
13 | 14 | private static final Logger logger = LoggerFactory.getLogger(AbstractHandlerContext.class); 15 | 16 | volatile AbstractHandlerContext prev; 17 | volatile AbstractHandlerContext next; 18 | 19 | private final boolean processEvent; 20 | private final boolean reduceEvent; 21 | private final HandlerPipeline pipeline; 22 | private final String name; 23 | 24 | public AbstractHandlerContext(boolean processEvent, boolean reduceEvent, HandlerPipeline pipeline, String name) { 25 | this.processEvent = processEvent; 26 | this.reduceEvent = reduceEvent; 27 | this.pipeline = pipeline; 28 | this.name = name; 29 | } 30 | 31 | @Override 32 | public HandlerPipeline pipeline() { 33 | return pipeline; 34 | } 35 | 36 | @Override 37 | public Cetty cetty() { 38 | return pipeline.cetty(); 39 | } 40 | 41 | public String name() { 42 | return name; 43 | } 44 | 45 | private AbstractHandlerContext findContextProcess() { 46 | AbstractHandlerContext ctx = this; 47 | do { 48 | ctx = ctx.next; 49 | } while (!ctx.processEvent); 50 | return ctx; 51 | } 52 | 53 | private AbstractHandlerContext findContextReduce() { 54 | AbstractHandlerContext ctx = this; 55 | do { 56 | ctx = ctx.next; 57 | } while (!ctx.reduceEvent); 58 | return ctx; 59 | } 60 | 61 | @Override 62 | public void fireDownload(Seed seed) { 63 | final AbstractHandlerContext next = findContextProcess(); 64 | next.invokeDownload(seed); 65 | } 66 | 67 | private void invokeDownload(Seed seed) { 68 | ProcessHandler processHandler = (ProcessHandler) handler(); 69 | processHandler.download(this, seed); 70 | } 71 | 72 | @Override 73 | public void fireReduce(Page page) { 74 | final AbstractHandlerContext next = findContextReduce(); 75 | if (next.prev != null && next.prev.isProcessEvent()) { 76 | if (!page.getNextSeeds().isEmpty()) { 77 | page.getNextSeeds().forEach(seed -> pipeline.cetty().getScheduler().push(seed)); 78 | } 79 | } 80 | next.invokeReduce(page); 81 | if (next.next.name().equals("tail")) { 82 | try { 83 | 
Thread.sleep(pipeline.cetty().getPayload().getSleepTime()); 84 | } catch (InterruptedException e) { 85 | logger.error("Thread interrupted when sleep", e); 86 | } 87 | } 88 | } 89 | 90 | private void invokeReduce(Page page) { 91 | ReduceHandler reduceHandler = (ReduceHandler) handler(); 92 | reduceHandler.reduce(this, page); 93 | } 94 | 95 | @Override 96 | public void fireProcess(Page page) { 97 | final AbstractHandlerContext next = findContextProcess(); 98 | next.invokeProcess(page); 99 | } 100 | 101 | private void invokeProcess(Page page) { 102 | ProcessHandler processHandler = (ProcessHandler) handler(); 103 | processHandler.process(this, page); 104 | } 105 | 106 | @Override 107 | public void fireReceive() { 108 | AbstractHandlerContext head = findContextProcess(); 109 | head.invokeReceive(); 110 | 111 | } 112 | 113 | private void invokeReceive() { 114 | ProcessHandler processHandler = (ProcessHandler) handler(); 115 | processHandler.receive(this); 116 | } 117 | 118 | public boolean isProcessEvent() { 119 | return processEvent; 120 | } 121 | 122 | public boolean isReduceEvent() { 123 | return reduceEvent; 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/ConsoleReduceHandler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 | 5 | /** 6 | * @author heyingcai 7 | */ 8 | public class ConsoleReduceHandler extends ReduceHandlerAdapter{ 9 | 10 | @Override 11 | public void reduce(HandlerContext ctx, Page page) { 12 | System.out.println(page.getResult()); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/DefaultHandlerContext.java: -------------------------------------------------------------------------------- 1 | package 
com.jibug.cetty.core.handler; 2 | 3 | /** 4 | * @author heyingcai 5 | */ 6 | public class DefaultHandlerContext extends AbstractHandlerContext{ 7 | 8 | private Handler handler; 9 | 10 | public DefaultHandlerContext(HandlerPipeline pipeline, String name,Handler handler) { 11 | super(isProcess(handler), isReduce(handler), pipeline, name); 12 | this.handler = handler; 13 | } 14 | 15 | @Override 16 | public Handler handler() { 17 | return handler; 18 | } 19 | 20 | private static boolean isProcess(Handler handler) { 21 | return handler instanceof ProcessHandler; 22 | } 23 | 24 | private static boolean isReduce(Handler handler) { 25 | return handler instanceof ReduceHandler; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/Handler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | /** 4 | * @author heyingcai 5 | */ 6 | public interface Handler { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/HandlerContext.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Cetty; 4 | import com.jibug.cetty.core.Page; 5 | import com.jibug.cetty.core.Seed; 6 | 7 | /** 8 | * @author heyingcai 9 | */ 10 | public interface HandlerContext { 11 | 12 | /** 13 | * return page 14 | * 15 | * @return 16 | */ 17 | Cetty cetty(); 18 | 19 | /** 20 | * return pipeline 21 | * 22 | * @return 23 | */ 24 | HandlerPipeline pipeline(); 25 | 26 | /** 27 | * return handler 28 | * 29 | * @return 30 | */ 31 | Handler handler(); 32 | 33 | /** 34 | * trigger receive page request 35 | */ 36 | void fireReceive(); 37 | 38 | /** 39 | * pass the download event to next handler 40 | * 41 | * @param seed 42 | */ 43 | void 
fireDownload(Seed seed); 44 | 45 | /** 46 | * pass the process event to next handler 47 | * 48 | * @param page 49 | */ 50 | void fireProcess(Page page); 51 | 52 | /** 53 | * pass the reduce event to next handler 54 | * 55 | * @param page 56 | */ 57 | void fireReduce(Page page); 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/HandlerInitializer.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Cetty; 4 | import com.jibug.cetty.core.Page; 5 | import com.jibug.cetty.core.Seed; 6 | 7 | /** 8 | * page handler initial 9 | * 10 | * @author heyingcai 11 | */ 12 | public abstract class HandlerInitializer implements ProcessHandler { 13 | 14 | @Override 15 | public void receive(HandlerContext ctx) { 16 | initPages(ctx); 17 | ctx.fireReceive(); 18 | } 19 | 20 | @Override 21 | public void download(HandlerContext ctx, Seed seed) { 22 | 23 | } 24 | 25 | @Override 26 | public void process(HandlerContext ctx, Page page) { 27 | 28 | } 29 | 30 | /** 31 | * init page 32 | * 33 | * @param cetty 34 | */ 35 | public abstract void initCetty(Cetty cetty); 36 | 37 | private void initPages(HandlerContext ctx) { 38 | try { 39 | Cetty cetty = ctx.cetty(); 40 | initCetty(cetty); 41 | } finally { 42 | remove(ctx); 43 | } 44 | } 45 | 46 | private void remove(HandlerContext ctx) { 47 | HandlerPipeline pipeline = ctx.pipeline(); 48 | if (pipeline.context(this) != null) { 49 | pipeline.remove(this); 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/HandlerPipeline.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Cetty; 4 | import 
com.jibug.cetty.core.Page; 5 | import com.jibug.cetty.core.Seed; 6 | 7 | import java.util.NoSuchElementException; 8 | import java.util.concurrent.atomic.AtomicInteger; 9 | 10 | /** 11 | * @author heyingcai 12 | */ 13 | public class HandlerPipeline { 14 | 15 | private Cetty cetty; 16 | final AbstractHandlerContext head; 17 | final AbstractHandlerContext tail; 18 | 19 | final AtomicInteger processCounter = new AtomicInteger(0); 20 | final AtomicInteger reduceCounter = new AtomicInteger(0); 21 | 22 | public HandlerPipeline(Cetty cetty) { 23 | this.cetty = cetty; 24 | head = new HeadContext(this); 25 | tail = new TailContext(this); 26 | 27 | head.next = tail; 28 | tail.prev = head; 29 | } 30 | 31 | public Cetty cetty() { 32 | return cetty; 33 | } 34 | 35 | public boolean checkDownloadHandler() { 36 | AbstractHandlerContext context = head.next; 37 | while (context != null) { 38 | if (context.isProcessEvent() && context.name().equals("downloader")) { 39 | return false; 40 | } 41 | context = context.next; 42 | } 43 | return false; 44 | } 45 | 46 | public final AbstractHandlerContext context(Handler handler) { 47 | if (handler == null) { 48 | throw new NullPointerException("handler"); 49 | } 50 | 51 | AbstractHandlerContext ctx = head.next; 52 | for (; ; ) { 53 | if (ctx == null) { 54 | return null; 55 | } 56 | if (ctx.handler() == handler) { 57 | return ctx; 58 | } 59 | ctx = ctx.next; 60 | } 61 | } 62 | 63 | public final HandlerPipeline remove(Handler handler) { 64 | remove(getContextOrDie(handler)); 65 | return this; 66 | } 67 | 68 | private AbstractHandlerContext getContextOrDie(Handler handler) { 69 | AbstractHandlerContext ctx = context(handler); 70 | if (ctx == null) { 71 | throw new NoSuchElementException(handler.getClass().getName()); 72 | } else { 73 | return ctx; 74 | } 75 | } 76 | 77 | private AbstractHandlerContext remove(final AbstractHandlerContext ctx) { 78 | assert ctx != head && ctx != tail; 79 | 80 | synchronized (this) { 81 | remove0(ctx); 82 | } 83 | 
return ctx; 84 | } 85 | 86 | private static void remove0(AbstractHandlerContext ctx) { 87 | AbstractHandlerContext prev = ctx.prev; 88 | AbstractHandlerContext next = ctx.next; 89 | prev.next = next; 90 | next.prev = prev; 91 | } 92 | 93 | public HandlerPipeline addLast(Handler handler, String name) { 94 | DefaultHandlerContext newCtx = null; 95 | if (handler instanceof ReduceHandler) { 96 | newCtx = new DefaultHandlerContext(this, name, handler); 97 | } else if (handler instanceof ProcessHandler) { 98 | newCtx = new DefaultHandlerContext(this, name, handler); 99 | } else { 100 | throw new IllegalArgumentException("handler must be ProcessHandler or ReduceHandler"); 101 | } 102 | addLast0(newCtx); 103 | return this; 104 | } 105 | 106 | public HandlerPipeline addLast(Handler handler) { 107 | DefaultHandlerContext newCtx = null; 108 | if (handler instanceof ReduceHandler) { 109 | newCtx = new DefaultHandlerContext(this, "ReduceHandler#" + reduceCounter.getAndAdd(1), handler); 110 | } else if (handler instanceof ProcessHandler) { 111 | newCtx = new DefaultHandlerContext(this, "ProcessHandler#" + processCounter.getAndAdd(1), handler); 112 | } else { 113 | throw new IllegalArgumentException("handler must be ProcessHandler or ReduceHandler"); 114 | } 115 | addLast0(newCtx); 116 | return this; 117 | } 118 | 119 | private void addLast0(AbstractHandlerContext newCtx) { 120 | AbstractHandlerContext prev = tail.prev; 121 | newCtx.prev = prev; 122 | newCtx.next = tail; 123 | prev.next = newCtx; 124 | tail.prev = newCtx; 125 | } 126 | 127 | 128 | public void start() { 129 | head.fireReceive(); 130 | } 131 | 132 | public void download(Seed seed) { 133 | head.fireDownload(seed); 134 | } 135 | 136 | 137 | final class HeadContext extends AbstractHandlerContext implements ProcessHandler, ReduceHandler { 138 | 139 | public HeadContext(HandlerPipeline pipeline) { 140 | super(false, true, pipeline, "head"); 141 | } 142 | 143 | @Override 144 | public void receive(HandlerContext ctx) { 
145 | 146 | } 147 | 148 | @Override 149 | public void download(HandlerContext ctx, Seed seed) { 150 | 151 | } 152 | 153 | @Override 154 | public void process(HandlerContext ctx, Page page) { 155 | 156 | } 157 | 158 | @Override 159 | public Handler handler() { 160 | return this; 161 | } 162 | 163 | @Override 164 | public void reduce(HandlerContext ctx, Page page) { 165 | ctx.fireReduce(page); 166 | } 167 | } 168 | 169 | final class TailContext extends AbstractHandlerContext implements ProcessHandler { 170 | 171 | public TailContext(HandlerPipeline pipeline) { 172 | super(true, false, pipeline, "tail"); 173 | } 174 | 175 | @Override 176 | public void receive(HandlerContext ctx) { 177 | 178 | } 179 | 180 | @Override 181 | public void download(HandlerContext ctx, Seed seed) { 182 | throw new UnsupportedOperationException(); 183 | } 184 | 185 | @Override 186 | public void process(HandlerContext ctx, Page page) { 187 | 188 | } 189 | 190 | @Override 191 | public Handler handler() { 192 | return this; 193 | } 194 | 195 | } 196 | 197 | } 198 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/HttpDownloadHandler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 | import com.jibug.cetty.core.Payload; 5 | import com.jibug.cetty.core.Seed; 6 | import com.jibug.cetty.core.constants.HttpConstants; 7 | import com.jibug.cetty.core.net.Proxy; 8 | import com.jibug.cetty.core.utils.UrlUtils; 9 | import org.apache.commons.io.IOUtils; 10 | import org.apache.http.Header; 11 | import org.apache.http.HttpHost; 12 | import org.apache.http.HttpResponse; 13 | import org.apache.http.auth.AuthState; 14 | import org.apache.http.auth.ChallengeState; 15 | import org.apache.http.auth.UsernamePasswordCredentials; 16 | import org.apache.http.client.CookieStore; 17 | import 
org.apache.http.client.config.CookieSpecs; 18 | import org.apache.http.client.config.RequestConfig; 19 | import org.apache.http.client.methods.CloseableHttpResponse; 20 | import org.apache.http.client.methods.HttpUriRequest; 21 | import org.apache.http.client.methods.RequestBuilder; 22 | import org.apache.http.client.protocol.HttpClientContext; 23 | import org.apache.http.concurrent.FutureCallback; 24 | import org.apache.http.entity.ByteArrayEntity; 25 | import org.apache.http.impl.auth.BasicScheme; 26 | import org.apache.http.impl.client.BasicCookieStore; 27 | import org.apache.http.impl.client.CloseableHttpClient; 28 | import org.apache.http.impl.cookie.BasicClientCookie; 29 | import org.apache.http.impl.nio.client.CloseableHttpAsyncClient; 30 | import org.apache.http.util.EntityUtils; 31 | import org.slf4j.Logger; 32 | import org.slf4j.LoggerFactory; 33 | 34 | import java.io.IOException; 35 | import java.util.ArrayList; 36 | import java.util.HashMap; 37 | import java.util.List; 38 | import java.util.Map; 39 | 40 | /** 41 | * @author heyingcai 42 | */ 43 | public class HttpDownloadHandler extends ProcessHandlerAdapter { 44 | 45 | private static final Logger logger = LoggerFactory.getLogger(HttpDownloadHandler.class); 46 | 47 | @Override 48 | public void download(HandlerContext ctx, Seed seed) { 49 | Payload payload = ctx.cetty().getPayload(); 50 | if (payload == null) { 51 | throw new NullPointerException("payload can not be null"); 52 | } 53 | //get cetty state 54 | boolean async = ctx.cetty().isAsync(); 55 | if (async) { 56 | asyncHttpClientDownload(ctx, seed); 57 | } else { 58 | httpClientDownload(ctx, seed); 59 | } 60 | } 61 | 62 | private void httpClientDownload(HandlerContext ctx, Seed seed) { 63 | Payload payload = ctx.cetty().getPayload(); 64 | 65 | CloseableHttpClient httpClient = ctx.cetty().getHttpClient(); 66 | CloseableHttpResponse httpResponse = null; 67 | 68 | Page page; 69 | try { 70 | httpResponse = httpClient.execute(convertHttpUriRequest(seed, 
payload), convertHttpClientContext(seed, payload)); 71 | page = handleResponse(seed, seed.getCharset() != null ? seed.getCharset() : payload.getCharset(), httpResponse); 72 | logger.info("download {} page success !", seed.getUrl()); 73 | ctx.fireProcess(page); 74 | } catch (IOException e) { 75 | logger.warn("download {} page error !", seed.getUrl(), e); 76 | } finally { 77 | if (httpResponse != null) { 78 | EntityUtils.consumeQuietly(httpResponse.getEntity()); 79 | } 80 | } 81 | 82 | } 83 | 84 | private void asyncHttpClientDownload(HandlerContext ctx, Seed seed) { 85 | Payload payload = ctx.cetty().getPayload(); 86 | CloseableHttpAsyncClient httpAsyncClient = ctx.cetty().getHttpAsyncClient(); 87 | 88 | try { 89 | httpAsyncClient.execute(convertHttpUriRequest(seed, payload), convertHttpClientContext(seed, payload), new CallBack(seed, ctx, payload)); 90 | } catch (Exception e) { 91 | logger.warn("download {} page error !", seed.getUrl(), e); 92 | } 93 | } 94 | 95 | class CallBack implements FutureCallback { 96 | 97 | private final Seed seed; 98 | private final HandlerContext ctx; 99 | private final Payload payload; 100 | 101 | public CallBack(Seed seed, HandlerContext ctx, Payload payload) { 102 | this.seed = seed; 103 | this.ctx = ctx; 104 | this.payload = payload; 105 | } 106 | 107 | @Override 108 | public void completed(HttpResponse httpResponse) { 109 | try { 110 | Page page = handleResponse(seed, seed.getCharset() != null ? 
seed.getCharset() : payload.getCharset(), httpResponse); 111 | logger.info("download {} page success !", seed.getUrl()); 112 | ctx.fireProcess(page); 113 | } catch (IOException e) { 114 | logger.warn("download {} page error !", seed.getUrl(), e); 115 | } finally { 116 | if (httpResponse != null) { 117 | EntityUtils.consumeQuietly(httpResponse.getEntity()); 118 | } 119 | } 120 | } 121 | 122 | @Override 123 | public void failed(Exception e) { 124 | logger.warn("download {} page error !", seed.getUrl(), e); 125 | } 126 | 127 | @Override 128 | public void cancelled() { 129 | logger.warn("download {} page cancelled", seed.getUrl()); 130 | } 131 | } 132 | 133 | 134 | private Page handleResponse(Seed seed, String charset, HttpResponse httpResponse) throws IOException { 135 | byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); 136 | Page page = new Page(); 137 | page.setBytes(bytes); 138 | page.setRawData(new String(bytes, charset)); 139 | page.setUrl(seed.getUrl()); 140 | page.setSeed(seed); 141 | page.setHeaders(convertHeaders(httpResponse.getAllHeaders())); 142 | page.setDocument(new String(bytes, charset), seed.getUrl()); 143 | return page; 144 | } 145 | 146 | private Map> convertHeaders(Header[] headers) { 147 | Map> results = new HashMap<>(); 148 | for (Header header : headers) { 149 | List list = results.get(header.getName()); 150 | if (list == null) { 151 | list = new ArrayList<>(); 152 | results.put(header.getName(), list); 153 | } 154 | list.add(header.getValue()); 155 | } 156 | return results; 157 | } 158 | 159 | private HttpClientContext convertHttpClientContext(Seed seed, Payload payload) { 160 | Proxy proxy = payload.getProxy(); 161 | HttpClientContext httpContext = new HttpClientContext(); 162 | if (proxy != null && proxy.getUsername() != null) { 163 | AuthState authState = new AuthState(); 164 | authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); 165 | 
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); 166 | } 167 | if (payload.getCookies() != null && !payload.getCookies().isEmpty()) { 168 | CookieStore cookieStore = new BasicCookieStore(); 169 | Map> cookies = payload.getCookies(); 170 | for (Map.Entry> cookieEntry : cookies.entrySet()) { 171 | Map value = cookieEntry.getValue(); 172 | for (Map.Entry entry : value.entrySet()) { 173 | BasicClientCookie cookie1 = new BasicClientCookie(entry.getKey(), entry.getValue()); 174 | cookie1.setDomain(cookieEntry.getKey()); 175 | cookieStore.addCookie(cookie1); 176 | } 177 | } 178 | httpContext.setCookieStore(cookieStore); 179 | } 180 | 181 | if (!seed.getCookies().isEmpty()) { 182 | CookieStore cookieStore = new BasicCookieStore(); 183 | Map cookies = seed.getCookies(); 184 | for (Map.Entry cookie : cookies.entrySet()) { 185 | BasicClientCookie cookie1 = new BasicClientCookie(cookie.getKey(), cookie.getValue()); 186 | cookie1.setDomain(UrlUtils.getDomain(seed.getUrl()) 187 | ); 188 | cookieStore.addCookie(cookie1); 189 | } 190 | httpContext.setCookieStore(cookieStore); 191 | } 192 | return httpContext; 193 | } 194 | 195 | private HttpUriRequest convertHttpUriRequest(Seed seed, Payload payload) { 196 | RequestBuilder requestBuilder = getRequestMethod(seed).setUri(seed.getUrl()); 197 | if (payload.getHeaders() != null) { 198 | for (Map.Entry headerEntry : payload.getHeaders().entrySet()) { 199 | requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); 200 | } 201 | } 202 | 203 | if (!seed.getHeaders().isEmpty()) { 204 | for (Map.Entry headerEntry : seed.getHeaders().entrySet()) { 205 | requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); 206 | } 207 | } 208 | 209 | RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); 210 | if (payload != null) { 211 | requestConfigBuilder.setConnectionRequestTimeout(payload.getConnectTimeout()) 212 | .setSocketTimeout(payload.getSocketTimeout()) 213 | 
.setConnectTimeout(payload.getConnectTimeout()) 214 | .setCookieSpec(CookieSpecs.STANDARD); 215 | } 216 | 217 | if (payload.getProxy() != null) { 218 | requestConfigBuilder.setProxy(new HttpHost(payload.getProxy().getHost(), payload.getProxy().getPort())); 219 | } 220 | requestBuilder.setConfig(requestConfigBuilder.build()); 221 | HttpUriRequest httpUriRequest = requestBuilder.build(); 222 | return httpUriRequest; 223 | } 224 | 225 | private RequestBuilder getRequestMethod(Seed seed) { 226 | String method = seed.getMethod(); 227 | if (method == null || method.equalsIgnoreCase(HttpConstants.GET)) { 228 | return RequestBuilder.get(); 229 | } else if (method.equalsIgnoreCase(HttpConstants.POST)) { 230 | return addFormParams(RequestBuilder.post(), seed); 231 | } else if (method.equalsIgnoreCase(HttpConstants.HEAD)) { 232 | return RequestBuilder.head(); 233 | } else if (method.equalsIgnoreCase(HttpConstants.PUT)) { 234 | return addFormParams(RequestBuilder.put(), seed); 235 | } else if (method.equalsIgnoreCase(HttpConstants.DELETE)) { 236 | return RequestBuilder.delete(); 237 | } else if (method.equalsIgnoreCase(HttpConstants.TRACE)) { 238 | return RequestBuilder.trace(); 239 | } 240 | throw new IllegalArgumentException("Illegal HTTP Method " + method); 241 | } 242 | 243 | private RequestBuilder addFormParams(RequestBuilder requestBuilder, Seed seed) { 244 | if (seed.getRequestBody() != null) { 245 | ByteArrayEntity entity = new ByteArrayEntity(seed.getRequestBody().getBody()); 246 | entity.setContentType(seed.getRequestBody().getContentType()); 247 | requestBuilder.setEntity(entity); 248 | } 249 | return requestBuilder; 250 | } 251 | 252 | } 253 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/PageProcessHandler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 
| 5 | /** 6 | * @author heyingcai 7 | */ 8 | public class PageProcessHandler extends ProcessHandlerAdapter { 9 | 10 | @Override 11 | public void process(HandlerContext ctx, Page page) { 12 | super.process(ctx, page); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/ProcessHandler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 | import com.jibug.cetty.core.Seed; 5 | 6 | /** 7 | * process handler 8 | * 9 | * @author heyingcai 10 | */ 11 | public interface ProcessHandler extends Handler { 12 | 13 | /** 14 | * receive new request even 15 | * 16 | * @param ctx 17 | */ 18 | void receive(HandlerContext ctx); 19 | 20 | /** 21 | * download the request 22 | * 23 | * @param ctx 24 | * @param seed 25 | */ 26 | void download(HandlerContext ctx, Seed seed); 27 | 28 | /** 29 | * process the request 30 | * 31 | * @param ctx 32 | * @param page 33 | */ 34 | void process(HandlerContext ctx, Page page); 35 | } 36 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/ProcessHandlerAdapter.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 | import com.jibug.cetty.core.Seed; 5 | 6 | /** 7 | * @author heyingcai 8 | */ 9 | public abstract class ProcessHandlerAdapter implements ProcessHandler { 10 | 11 | @Override 12 | public void receive(HandlerContext ctx) { 13 | ctx.fireReceive(); 14 | } 15 | 16 | @Override 17 | public void download(HandlerContext ctx, Seed seed) { 18 | ctx.fireDownload(seed); 19 | } 20 | 21 | @Override 22 | public void process(HandlerContext ctx, Page page) { 23 | ctx.fireReduce(page); 24 | } 25 | } 26 | 
-------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/ReduceHandler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 | 5 | /** 6 | * @author heyingcai 7 | */ 8 | public interface ReduceHandler extends Handler { 9 | 10 | /** 11 | * reduce the request result from process handler 12 | * 13 | * @param ctx 14 | * @param page 15 | */ 16 | void reduce(HandlerContext ctx, Page page); 17 | 18 | } 19 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/handler/ReduceHandlerAdapter.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 | 5 | /** 6 | * @author heyingcai 7 | */ 8 | public class ReduceHandlerAdapter implements ReduceHandler { 9 | 10 | @Override 11 | public void reduce(HandlerContext ctx, Page page) { 12 | ctx.fireReduce(page); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/model/AnnotationConfig.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.model; 2 | 3 | /** 4 | * @author heyingcai 5 | */ 6 | public class AnnotationConfig { 7 | 8 | private String targetUrl; 9 | 10 | private String threadNum; 11 | 12 | private boolean async; 13 | 14 | } 15 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/model/RequestBody.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.model; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.apache.http.NameValuePair; 
5 | import org.apache.http.client.utils.URLEncodedUtils; 6 | import org.apache.http.message.BasicNameValuePair; 7 | 8 | import java.io.Serializable; 9 | import java.io.UnsupportedEncodingException; 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | /** 14 | * @author heyingcai 15 | */ 16 | public class RequestBody implements Serializable { 17 | 18 | private byte[] body; 19 | private String contentType; 20 | private String charsetName; 21 | 22 | enum ContentType { 23 | /** 24 | * content-type: json 25 | */ 26 | JSON("application/json"), 27 | /** 28 | * content-type:form 29 | */ 30 | FORM("application/x-www-form-urlencoded"), 31 | /** 32 | * content-type:xml 33 | */ 34 | XML("text/xml"); 35 | 36 | private String type; 37 | 38 | ContentType(String type) { 39 | this.type = type; 40 | } 41 | 42 | public String getType() { 43 | return type; 44 | } 45 | } 46 | 47 | public RequestBody(byte[] body, String contentType, String charsetName) { 48 | this.body = body; 49 | this.contentType = contentType; 50 | this.charsetName = charsetName; 51 | } 52 | 53 | public byte[] getBody() { 54 | return body; 55 | } 56 | 57 | public void setBody(byte[] body) { 58 | this.body = body; 59 | } 60 | 61 | public String getContentType() { 62 | return contentType; 63 | } 64 | 65 | public void setContentType(String contentType) { 66 | this.contentType = contentType; 67 | } 68 | 69 | public String getCharsetName() { 70 | return charsetName; 71 | } 72 | 73 | public void setCharsetName(String charsetName) { 74 | this.charsetName = charsetName; 75 | } 76 | 77 | public static RequestBody custom(byte[] body, String contentType, String charsetName) { 78 | return new RequestBody(body, contentType, charsetName); 79 | } 80 | 81 | public static RequestBody toJson(String json, String charsetName) { 82 | try { 83 | return new RequestBody(json.getBytes(charsetName), ContentType.JSON.getType(), charsetName); 84 | } catch (UnsupportedEncodingException e) { 85 | throw new 
IllegalArgumentException("illegal charset " + charsetName, e); 86 | } 87 | } 88 | 89 | public static RequestBody toForm(Map params, String charsetName) { 90 | List nameValuePairs = Lists.newArrayListWithCapacity(params.size()); 91 | for (Map.Entry entry : params.entrySet()) { 92 | nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue()))); 93 | } 94 | try { 95 | return new RequestBody(URLEncodedUtils.format(nameValuePairs, charsetName).getBytes(charsetName), ContentType.FORM.getType(), charsetName); 96 | } catch (UnsupportedEncodingException e) { 97 | throw new IllegalArgumentException("illegal charset " + charsetName, e); 98 | } 99 | } 100 | 101 | public static RequestBody toXml(String xml, String charsetName) { 102 | try { 103 | return new RequestBody(xml.getBytes(charsetName), ContentType.XML.getType(), charsetName); 104 | } catch (UnsupportedEncodingException e) { 105 | throw new IllegalArgumentException("illegal charset " + charsetName, e); 106 | } 107 | } 108 | 109 | 110 | } 111 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/net/AbstractHttpClientGenerator.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.net; 2 | 3 | import com.jibug.cetty.core.Payload; 4 | import com.jibug.cetty.core.constants.HttpConstants; 5 | import org.apache.http.HttpRequest; 6 | import org.apache.http.HttpResponse; 7 | import org.apache.http.ProtocolException; 8 | import org.apache.http.client.CookieStore; 9 | import org.apache.http.client.methods.HttpGet; 10 | import org.apache.http.client.methods.HttpPost; 11 | import org.apache.http.client.methods.HttpRequestWrapper; 12 | import org.apache.http.client.methods.HttpUriRequest; 13 | import org.apache.http.config.Registry; 14 | import org.apache.http.conn.ssl.DefaultHostnameVerifier; 15 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory; 16 | 
import org.apache.http.impl.client.BasicCookieStore; 17 | import org.apache.http.impl.client.LaxRedirectStrategy; 18 | import org.apache.http.impl.cookie.BasicClientCookie; 19 | import org.apache.http.nio.conn.ssl.SSLIOSessionStrategy; 20 | import org.apache.http.protocol.HttpContext; 21 | import org.apache.http.ssl.SSLContexts; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | import javax.net.ssl.SSLContext; 26 | import javax.net.ssl.TrustManager; 27 | import javax.net.ssl.X509TrustManager; 28 | import java.net.URI; 29 | import java.security.KeyManagementException; 30 | import java.security.NoSuchAlgorithmException; 31 | import java.security.cert.CertificateException; 32 | import java.security.cert.X509Certificate; 33 | import java.util.Map; 34 | 35 | /** 36 | * @author heyingcai 37 | */ 38 | public abstract class AbstractHttpClientGenerator implements HttpClientGenerator { 39 | 40 | private static final Logger logger = LoggerFactory.getLogger(AbstractHttpClientGenerator.class); 41 | 42 | /** 43 | * building httpclient 44 | * 45 | * @param payload 46 | * @return 47 | */ 48 | protected abstract T build(Payload payload); 49 | 50 | /** 51 | * RegistryBuilder construct 52 | * 53 | * @return 54 | */ 55 | protected abstract Registry registry(); 56 | 57 | public AbstractHttpClientGenerator() { 58 | } 59 | 60 | protected SSLIOSessionStrategy buildSSLIOSessionStrategy() { 61 | SSLContext sslcontext = SSLContexts.createDefault(); 62 | return new SSLIOSessionStrategy(sslcontext); 63 | } 64 | 65 | protected SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { 66 | try { 67 | return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, 68 | null, 69 | new DefaultHostnameVerifier()); 70 | } catch (KeyManagementException e) { 71 | logger.error("ssl connection fail", e); 72 | } catch (NoSuchAlgorithmException e) { 73 | logger.error("ssl connection fail", e); 74 | } 75 | return 
SSLConnectionSocketFactory.getSocketFactory(); 76 | } 77 | 78 | private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { 79 | // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 80 | X509TrustManager trustManager = new X509TrustManager() { 81 | 82 | @Override 83 | public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { 84 | } 85 | 86 | @Override 87 | public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { 88 | } 89 | 90 | @Override 91 | public X509Certificate[] getAcceptedIssuers() { 92 | return null; 93 | } 94 | 95 | }; 96 | 97 | SSLContext sc = SSLContext.getInstance("SSLv3"); 98 | sc.init(null, new TrustManager[]{trustManager}, null); 99 | return sc; 100 | } 101 | 102 | public class CustomRedirectStrategy extends LaxRedirectStrategy { 103 | 104 | @Override 105 | public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException { 106 | URI uri = getLocationURI(request, response, context); 107 | String method = request.getRequestLine().getMethod(); 108 | if (HttpConstants.POST.equalsIgnoreCase(method)) { 109 | try { 110 | HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request; 111 | httpRequestWrapper.setURI(uri); 112 | httpRequestWrapper.removeHeaders("Content-Length"); 113 | return httpRequestWrapper; 114 | } catch (Exception e) { 115 | e.printStackTrace(); 116 | } 117 | return new HttpPost(uri); 118 | } else { 119 | return new HttpGet(uri); 120 | } 121 | } 122 | } 123 | 124 | protected CookieStore generatorCookie(Payload payload) { 125 | CookieStore cookieStore = new BasicCookieStore(); 126 | if (!payload.getOriginCookies().isEmpty()) { 127 | for (Map.Entry cookieEntry : payload.getOriginCookies().entrySet()) { 128 | BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); 129 | cookie.setDomain(payload.getDomain()); 130 | 
cookieStore.addCookie(cookie); 131 | } 132 | } 133 | if (!payload.getCookies().isEmpty()) { 134 | for (Map.Entry> domainEntry : payload.getCookies().entrySet()) { 135 | for (Map.Entry cookieEntry : domainEntry.getValue().entrySet()) { 136 | BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); 137 | cookie.setDomain(domainEntry.getKey()); 138 | cookieStore.addCookie(cookie); 139 | } 140 | } 141 | } 142 | return cookieStore; 143 | } 144 | 145 | } 146 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/net/AsyncHttpClientGenerator.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.net; 2 | 3 | import com.jibug.cetty.core.Payload; 4 | import org.apache.commons.lang3.StringUtils; 5 | import org.apache.http.Consts; 6 | import org.apache.http.HttpHost; 7 | import org.apache.http.client.config.RequestConfig; 8 | import org.apache.http.config.ConnectionConfig; 9 | import org.apache.http.config.Registry; 10 | import org.apache.http.config.RegistryBuilder; 11 | import org.apache.http.impl.nio.client.CloseableHttpAsyncClient; 12 | import org.apache.http.impl.nio.client.HttpAsyncClientBuilder; 13 | import org.apache.http.impl.nio.client.HttpAsyncClients; 14 | import org.apache.http.impl.nio.conn.PoolingNHttpClientConnectionManager; 15 | import org.apache.http.impl.nio.reactor.DefaultConnectingIOReactor; 16 | import org.apache.http.impl.nio.reactor.IOReactorConfig; 17 | import org.apache.http.nio.conn.NoopIOSessionStrategy; 18 | import org.apache.http.nio.conn.SchemeIOSessionStrategy; 19 | import org.apache.http.nio.reactor.ConnectingIOReactor; 20 | import org.apache.http.nio.reactor.IOReactorException; 21 | 22 | import java.nio.charset.CodingErrorAction; 23 | 24 | /** 25 | * @author heyingcai 26 | */ 27 | public class AsyncHttpClientGenerator extends AbstractHttpClientGenerator { 28 | 29 | 
private PoolingNHttpClientConnectionManager poolingNHttpClientConnectionManager; 30 | 31 | public AsyncHttpClientGenerator() { 32 | IOReactorConfig ioReactorConfig = IOReactorConfig.custom() 33 | .setIoThreadCount(Runtime.getRuntime().availableProcessors()) 34 | .build(); 35 | ConnectingIOReactor ioReactor = null; 36 | try { 37 | ioReactor = new DefaultConnectingIOReactor(ioReactorConfig); 38 | } catch (IOReactorException e) { 39 | e.printStackTrace(); 40 | } 41 | poolingNHttpClientConnectionManager = new PoolingNHttpClientConnectionManager(ioReactor, null, registry(), null); 42 | poolingNHttpClientConnectionManager.setDefaultMaxPerRoute(100); 43 | } 44 | 45 | @Override 46 | public CloseableHttpAsyncClient getClient(Payload payload) { 47 | return build(payload); 48 | } 49 | 50 | @Override 51 | protected CloseableHttpAsyncClient build(Payload payload) { 52 | HttpAsyncClientBuilder asyncClientBuilder = HttpAsyncClients.custom(); 53 | 54 | if (StringUtils.isNotBlank(payload.getUserAgent())) { 55 | asyncClientBuilder.setUserAgent(payload.getUserAgent()); 56 | } else { 57 | asyncClientBuilder.setUserAgent(""); 58 | } 59 | 60 | asyncClientBuilder.setRedirectStrategy(new CustomRedirectStrategy()); 61 | 62 | asyncClientBuilder.setConnectionManagerShared(true); 63 | 64 | RequestConfig requestConfig = RequestConfig.custom() 65 | .setConnectTimeout(payload.getConnectTimeout()) 66 | .setSocketTimeout(payload.getSocketTimeout()).build(); 67 | 68 | ConnectionConfig connectionConfig = ConnectionConfig.custom() 69 | .setMalformedInputAction(CodingErrorAction.IGNORE) 70 | .setUnmappableInputAction(CodingErrorAction.IGNORE) 71 | .setCharset(Consts.UTF_8).build(); 72 | 73 | poolingNHttpClientConnectionManager.setDefaultConnectionConfig(connectionConfig); 74 | asyncClientBuilder.setConnectionManager(poolingNHttpClientConnectionManager); 75 | asyncClientBuilder.setDefaultRequestConfig(requestConfig); 76 | if (payload.getProxy() != null) { 77 | Proxy proxy = payload.getProxy(); 78 | 
HttpHost httpHost = new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()); 79 | asyncClientBuilder.setProxy(httpHost); 80 | } 81 | reduceCookie(asyncClientBuilder,payload); 82 | return asyncClientBuilder.build(); 83 | } 84 | 85 | @Override 86 | protected Registry registry() { 87 | return RegistryBuilder 88 | .create() 89 | .register("http", NoopIOSessionStrategy.INSTANCE) 90 | .register("https", buildSSLIOSessionStrategy()) 91 | .build(); 92 | } 93 | 94 | private void reduceCookie(HttpAsyncClientBuilder asyncClientBuilder, Payload payload) { 95 | if (payload.isUnsupportedCookie()) { 96 | asyncClientBuilder.disableCookieManagement(); 97 | return; 98 | } 99 | asyncClientBuilder.setDefaultCookieStore(generatorCookie(payload)); 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/net/HttpClientFactory.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.net; 2 | 3 | import org.apache.http.impl.client.CloseableHttpClient; 4 | import org.apache.http.impl.nio.client.CloseableHttpAsyncClient; 5 | 6 | /** 7 | * @author heyingcai 8 | */ 9 | public class HttpClientFactory { 10 | 11 | private static HttpClientGenerator asyncHttpClientGenerator; 12 | 13 | private static HttpClientGenerator httpClientHttpClientGenerator; 14 | 15 | public void initHttpclient(boolean async) { 16 | 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/net/HttpClientGenerator.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.net; 2 | 3 | import com.jibug.cetty.core.Payload; 4 | 5 | /** 6 | * @author heyingcai 7 | */ 8 | public interface HttpClientGenerator { 9 | 10 | /** 11 | * get the real httpclient instance 12 | * 13 | * @param payload 14 | * @return 15 | */ 
/**
 * HTTP proxy description: host, port, optional credentials and scheme.
 * The scheme defaults to "http".
 *
 * @author heyingcai
 */
public class Proxy {

    private String host;
    private int port;
    private String username;
    private String password;
    private String scheme = "http";

    public Proxy(String host, int port) {
        this.host = host;
        this.port = port;
    }

    public Proxy(String host, int port, String scheme) {
        this.host = host;
        this.port = port;
        this.scheme = scheme;
    }

    public Proxy(String host, int port, String username, String password, String scheme) {
        this.host = host;
        this.port = port;
        this.username = username;
        this.password = password;
        this.scheme = scheme;
    }

    public String getHost() {
        return host;
    }

    public void setHost(String host) {
        this.host = host;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }

    public String getUsername() {
        return username;
    }

    public void setUsername(String username) {
        this.username = username;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }

    public String getScheme() {
        return scheme;
    }

    public void setScheme(String scheme) {
        this.scheme = scheme;
    }

    /**
     * Describes the proxy for logs and console output. A password that is set is
     * rendered as "******" so the secret never reaches the output; an unset
     * password is still shown as null.
     */
    @Override
    public String toString() {
        return "Proxy{" +
                "host='" + host + '\'' +
                ", port=" + port +
                ", username='" + username + '\'' +
                ", password='" + (password == null ? null : "******") + '\'' +
                ", scheme='" + scheme + '\'' +
                '}';
    }
}
| 32 | @Override 33 | protected CloseableHttpClient build(Payload payload) { 34 | HttpClientBuilder httpClientBuilder = HttpClients.custom(); 35 | 36 | httpClientBuilder.setConnectionManager(poolingHttpClientConnectionManager); 37 | if (payload.getUserAgent() != null) { 38 | httpClientBuilder.setUserAgent(payload.getUserAgent()); 39 | } else { 40 | httpClientBuilder.setUserAgent(""); 41 | } 42 | 43 | httpClientBuilder.setConnectionManagerShared(true); 44 | 45 | httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy()); 46 | 47 | SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); 48 | socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); 49 | socketConfigBuilder.setSoTimeout(payload.getSocketTimeout()); 50 | SocketConfig socketConfig = socketConfigBuilder.build(); 51 | httpClientBuilder.setDefaultSocketConfig(socketConfig); 52 | poolingHttpClientConnectionManager.setDefaultSocketConfig(socketConfig); 53 | httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(payload.getRetryTimes(), true)); 54 | reduceCookie(httpClientBuilder, payload); 55 | return httpClientBuilder.build(); 56 | } 57 | 58 | @Override 59 | protected Registry registry() { 60 | return RegistryBuilder.create() 61 | .register("http", PlainConnectionSocketFactory.INSTANCE) 62 | .register("https", buildSSLConnectionSocketFactory()) 63 | .build(); 64 | } 65 | 66 | private void reduceCookie(HttpClientBuilder httpClientBuilder, Payload payload) { 67 | if (payload.isUnsupportedCookie()) { 68 | httpClientBuilder.disableCookieManagement(); 69 | return; 70 | } 71 | httpClientBuilder.setDefaultCookieStore(generatorCookie(payload)); 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/scheduler/DuplicateStrategy.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.scheduler; 2 | 3 | /** 4 | * @author 
heyingcai 5 | */ 6 | public interface DuplicateStrategy { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/scheduler/QueueScheduler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.scheduler; 2 | 3 | import com.jibug.cetty.core.Seed; 4 | import com.google.common.collect.Queues; 5 | 6 | import java.util.concurrent.BlockingQueue; 7 | 8 | /** 9 | * @author heyingcai 10 | */ 11 | public class QueueScheduler implements Scheduler { 12 | 13 | private BlockingQueue queue = Queues.newLinkedBlockingQueue(); 14 | 15 | @Override 16 | public void push(Seed seed) { 17 | queue.add(seed); 18 | } 19 | 20 | @Override 21 | public Seed poll() { 22 | return queue.poll(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/scheduler/Scheduler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.scheduler; 2 | 3 | import com.jibug.cetty.core.Seed; 4 | 5 | /** 6 | * @author heyingcai 7 | */ 8 | public interface Scheduler { 9 | 10 | /** 11 | * push the seed to scheduler 12 | * 13 | * @param seed 14 | */ 15 | void push(Seed seed); 16 | 17 | /** 18 | * poll the seed from scheduler 19 | * 20 | * @return 21 | */ 22 | Seed poll(); 23 | 24 | } 25 | -------------------------------------------------------------------------------- /cetty-core/src/main/java/com/jibug/cetty/core/utils/UrlUtils.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.utils; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | 5 | import java.net.MalformedURLException; 6 | import java.net.URL; 7 | import java.nio.charset.Charset; 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | 11 | /** 12 | * url and html 
utils. 13 | * 14 | * @author code4crafter@gmail.com
/**
 * URL helper methods: resolving relative links, escaping a few illegal characters,
 * and extracting host, domain and charset information from strings.
 */
public class UrlUtils {

    /** Matches a leading-style protocol marker such as {@code http://}. */
    private static final Pattern patternForProtocal = Pattern.compile("[\\w]+://");

    /** Captures the value of a {@code charset=} attribute, optionally quoted. */
    private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);

    /**
     * Resolves {@code url} against {@code refer} and returns the absolute form.
     * <p>
     * Borrowed from Jsoup.
     *
     * @param url   the (possibly relative) url to resolve
     * @param refer the url of the page the link was found on
     * @return the absolute url, or an empty string when it cannot be built
     */
    public static String canonicalizeUrl(String url, String refer) {
        URL base;
        try {
            try {
                base = new URL(refer);
            } catch (MalformedURLException e) {
                // The base is unsuitable, but the url may be absolute on its own, so try that.
                URL abs = new URL(url);
                return abs.toExternalForm();
            }
            // Workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired.
            if (url.startsWith("?")) {
                url = base.getPath() + url;
            }
            URL abs = new URL(base, url);
            return abs.toExternalForm();
        } catch (MalformedURLException e) {
            return "";
        }
    }

    /**
     * Replaces spaces with {@code %20}.
     *
     * @param url url
     * @return new url
     * @deprecated kept for existing callers; {@link #fixIllegalCharacterInUrl(String)}
     *             performs the same space escaping.
     */
    @Deprecated
    public static String encodeIllegalCharacterInUrl(String url) {
        return url.replace(" ", "%20");
    }

    /**
     * Replaces spaces with {@code %20} and collapses runs of {@code #} into one.
     *
     * @param url url
     * @return the repaired url
     */
    public static String fixIllegalCharacterInUrl(String url) {
        //TODO support more characters
        return url.replace(" ", "%20").replaceAll("#+", "#");
    }

    /**
     * Returns everything before the third {@code /} of the url (for example
     * {@code http://host} from {@code http://host/path}).
     *
     * @param url the url, may be {@code null}
     * @return the prefix before the third slash, or the input unchanged when it has
     *         fewer than three slashes
     */
    public static String getHost(String url) {
        if (url == null) {
            return null;
        }
        int slash = -1;
        for (int seen = 0; seen < 3; seen++) {
            slash = url.indexOf('/', slash + 1);
            if (slash < 0) {
                return url;
            }
        }
        return url.substring(0, slash);
    }

    /**
     * Removes every {@code word://} marker from the url.
     */
    public static String removeProtocol(String url) {
        return patternForProtocal.matcher(url).replaceAll("");
    }

    /**
     * Returns the domain of the url: protocol, path and port are stripped.
     */
    public static String getDomain(String url) {
        String domain = removeProtocol(url);
        // Search from index 1 so a leading '/' is never treated as the path start.
        int pathStart = domain.indexOf("/", 1);
        if (pathStart > 0) {
            domain = domain.substring(0, pathStart);
        }
        return removePort(domain);
    }

    /**
     * Cuts the string at the first {@code :}, if there is one.
     */
    public static String removePort(String domain) {
        int portIndex = domain.indexOf(":");
        if (portIndex != -1) {
            return domain.substring(0, portIndex);
        } else {
            return domain;
        }
    }

    /**
     * Extracts the charset named in a Content-Type style string.
     *
     * @param contentType the header value, may be {@code null}
     * @return the charset name when one is present and supported by this JVM,
     *         otherwise {@code null}
     */
    public static String getCharset(String contentType) {
        if (contentType == null) {
            return null;
        }
        Matcher matcher = patternForCharset.matcher(contentType);
        if (matcher.find()) {
            String charset = matcher.group(1);
            try {
                if (Charset.isSupported(charset)) {
                    return charset;
                }
            } catch (IllegalArgumentException e) {
                // Empty or syntactically illegal charset name: treat it as "no charset".
                return null;
            }
        }
        return null;
    }

}
26 | //启动 27 | start(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /cetty-core/src/test/java/com/jibug/cetty/core/handler/HandlerTest.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.handler; 2 | 3 | import com.jibug.cetty.core.Cetty; 4 | import com.jibug.cetty.core.Seed; 5 | 6 | /** 7 | * @author heyingcai 8 | */ 9 | public class HandlerTest { 10 | 11 | public static void main(String[] args) { 12 | Cetty cetty = new Cetty(); 13 | 14 | HandlerPipeline pipeline = cetty.pipeline(); 15 | 16 | pipeline.addLast(new HandlerInitializer() { 17 | @Override 18 | public void initCetty(Cetty cetty) { 19 | HandlerPipeline pipeline = cetty.pipeline(); 20 | pipeline.addLast(new HttpDownloadHandler()); 21 | pipeline.addLast(new PageProcessHandler()); 22 | } 23 | }); 24 | 25 | pipeline.start(); 26 | 27 | Seed seed = new Seed(""); 28 | 29 | pipeline.download(seed); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /cetty-core/src/test/java/com/jibug/cetty/core/net/AsyncHttpClientTest.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.net; 2 | 3 | import com.jibug.cetty.core.Payload; 4 | import org.apache.http.HttpResponse; 5 | import org.apache.http.client.methods.HttpGet; 6 | import org.apache.http.concurrent.FutureCallback; 7 | import org.apache.http.impl.nio.client.CloseableHttpAsyncClient; 8 | import org.apache.http.util.EntityUtils; 9 | 10 | import java.io.IOException; 11 | 12 | /** 13 | * @author heyingcai 14 | */ 15 | public class AsyncHttpClientTest { 16 | 17 | public static void main(String[] args) { 18 | HttpClientGenerator asyncHttpClientGenerator = new AsyncHttpClientGenerator(); 19 | Payload payload = new Payload(); 20 | CloseableHttpAsyncClient client = asyncHttpClientGenerator.getClient(payload); 21 | 22 | 
client.start(); 23 | 24 | final HttpGet[] requests = new HttpGet[]{ 25 | new HttpGet("http://www.apache.org/"), 26 | new HttpGet("http://www.baidu.com/"), 27 | new HttpGet("http://www.oschina.net/") 28 | }; 29 | 30 | for(final HttpGet request: requests){ 31 | client.execute(request, new FutureCallback(){ 32 | @Override 33 | public void completed(Object obj) { 34 | final HttpResponse response = (HttpResponse)obj; 35 | System.out.println(request.getRequestLine() + "->" + response.getStatusLine()); 36 | } 37 | 38 | @Override 39 | public void failed(Exception excptn) { 40 | System.out.println(request.getRequestLine() + "->" + excptn); 41 | } 42 | 43 | @Override 44 | public void cancelled() { 45 | System.out.println(request.getRequestLine() + "cancelled"); 46 | } 47 | }); 48 | } 49 | 50 | 51 | // for (int i = 0; i < 2; i++) { 52 | // HttpRequestBase httpGet = new HttpGet("http://www.baidu.com"); 53 | // client.execute(httpGet, new Back()); 54 | // System.out.println("index :" + i); 55 | // httpGet.releaseConnection(); 56 | // } 57 | 58 | // client.execute(httpGet, new Back()); 59 | } 60 | 61 | static class Back implements FutureCallback { 62 | 63 | @Override 64 | public void completed(HttpResponse httpResponse) { 65 | try { 66 | System.out.println(EntityUtils.toString(httpResponse.getEntity())); 67 | } catch (IOException e) { 68 | e.printStackTrace(); 69 | } 70 | } 71 | 72 | @Override 73 | public void failed(Exception e) { 74 | System.err.println("error:" + e); 75 | } 76 | 77 | @Override 78 | public void cancelled() { 79 | 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /cetty-core/src/test/java/com/jibug/cetty/core/net/SyncHttpClientTest.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.core.net; 2 | 3 | import com.jibug.cetty.core.Payload; 4 | import org.apache.http.client.methods.CloseableHttpResponse; 5 | import 
org.apache.http.client.methods.HttpGet; 6 | import org.apache.http.client.methods.HttpRequestBase; 7 | import org.apache.http.impl.client.CloseableHttpClient; 8 | import org.apache.http.util.EntityUtils; 9 | 10 | import java.io.IOException; 11 | 12 | /** 13 | * @author heyingcai 14 | */ 15 | public class SyncHttpClientTest { 16 | public static void main(String[] args) { 17 | HttpClientGenerator httpClientHttpClientGenerator = new SyncHttpClientGenerator(); 18 | Payload payload = new Payload(); 19 | CloseableHttpClient client = httpClientHttpClientGenerator.getClient(payload); 20 | 21 | HttpRequestBase httpGet = new HttpGet("http://www.baidu.com"); 22 | 23 | try { 24 | CloseableHttpResponse execute = client.execute(httpGet); 25 | 26 | System.out.println(EntityUtils.toString(execute.getEntity())); 27 | } catch (IOException e) { 28 | e.printStackTrace(); 29 | } 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /cetty-core/src/test/java/com/jibug/cetty/sample/kuaidaili/Kuaidaili.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.sample.kuaidaili; 2 | 3 | import com.jibug.cetty.core.Bootstrap; 4 | import com.jibug.cetty.core.Page; 5 | import com.jibug.cetty.core.Payload; 6 | import com.jibug.cetty.core.handler.ConsoleReduceHandler; 7 | import com.jibug.cetty.core.handler.HandlerContext; 8 | import com.jibug.cetty.core.handler.ProcessHandlerAdapter; 9 | import com.jibug.cetty.core.net.Proxy; 10 | import com.google.common.collect.Lists; 11 | import org.jsoup.nodes.Document; 12 | import org.jsoup.nodes.Element; 13 | import org.jsoup.select.Elements; 14 | 15 | import java.util.List; 16 | 17 | /** 18 | * @author heyingcai 19 | */ 20 | public class Kuaidaili extends ProcessHandlerAdapter { 21 | 22 | @Override 23 | public void process(HandlerContext ctx, Page page) { 24 | Document document = page.getDocument(); 25 | 26 | //jsoup 27 | Elements elements = 
document.select("div#content>div.con-body>div>div#list>table>tbody>tr"); 28 | 29 | //xpath 30 | List select = page.getHtml().select("//div[@id='content']/div[@class='con-body']/div/div[@id='list']/table/tbody/tr/td[@data-title='IP']/text()"); 31 | 32 | List proxies = Lists.newArrayList(); 33 | for (Element element : elements) { 34 | String ip = element.select("td[data-title=IP]").text(); 35 | String port = element.select("td[data-title=PORT]").text(); 36 | String scheme = element.select("td[data-title=类型]").text(); 37 | Proxy proxy = new Proxy(ip, Integer.parseInt(port),scheme); 38 | proxies.add(proxy); 39 | } 40 | 41 | System.out.println(select); 42 | 43 | page.getResult().addResults(proxies); 44 | 45 | ctx.fireReduce(page); 46 | } 47 | 48 | public static void main(String[] args) throws InterruptedException { 49 | Bootstrap async = Bootstrap.me(). 50 | startUrl("https://www.kuaidaili.com/free"). 51 | addHandler(new Kuaidaili()).setThreadNum(1). 52 | addHandler(new ConsoleReduceHandler()). 53 | setPayload(Payload.custom()). 
54 | isAsync(false).isDuration(true); 55 | 56 | async.start(); 57 | 58 | Thread.sleep(15000); 59 | 60 | // async.stop(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /cetty-core/src/test/java/com/jibug/cetty/sample/tianya/Tianya.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.sample.tianya; 2 | 3 | import com.jibug.cetty.core.Bootstrap; 4 | import com.jibug.cetty.core.Page; 5 | import com.jibug.cetty.core.Payload; 6 | import com.jibug.cetty.core.Result; 7 | import com.jibug.cetty.core.Seed; 8 | import com.jibug.cetty.core.handler.ConsoleReduceHandler; 9 | import com.jibug.cetty.core.handler.HandlerContext; 10 | import com.jibug.cetty.core.handler.ProcessHandlerAdapter; 11 | import com.google.common.collect.Lists; 12 | import org.jsoup.nodes.Document; 13 | import org.jsoup.nodes.Element; 14 | import org.jsoup.select.Elements; 15 | 16 | import java.util.List; 17 | 18 | /** 19 | * 抓取天涯论坛文章列表标题 20 | * http://bbs.tianya.cn/list-333-1.shtml 21 | * 22 | * @author heyingcai 23 | */ 24 | public class Tianya extends ProcessHandlerAdapter { 25 | 26 | @Override 27 | public void process(HandlerContext ctx, Page page) { 28 | //获取 Document 29 | Document document = page.getDocument(); 30 | //dom解析 31 | Elements itemElements = document. 32 | select("div#bbsdoc>div#bd>div#main>div.mt5>table>tbody"). 33 | get(2). 
34 | select("tr"); 35 | List titles = Lists.newArrayList(); 36 | for (Element item : itemElements) { 37 | String title = item.select("td.td-title").text(); 38 | titles.add(title); 39 | } 40 | 41 | String href = document.select("div#bbsdoc>div#bd>div#main>div.short-pages-2>div.links>a[rel=nofollow]").first().attr("abs:href"); 42 | 43 | //添加下一页的请求 44 | page.addNextSeed(new Seed(href)); 45 | 46 | //添加下一页的请求 47 | // page.addNextSeed(new Seed("http://bbs.tianya.cn/list.jsp?item=333&nextid=1542249901000")); 48 | 49 | //获取Result对象,将我们解析出来的结果向下一个handler传递 50 | Result result = page.getResult(); 51 | result.addResults(titles); 52 | 53 | //通过fireXXX 方法将本handler 处理的结果向下传递 54 | //本教程直接将结果传递给ConsoleHandler,将结果直接输出控制台 55 | ctx.fireReduce(page); 56 | } 57 | 58 | public static void main(String[] args) { 59 | 60 | //启动引导类 61 | Bootstrap. 62 | me(). 63 | //使用同步抓取 64 | isAsync(true). 65 | //开启一个线程 66 | setThreadNum(1). 67 | //抓取入口url 68 | startUrl("http://bbs.tianya.cn/list-333-1.shtml"). 69 | //通用请求信息 70 | setPayload(Payload.custom().setSleepTime(5000)). 71 | //添加自定处理器 72 | addHandler(new Tianya()). 73 | //添加默认结果处理器,输出致控制台 74 | addHandler(new ConsoleReduceHandler()). 
75 | start(); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /cetty-samples/.gitignore: -------------------------------------------------------------------------------- 1 | HELP.md 2 | target/ 3 | !.mvn/wrapper/maven-wrapper.jar 4 | !**/src/main/** 5 | !**/src/test/** 6 | 7 | ### STS ### 8 | .apt_generated 9 | .classpath 10 | .factorypath 11 | .project 12 | .settings 13 | .springBeans 14 | .sts4-cache 15 | 16 | ### IntelliJ IDEA ### 17 | .idea 18 | *.iws 19 | *.iml 20 | *.ipr 21 | 22 | ### NetBeans ### 23 | /nbproject/private/ 24 | /nbbuild/ 25 | /dist/ 26 | /nbdist/ 27 | /.nb-gradle/ 28 | build/ 29 | 30 | ### VS Code ### 31 | .vscode/ 32 | -------------------------------------------------------------------------------- /cetty-samples/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | org.springframework.boot 7 | spring-boot-starter-parent 8 | 2.1.6.RELEASE 9 | 10 | 11 | com.jibug.cetty 12 | sample 13 | 0.0.1-SNAPSHOT 14 | cetty-sample 15 | sample for cetty 16 | 17 | 18 | 1.8 19 | 20 | 21 | 22 | 23 | org.springframework.boot 24 | spring-boot-starter 25 | 26 | 27 | 28 | org.springframework.boot 29 | spring-boot-starter-test 30 | test 31 | 32 | 33 | com.jibug.cetty 34 | cetty-core 35 | 0.1.8 36 | 37 | 38 | 39 | 40 | 41 | 42 | org.springframework.boot 43 | spring-boot-maven-plugin 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /cetty-samples/src/main/java/com/jibug/cetty/sample/SampleApplication.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.sample; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class SampleApplication { 8 | 9 | public static void main(String[] args) { 10 | 
/**
 * Plain data holder for one crawled article.
 *
 * @author heyingcai
 */
public class Article {

    private int id;
    private String title;
    private String summary;
    private String publishTime;
    private String url;
    private String listPhoto;
    private String content;
    private String via;

    public int getId() {
        return this.id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSummary() {
        return this.summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public String getPublishTime() {
        return this.publishTime;
    }

    public void setPublishTime(String publishTime) {
        this.publishTime = publishTime;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getListPhoto() {
        return this.listPhoto;
    }

    public void setListPhoto(String listPhoto) {
        this.listPhoto = listPhoto;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getVia() {
        return this.via;
    }

    public void setVia(String via) {
        this.via = via;
    }

    /**
     * Renders every field as {@code name='value'} (the id without quotes) inside
     * {@code Article{...}}.
     */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("Article{");
        text.append("id=").append(id);
        text.append(", title='").append(title).append('\'');
        text.append(", summary='").append(summary).append('\'');
        text.append(", publishTime='").append(publishTime).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", listPhoto='").append(listPhoto).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append(", via='").append(via).append('\'');
        return text.append('}').toString();
    }
}
articleBody.appendChild(buildFigure(imgEl)); 58 | continue; 59 | } 60 | articleBody.appendChild(pEl); 61 | } 62 | return articleBody; 63 | } 64 | 65 | /** 66 | * 解析图片格式 67 | * 68 | * @param bodyElement 69 | * @return 70 | */ 71 | protected Element buildFigure(Element bodyElement) { 72 | final Element figure = new Element(Tag.valueOf("figure"), ""); 73 | figure.appendChild(bodyElement); 74 | return figure; 75 | } 76 | 77 | /** 78 | * 解析图文格式 79 | * 80 | * @param figcaptionText 81 | * @return 82 | */ 83 | protected Element buildFigcaption(String figcaptionText) { 84 | final Element figcaption = new Element(Tag.valueOf("figcaption"), ""); 85 | figcaption.append(figcaptionText); 86 | return figcaption; 87 | } 88 | 89 | /** 90 | * 解析段落块格式 91 | * 92 | * @param blockquote 93 | * @param articleBody 94 | */ 95 | protected void buildBlockquote(String blockquote, Element articleBody) { 96 | final Element blockquoteEl = new Element(Tag.valueOf("blockquote"), ""); 97 | blockquoteEl.append(blockquote); 98 | articleBody.appendChild(blockquoteEl); 99 | } 100 | 101 | public void buildArticle(Article article, Page page, Element articleBody) { 102 | article.setContent(articleBody.toString()); 103 | article.setVia(page.getSeed().getAttach("via").toString()); 104 | article.setUrl(page.getUrl()); 105 | article.setSummary(page.getSeed().getAttach("summary").toString()); 106 | Object listPhoto = page.getSeed().getAttach("listPhoto"); 107 | article.setListPhoto(listPhoto == null ? 
null : listPhoto.toString()); 108 | article.setTitle(page.getSeed().getAttach("title").toString()); 109 | } 110 | 111 | public static String dealDateFormat(String dateStr) throws ParseException { 112 | SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 113 | SimpleDateFormat sd = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX"); 114 | Date date = sd.parse(dateStr); 115 | return sdf.format(date); 116 | } 117 | 118 | public String regex(String str, String regex) { 119 | final Pattern p = Pattern.compile(regex); 120 | final Matcher match = p.matcher(str); 121 | if (!match.find()) { 122 | return null; 123 | } 124 | 125 | String Str = match.group(1); 126 | if (Strings.isNullOrEmpty(Str)) { 127 | return null; 128 | } 129 | return Str; 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /cetty-samples/src/main/java/com/jibug/cetty/sample/handler/CifnewsPageHandler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.sample.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 | import com.jibug.cetty.core.Seed; 5 | import com.jibug.cetty.core.handler.HandlerContext; 6 | import com.jibug.cetty.sample.entity.Article; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.parser.Tag; 10 | import org.jsoup.select.Elements; 11 | import org.springframework.stereotype.Component; 12 | 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | 18 | /** 19 | * 雨果网抓取 20 | * https://www.cifnews.com/Search/1?keyword=shopify 21 | * 22 | * @author heyingcai 23 | */ 24 | @Component 25 | public class CifnewsPageHandler extends BasePageHandler { 26 | 27 | private static final Pattern PAGE_REGEX_PATTERN = Pattern.compile("/Search/(\\d+)"); 28 | 29 | 30 | @Override 31 | public void process(HandlerContext ctx, Page page) { 32 | parseRoute(ctx, 
page); 33 | } 34 | 35 | @Override 36 | protected void parseRoute(HandlerContext ctx, Page page) { 37 | String pageUrl = page.getUrl(); 38 | if (pageUrl.contains("/Search/")) { 39 | parseListing(ctx, page); 40 | } else { 41 | parseBody(ctx, page); 42 | } 43 | } 44 | 45 | @Override 46 | protected void parseListing(HandlerContext ctx, Page page) { 47 | Document document = page.getDocument(); 48 | 49 | Elements articles = document.select("ul.search_list>li"); 50 | 51 | List seeds = new ArrayList<>(); 52 | for (Element article : articles) { 53 | String url = article.select("a").attr("abs:href"); 54 | String title = article.select("a").text(); 55 | 56 | String summary = article.select("p").text(); 57 | 58 | Seed seed = new Seed(url); 59 | seed.putAttach("via", page.getSeed().getAttach("via")); 60 | seed.putAttach("summary", summary); 61 | seed.putAttach("title", title); 62 | 63 | seeds.add(seed); 64 | } 65 | 66 | Matcher matcher = PAGE_REGEX_PATTERN.matcher(page.getUrl()); 67 | if (!matcher.find()) { 68 | return; 69 | } 70 | 71 | final String pageNumStr = matcher.group(0).replace("/Search/", ""); 72 | int nextPageNum = Integer.parseInt(pageNumStr); 73 | int pageLimit = Integer.parseInt(page.getSeed().getAttach("pageLimit").toString()); 74 | 75 | if (++nextPageNum <= pageLimit) { 76 | String nextPageUrl = String.format("https://www.cifnews.com/Search/%d?keyword=shopify", nextPageNum); 77 | Seed seed = new Seed(nextPageUrl); 78 | seed.putAttach("pageLimit", page.getSeed().getAttach("pageLimit").toString()); 79 | seed.putAttach("via", page.getSeed().getAttach("via").toString()); 80 | page.addNextSeed(seed); 81 | } 82 | 83 | page.addNextSeed(seeds); 84 | 85 | ctx.fireReduce(page); 86 | } 87 | 88 | @Override 89 | protected void parseBody(HandlerContext ctx, Page page) { 90 | Document document = page.getDocument(); 91 | 92 | Elements content = document.select("div.article-box>div.leftcont>*"); 93 | 94 | String publishTime = content.select("div.info-bar>div.time").text(); 95 | 
Article article = new Article(); 96 | article.setPublishTime(publishTime); 97 | 98 | buildArticle(article, page, appendBody(content)); 99 | 100 | page.getResult().putField("article", article); 101 | 102 | ctx.fireReduce(page); 103 | } 104 | 105 | @Override 106 | public Element appendBody(Elements tempBody) { 107 | final Element articleBody = new Element(Tag.valueOf("div"), ""); 108 | String blockquote = tempBody.select("div.fetch-read>div.summary").text(); 109 | buildBlockquote(blockquote, articleBody); 110 | Elements inner = tempBody.select("div.article-inner>*"); 111 | for (Element pEl : inner) { 112 | if (pEl.select("div.fetch-present").size() != 0) { 113 | continue; 114 | } 115 | Element imgEl = pEl.select("p>img").first(); 116 | if (imgEl != null) { 117 | Element figure = buildFigure(imgEl); 118 | if (imgEl.nextElementSibling() != null && imgEl.nextElementSibling().tagName().equals("p")) { 119 | Element figcaption = buildFigcaption(imgEl.nextElementSibling().text()); 120 | figure.appendChild(figcaption); 121 | articleBody.appendChild(figure); 122 | continue; 123 | } 124 | articleBody.appendChild(figure); 125 | continue; 126 | } 127 | articleBody.appendChild(pEl); 128 | } 129 | return articleBody; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /cetty-samples/src/main/java/com/jibug/cetty/sample/handler/GuxiaobeiPageHandler.java: -------------------------------------------------------------------------------- 1 | package com.jibug.cetty.sample.handler; 2 | 3 | import com.jibug.cetty.core.Page; 4 | import com.jibug.cetty.core.Seed; 5 | import com.jibug.cetty.core.handler.HandlerContext; 6 | import com.jibug.cetty.sample.entity.Article; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.parser.Tag; 10 | import org.jsoup.select.Elements; 11 | import org.springframework.stereotype.Component; 12 | 13 | import java.text.ParseException; 14 | import java.util.ArrayList; 15 
| import java.util.List; 16 | import java.util.regex.Matcher; 17 | import java.util.regex.Pattern; 18 |
/**
 * Crawler handler for the Gu Xiaobei blog, e.g.
 * http://www.guxiaobei.com/shopify
 *
 * @author heyingcai
 */
@Component
public class GuxiaobeiPageHandler extends BasePageHandler {

    /** Captures the page number of a listing URL. */
    private static final Pattern PAGE_REGEX_PATTERN = Pattern.compile("/page/(\\d+)");

    /** Host the blog's images are served from when their URL is relative. */
    private static final String SITE_HOST = "www.guxiaobei.com";

    @Override
    public void process(HandlerContext ctx, Page page) {
        parseRoute(ctx, page);
    }

    /**
     * URLs containing {@code /page/} are listings; everything else is treated
     * as an article page.
     */
    @Override
    protected void parseRoute(HandlerContext ctx, Page page) {
        if (page.getUrl().contains("/page/")) {
            parseListing(ctx, page);
        } else {
            parseBody(ctx, page);
        }
    }

    /**
     * Parses the article listing: every {@code article.excerpt} becomes a seed
     * carrying {@code via}, {@code listPhoto}, {@code summary} and
     * {@code title}; the next listing page is queued while the page limit is
     * not reached; finally the reduce stage is fired.
     * <p>
     * The article seeds are queued even when the next page cannot be
     * determined; previously they were silently discarded in that case.
     *
     * @param ctx  handler context of the current pipeline invocation
     * @param page the downloaded listing page
     */
    @Override
    public void parseListing(HandlerContext ctx, Page page) {
        final Document document = page.getDocument();

        final List<Seed> seeds = new ArrayList<>();
        for (Element article : document.select("article.excerpt")) {
            final Elements header = article.select("header>h2>a");
            final String url = header.attr("href");
            if (url.isEmpty()) {
                // No link: nothing to crawl for this entry.
                continue;
            }

            // An excerpt without a summary node yields an empty summary rather than an NPE.
            final Element note = article.select("span.note").first();

            final Seed seed = new Seed(url);
            seed.putAttach("via", page.getSeed().getAttach("via"));
            seed.putAttach("listPhoto", article.select("div.focus>a>img").attr("src"));
            seed.putAttach("summary", note == null ? "" : note.ownText());
            seed.putAttach("title", header.attr("title"));
            seeds.add(seed);
        }

        final Seed nextPage = nextListingSeed(page);
        if (nextPage != null) {
            page.addNextSeed(nextPage);
        }
        page.addNextSeed(seeds);

        ctx.fireReduce(page);
    }

    /**
     * Builds the seed for the following listing page by incrementing the page
     * number inside the current URL.
     *
     * @return the next-page seed, or {@code null} when the URL has no page
     *         number, {@code pageLimit} is missing/not numeric, or the limit is reached
     */
    private Seed nextListingSeed(Page page) {
        final String currentUrl = page.getUrl();
        final Matcher matcher = PAGE_REGEX_PATTERN.matcher(currentUrl);
        final Object pageLimitAttach = page.getSeed().getAttach("pageLimit");
        if (!matcher.find() || pageLimitAttach == null) {
            return null;
        }

        int currentPageNum;
        int pageLimit;
        try {
            currentPageNum = Integer.parseInt(matcher.group(1));
            pageLimit = Integer.parseInt(pageLimitAttach.toString().trim());
        } catch (NumberFormatException e) {
            return null;
        }
        if (currentPageNum >= pageLimit) {
            return null;
        }

        final String nextPageUrl = currentUrl.substring(0, matcher.start(1))
                + (currentPageNum + 1)
                + currentUrl.substring(matcher.end(1));
        final Seed seed = new Seed(nextPageUrl);
        seed.putAttach("pageLimit", pageLimitAttach.toString());
        final Object via = page.getSeed().getAttach("via");
        if (via != null) {
            seed.putAttach("via", via.toString());
        }
        return seed;
    }

    /**
     * Parses the article content: reads the publish time from the
     * {@code article:published_time} meta tag (left empty when absent or
     * unparsable), builds the {@link Article}, stores it under the
     * {@code article} result field and fires the reduce stage.
     *
     * @param ctx  handler context of the current pipeline invocation
     * @param page the downloaded article page
     */
    @Override
    public void parseBody(HandlerContext ctx, Page page) {
        final Document document = page.getDocument();

        final String rawPublishTime =
                document.select("meta[property=article:published_time]").attr("content");
        String publishTime = "";
        if (!rawPublishTime.isEmpty()) {
            try {
                publishTime = dealDateFormat(rawPublishTime);
            } catch (ParseException e) {
                // Best effort: keep the article, just without a publish time.
                e.printStackTrace();
            }
        }

        final Article article = new Article();
        article.setPublishTime(publishTime);

        final Elements content = document.select("article.article-content>*");

        buildArticle(article, page, appendBody(content));

        page.getResult().putField("article", article);

        ctx.fireReduce(page);
    }

    /**
     * Builds the article body, skipping site widgets (open message, related
     * posts, social bar). The first image of a paragraph is wrapped in a
     * {@code <figure>} after its {@code src} has been made absolute; lazily
     * loaded images (inline {@code data:} placeholder) take their URL from
     * {@code data-src}.
     */
    @Override
    public Element appendBody(Elements tempBody) {
        final Element articleBody = new Element(Tag.valueOf("div"), "");
        for (final Element pEl : tempBody) {
            if (!pEl.select("div.open-message,div.jp-relatedposts,div.article-social").isEmpty()) {
                continue;
            }
            final Element imgEl = pEl.tagName().equals("p") ? pEl.select("img").first() : null;
            if (imgEl == null) {
                articleBody.appendChild(pEl);
                continue;
            }

            String src = imgEl.attr("src");
            if (src.contains("data:image")) {
                src = imgEl.attr("data-src");
            }
            imgEl.attr("src", absoluteImageUrl(src));

            articleBody.appendChild(buildFigure(imgEl));
        }
        return articleBody;
    }

    /**
     * Resolves an image URL against the blog host. URLs that already carry a
     * scheme (including images hosted elsewhere) or already mention the blog
     * host are returned unchanged; previously every URL not containing the
     * blog host was prefixed, which corrupted absolute third-party URLs.
     */
    private static String absoluteImageUrl(String src) {
        if (src.isEmpty()
                || src.contains(SITE_HOST)
                || src.startsWith("http://")
                || src.startsWith("https://")
                || src.startsWith("data:")) {
            return src;
        }
        if (src.startsWith("//")) {
            // Protocol-relative URL: only the scheme is missing.
            return "http:" + src;
        }
        return "http://" + SITE_HOST + (src.startsWith("/") ? src : "/" + src);
    }
}

--------------------------------------------------------------------------------
/cetty-samples/src/main/java/com/jibug/cetty/sample/handler/Waimaob2cPageHandler.java:
--------------------------------------------------------------------------------
package com.jibug.cetty.sample.handler;

import com.jibug.cetty.core.Page;
import com.jibug.cetty.core.Seed;
import com.jibug.cetty.core.handler.HandlerContext;
import com.jibug.cetty.sample.entity.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler handler for the waimaob2c blog,
 * https://www.waimaob2c.com/
 *
 * @author heyingcai
 */
@Component
public class Waimaob2cPageHandler extends BasePageHandler {

    /** Captures the page number of a listing URL. */
    private static final Pattern PAGE_REGEX_PATTERN = Pattern.compile("/page/(\\d+)");

    @Override
    public void process(HandlerContext ctx, Page page) {
        parseRoute(ctx, page);
    }

    /**
     * URLs containing {@code /page/} are listings; everything else is treated
     * as an article page.
     */
    @Override
    protected void parseRoute(HandlerContext ctx, Page page) {
        if (page.getUrl().contains("/page/")) {
            parseListing(ctx, page);
        } else {
            parseBody(ctx, page);
        }
    }

    /**
     * Turns every {@code article.excerpt} into a seed carrying {@code title},
     * {@code listPhoto} (only when a photo URL is present), {@code summary}
     * and {@code via}; entries whose category label contains the news tag are
     * skipped. Queues the next listing page while the page limit is not
     * reached and fires the reduce stage.
     * <p>
     * The article seeds are queued even when the next page cannot be
     * determined; previously they were silently discarded in that case.
     */
    @Override
    protected void parseListing(HandlerContext ctx, Page page) {
        final Document document = page.getDocument();

        final List<Seed> seeds = new ArrayList<>();
        for (Element article : document.select("article.excerpt")) {
            final Element category = article.select("header>a").first();
            if (category != null && category.ownText().contains("资讯动态")) {
                continue;
            }

            final Element header = article.select("header").first();
            final Elements titleLinks = header == null ? new Elements() : header.select("h2>a");
            if (titleLinks.isEmpty()) {
                // Malformed excerpt without a title link: nothing to crawl.
                continue;
            }

            final Seed seed = new Seed(titleLinks.attr("href"));
            seed.putAttach("title", titleLinks.first().ownText());

            // select() never returns null, so emptiness is what has to be tested.
            final String listPhotoUrl =
                    article.select("p.focus").select("a>img,a>span>img").attr("data-original");
            if (!listPhotoUrl.isEmpty()) {
                seed.putAttach("listPhoto", listPhotoUrl);
            }

            seed.putAttach("summary", article.select("p.note").text());
            seed.putAttach("via", page.getSeed().getAttach("via"));

            seeds.add(seed);
        }

        final Seed nextPage = nextListingSeed(page);
        if (nextPage != null) {
            page.addNextSeed(nextPage);
        }
        page.addNextSeed(seeds);

        ctx.fireReduce(page);
    }

    /**
     * Builds the seed for the following listing page by incrementing the page
     * number inside the current URL.
     *
     * @return the next-page seed, or {@code null} when the URL has no page
     *         number, {@code pageLimit} is missing/not numeric, or the limit is reached
     */
    private Seed nextListingSeed(Page page) {
        final String currentUrl = page.getUrl();
        final Matcher matcher = PAGE_REGEX_PATTERN.matcher(currentUrl);
        final Object pageLimitAttach = page.getSeed().getAttach("pageLimit");
        if (!matcher.find() || pageLimitAttach == null) {
            return null;
        }

        int currentPageNum;
        int pageLimit;
        try {
            currentPageNum = Integer.parseInt(matcher.group(1));
            pageLimit = Integer.parseInt(pageLimitAttach.toString().trim());
        } catch (NumberFormatException e) {
            return null;
        }
        if (currentPageNum >= pageLimit) {
            return null;
        }

        final String nextPageUrl = currentUrl.substring(0, matcher.start(1))
                + (currentPageNum + 1)
                + currentUrl.substring(matcher.end(1));
        final Seed seed = new Seed(nextPageUrl);
        seed.putAttach("pageLimit", pageLimitAttach.toString());
        final Object via = page.getSeed().getAttach("via");
        if (via != null) {
            seed.putAttach("via", via.toString());
        }
        return seed;
    }

    /**
     * Extracts publish time (first meta list item with its fixed prefix
     * removed; empty when the item is missing) and content of an article page,
     * stores the resulting {@link Article} under the {@code article} result
     * field and fires the reduce stage.
     */
    @Override
    protected void parseBody(HandlerContext ctx, Page page) {
        final Document document = page.getDocument();

        final Element metaEl = document.select("header.article-header>ul.article-meta>li").first();
        final String publishTime =
                metaEl == null ? "" : metaEl.ownText().replace("Shopify 发布于 ", "");

        final Article article = new Article();
        article.setPublishTime(publishTime);

        final Elements content = document.select("article.article-content>*");

        buildArticle(article, page, appendBody(content));

        page.getResult().putField("article", article);

        ctx.fireReduce(page);
    }

    /**
     * Builds the article body: the sign-up call-to-action paragraph and the
     * copyright paragraph are dropped, elements containing an image are
     * replaced by a {@code <figure>} around their first image, everything
     * else is copied as is.
     */
    @Override
    public Element appendBody(Elements tempBody) {
        final Element articleBody = new Element(Tag.valueOf("div"), "");
        for (Element body : tempBody) {
            if (body.tagName().equals("p") && isBoilerplate(body)) {
                continue;
            }
            final Element imgEl = body.select("img").first();
            articleBody.appendChild(imgEl != null ? buildFigure(imgEl) : body);
        }
        return articleBody;
    }

    /**
     * Tells whether a paragraph is the sign-up promotion or the copyright notice.
     */
    private static boolean isBoilerplate(Element paragraph) {
        final boolean skipRegister = paragraph.select("p").text()
                .contains("即刻注册SHOPIFY账户, 跟着我们精心准备的SHOPIFY教程开始外贸独立站之旅!");
        final boolean skipCopyRight = paragraph.classNames().contains("post-copyright");
        return skipRegister || skipCopyRight;
    }
}

--------------------------------------------------------------------------------
/cetty-samples/src/main/java/com/jibug/cetty/sample/reduce/ArticleReducer.java:
--------------------------------------------------------------------------------
package com.jibug.cetty.sample.reduce;

import com.jibug.cetty.core.Page;
import com.jibug.cetty.core.Result;
import com.jibug.cetty.core.handler.HandlerContext;
import com.jibug.cetty.core.handler.ReduceHandlerAdapter;
import com.jibug.cetty.sample.entity.Article;
import org.springframework.stereotype.Component;

import java.util.Map;

/**
 * Reduce-stage handler that prints every extracted {@link Article}.
 *
 * @author heyingcai
 */
@Component
public class ArticleReducer extends ReduceHandlerAdapter {

    /**
     * Prints the {@code article} result field of the page to standard output.
     * Pages without results (e.g. listing pages) or without an
     * {@link Article} under that key are ignored.
     *
     * @param ctx  handler context of the current pipeline invocation
     * @param page the processed page
     */
    @Override
    public void reduce(HandlerContext ctx, Page page) {
        final Result result = page.getResult();

        final Map<?, ?> fieldResult = result.getFieldResult();
        if (fieldResult == null || fieldResult.isEmpty()) {
            return;
        }

        final Object article = fieldResult.get("article");
        if (!(article instanceof Article)) {
            // Nothing (or something unexpected) stored under the key: do not print "null".
            return;
        }

        System.out.println(article);
    }

}

--------------------------------------------------------------------------------
/cetty-samples/src/main/java/com/jibug/cetty/sample/runner/CrawlerRunner.java:
--------------------------------------------------------------------------------
package com.jibug.cetty.sample.runner;

import com.jibug.cetty.sample.service.CrawlerService;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

import javax.annotation.Resource;

/**
 * Starts the configured crawlers once the Spring Boot application is up.
 *
 * @author heyingcai
 * @date 2019-07-02 17:53
 */
@Component
public class CrawlerRunner implements CommandLineRunner {

    @Resource
    private CrawlerService crawlerService;

    /**
     * Delegates to {@link CrawlerService#start()}; the command line arguments
     * are not used.
     */
    @Override
    public void run(String... args) throws Exception {
        crawlerService.start();
    }
}

--------------------------------------------------------------------------------
/cetty-samples/src/main/java/com/jibug/cetty/sample/service/CrawlerService.java:
--------------------------------------------------------------------------------
package com.jibug.cetty.sample.service;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.jibug.cetty.core.Bootstrap;
import com.jibug.cetty.core.Payload;
import com.jibug.cetty.core.Seed;
import com.jibug.cetty.core.handler.ProcessHandlerAdapter;
import com.jibug.cetty.core.handler.ReduceHandlerAdapter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.stereotype.Component;

import javax.annotation.Resource;

/**
 * Creates and starts one crawler {@link Bootstrap} per enabled task of the
 * task configuration provided by {@link TaskService}.
 *
 * @author heyingcai
 * @date 2019-07-02 17:43
 */
@Component
public class CrawlerService {

    private static final Logger logger = LoggerFactory.getLogger(CrawlerService.class);

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    @Resource
    private TaskService taskService;

    @Resource
    private ApplicationContext applicationContext;

    /**
     * Starts a crawler for every task whose {@code status} is present and
     * non-zero. Tasks with {@code status} 0 or without a status are skipped.
     * Does nothing when no tasks are configured.
     */
    public void start() {
        final JSONArray taskObject = taskService.getTaskObject();
        if (taskObject == null || taskObject.isEmpty()) {
            logger.warn("No crawl tasks configured, nothing to start");
            return;
        }

        logger.info("获取到待抓取任务 {}", taskObject.toJSONString());
        for (int i = 0; i < taskObject.size(); i++) {
            final JSONObject task = taskObject.getJSONObject(i);
            if (!isEnabled(task)) {
                continue;
            }
            buildBootstrap(task).start();
        }
    }

    /**
     * A task is enabled when it defines a non-zero {@code status}.
     */
    private static boolean isEnabled(JSONObject task) {
        if (task == null) {
            return false;
        }
        // getInteger returns null for a missing key; comparing it with == 0 would NPE on unboxing.
        final Integer status = task.getInteger("status");
        return status != null && status != 0;
    }

    /**
     * Defines the crawler bootstrap for one task: start seed with {@code via}
     * and {@code pageLimit} attached, page handler and reducer beans looked up
     * by the names given in the task, and the configured thread count
     * ({@code threadNum}, defaulting to 1 when absent or not positive).
     */
    private Bootstrap buildBootstrap(JSONObject task) {
        final Seed seed = new Seed(task.getString("url"));
        seed.putAttach("via", task.getString("via"));
        seed.putAttach("pageLimit", task.getString("pageLimit"));

        final Integer configuredThreads = task.getInteger("threadNum");
        final int threadNum = (configuredThreads == null || configuredThreads < 1) ? 1 : configuredThreads;

        return Bootstrap.me()
                .startSeed(seed)
                .addHandler(applicationContext.getBean(task.getString("pageHandler"), ProcessHandlerAdapter.class))
                .addHandler(applicationContext.getBean(task.getString("pageReducer"), ReduceHandlerAdapter.class))
                .setThreadNum(threadNum)
                .setPayload(Payload.custom()
                        .addHeader("User-Agent", USER_AGENT))
                .isAsync(false);
    }

}

--------------------------------------------------------------------------------
/cetty-samples/src/main/java/com/jibug/cetty/sample/service/TaskService.java:
--------------------------------------------------------------------------------
package com.jibug.cetty.sample.service;

import com.alibaba.fastjson.JSONArray;
import org.apache.commons.io.IOUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;
import java.io.IOException;
import java.nio.charset.Charset;

/**
 * Loads the crawl task definitions from {@code classpath:crawler.json}.
 *
 * @author heyingcai
 * @date 2019-07-02 17:41
 */
@Component
public class TaskService {

    @Value("classpath:crawler.json")
    private Resource resource;

    /** Parsed task list; stays empty (never null) when the file cannot be read. */
    private JSONArray taskObject = new JSONArray();

    /**
     * Reads and parses the task file once after dependency injection. The
     * stream is closed after reading. An I/O failure is reported on standard
     * error and leaves the task list empty.
     */
    @PostConstruct
    private void parse() {
        try (java.io.InputStream in = resource.getInputStream()) {
            final String jsonString = IOUtils.toString(in, Charset.forName("UTF-8").name());
            final JSONArray parsed = JSONArray.parseArray(jsonString);
            if (parsed != null) {
                taskObject = parsed;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * @return the configured tasks; empty when none could be loaded
     */
    public JSONArray getTaskObject() {
        return taskObject;
    }
}

-------------------------------------------------------------------------------- /cetty-samples/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /cetty-samples/src/main/resources/crawler.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url": "http://www.guxiaobei.com/search/shopify/page/1", 4 | "pageLimit": 2, 5 | "threadNum": 1, 6 | "pageHandler": "guxiaobeiPageHandler", 7 | "pageReducer": "articleReducer", 8 | "via": "顾小北", 9 | "status": 1 10 | }, 11 | { 12 | "url": "https://www.cifnews.com/Search/1?keyword=shopify", 13 | "pageLimit": 16, 14 | "threadNum": 1, 15 | "pageHandler": "cifnewsPageHandler", 16 | "pageReducer": "articleReducer", 17 | "via": "雨果网", 18 | "status": 0 19 | }, 20 | { 21 | "url": "https://www.waimaob2c.com/page/1", 22 | "pageLimit": 10, 23 | "threadNum": 1, 24 | "pageHandler": "waimaob2cPageHandler", 25 | "pageReducer": "articleReducer", 26 | "via": "waimaob2c", 27 | "status": 0 28 | } 29 | ] -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | cetty-parent 8 | com.jibug.cetty 9 | cetty-parent 10 | pom 11 | 0.1.8 12 | 13 | cetty is an event dispatch crawler framework 14 | 15 |
https://github.com/heyingcai/cetty 16 | 17 | 18 | bronson 19 | Yingcai He 20 | admin@jibug.com 21 | 22 | 23 | 24 | 25 | 26 | Apache License, Version 2.0 27 | http://www.apache.org/licenses/LICENSE-2.0 28 | 29 | 30 | 31 | scm:git:git@github.com:heyingcai/cetty.git 32 | scm:git:git@github.com:heyingcai/cetty.git 33 | git@github.com:heyingcai/cetty.git 34 | 35 | 36 | cetty-core 37 | cetty-samples 38 | 39 | 40 | 41 | 21.0 42 | 1.16.20 43 | 1.7.12 44 | 1.0.13 45 | 4.5.1 46 | 4.4.6 47 | 4.3.1 48 | 4.1.3 49 | 1.2.28 50 | 3.1 51 | 3.2.2 52 | 1.3.2 53 | 1.10.3 54 | 2.2.1 55 | ${java.home}/../bin/javadoc 56 | 57 | 58 | 59 | 60 | 61 | 62 | junit 63 | junit 64 | 4.11 65 | test 66 | 67 | 68 | com.google.guava 69 | guava 70 | ${guava.version} 71 | 72 | 73 | org.slf4j 74 | slf4j-api 75 | ${slf4j.version} 76 | 77 | 78 | ch.qos.logback 79 | logback-classic 80 | ${logback.version} 81 | 82 | 83 | org.projectlombok 84 | lombok 85 | ${lombok.version} 86 | 87 | 88 | org.apache.httpcomponents 89 | httpclient 90 | ${httpclient.version} 91 | 92 | 93 | 94 | org.apache.httpcomponents 95 | httpmime 96 | ${httpclient-httpmime.version} 97 | 98 | 99 | org.apache.httpcomponents 100 | httpcore 101 | ${httpclient-core.version} 102 | 103 | 104 | org.apache.httpcomponents 105 | httpasyncclient 106 | ${httpclient-async.version} 107 | 108 | 109 | com.alibaba 110 | fastjson 111 | ${fastjson.version} 112 | 113 | 114 | org.apache.commons 115 | commons-lang3 116 | ${common-lang.version} 117 | 118 | 119 | commons-collections 120 | commons-collections 121 | ${commons-collections.version} 122 | 123 | 124 | org.apache.commons 125 | commons-io 126 | ${commons-io.version} 127 | 128 | 129 | org.jsoup 130 | jsoup 131 | ${jsoup.version} 132 | 133 | 134 | cn.wanghaomiao 135 | JsoupXpath 136 | ${jsoupXpath.version} 137 | 138 | 139 | 140 | 141 | 142 | 143 | release 144 | 145 | 146 | 147 | 148 | org.apache.maven.plugins 149 | maven-source-plugin 150 | 2.2.1 151 | 152 | 153 | package 154 | 155 | jar-no-fork 156 | 157 | 
158 | 159 | 160 | 161 | 162 | org.apache.maven.plugins 163 | maven-javadoc-plugin 164 | 2.9.1 165 | 166 | private 167 | true 168 | UTF-8 169 | UTF-8 170 | UTF-8 171 | -Xdoclint:none 172 | 173 | 174 | 175 | package 176 | 177 | jar 178 | 179 | 180 | 181 | 182 | 183 | 184 | org.apache.maven.plugins 185 | maven-gpg-plugin 186 | 1.5 187 | 188 | 189 | verify 190 | 191 | sign 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | snapshots 201 | https://oss.sonatype.org/content/repositories/snapshots/ 202 | 203 | 204 | oss 205 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 206 | 207 | 208 | 209 | 210 | --------------------------------------------------------------------------------