├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── --elise---.md │ ├── bug_report.md │ └── feature_request.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── Elise-client ├── build.gradle └── src │ ├── main │ └── java │ │ └── site │ │ └── zido │ │ └── elise │ │ └── client │ │ ├── SpiderBuilder.java │ │ └── scheduler │ │ ├── MultiThreadTaskScheduler.java │ │ └── SyncTaskScheduler.java │ └── test │ ├── java │ └── site │ │ └── zido │ │ └── elise │ │ └── client │ │ └── SpiderTest.java │ └── resources │ └── log4j2.xml ├── Elise-core ├── build.gradle └── src │ ├── main │ └── java │ │ └── site │ │ └── zido │ │ └── elise │ │ ├── E.java │ │ ├── Operator.java │ │ ├── Spider.java │ │ ├── custom │ │ ├── Config.java │ │ ├── ConfigUtils.java │ │ ├── GlobalConfig.java │ │ ├── GlobalConfigBuilder.java │ │ ├── HttpClientConfig.java │ │ ├── MappedConfig.java │ │ └── SiteConfig.java │ │ ├── downloader │ │ ├── AbstractDownloaderFactory.java │ │ ├── DefaultDownloaderFactory.java │ │ ├── DownloadException.java │ │ ├── Downloader.java │ │ ├── DownloaderFactory.java │ │ ├── HtmlUnitDownloader.java │ │ ├── HtmlUnitDownloaderFacotory.java │ │ ├── HttpClientDownloader.java │ │ ├── HttpClientDownloaderFactory.java │ │ ├── ProxiableDownloader.java │ │ └── httpclient │ │ │ ├── CustomRedirectStrategy.java │ │ │ └── HttpClientHeaderWrapper.java │ │ ├── events │ │ ├── EventListener.java │ │ ├── EventSupport.java │ │ ├── SingleEventListener.java │ │ ├── SingleListenerContainer.java │ │ ├── SingleProcessorEventListener.java │ │ └── TaskEventListener.java │ │ ├── http │ │ ├── Body.java │ │ ├── Cookie.java │ │ ├── Header.java │ │ ├── Http.java │ │ ├── HttpModel.java │ │ ├── Pair.java │ │ ├── Request.java │ │ ├── RequestBuilder.java │ │ ├── Response.java │ │ ├── SimpleHttpClient.java │ │ └── impl │ │ │ ├── DefaultBody.java │ │ │ ├── DefaultCookie.java │ │ │ ├── DefaultHeader.java │ │ │ ├── DefaultRequest.java │ │ │ ├── DefaultResponse.java │ │ │ └── HttpClientBodyWrapper.java │ │ ├── processor │ │ 
├── BlankSaver.java │ │ ├── DefaultResponseProcessor.java │ │ ├── ListenableResponseProcessor.java │ │ ├── MemorySaver.java │ │ ├── ProcessorEventListener.java │ │ ├── ResponseContextHolder.java │ │ ├── ResponseHolder.java │ │ ├── ResponseProcessor.java │ │ ├── ResultItem.java │ │ ├── Saver.java │ │ └── SaverHandler.java │ │ ├── proxy │ │ ├── Proxy.java │ │ ├── ProxyProvider.java │ │ └── SimpleProxyProvider.java │ │ ├── scheduler │ │ ├── AbstractScheduler.java │ │ ├── CountManager.java │ │ ├── DefaultMemoryCountManager.java │ │ ├── DefaultOperator.java │ │ ├── DuplicationProcessor.java │ │ ├── HashSetDeduplicationProcessor.java │ │ ├── MonitorableScheduler.java │ │ ├── NoDepuplicationProcessor.java │ │ ├── OperationalTaskScheduler.java │ │ ├── Seed.java │ │ └── TaskScheduler.java │ │ ├── select │ │ ├── CssSelectHandler.java │ │ ├── CssSelector.java │ │ ├── ElementSelector.java │ │ ├── FieldType.java │ │ ├── Fragment.java │ │ ├── HtmlLinkSelector.java │ │ ├── LinkSelectHandler.java │ │ ├── ModelExtractor.java │ │ ├── NumberMatcherSelectHandler.java │ │ ├── OriginSelectorHandler.java │ │ ├── Paragraph.java │ │ ├── RegexSelectHandler.java │ │ ├── RichType.java │ │ ├── SelectHandler.java │ │ ├── SelectorMatchException.java │ │ ├── SiteMatcherSelectHandler.java │ │ ├── XpathSelectHandler.java │ │ └── XpathSelector.java │ │ ├── task │ │ ├── DefaultTask.java │ │ ├── Task.java │ │ ├── annotations │ │ │ ├── EliseHelper.java │ │ │ ├── EliseModel.java │ │ │ ├── ElisePartition.java │ │ │ └── EliseTarget.java │ │ ├── api │ │ │ ├── DataDescriptor.java │ │ │ ├── DefaultSelectableResponse.java │ │ │ ├── ElementSelectable.java │ │ │ ├── ElementValue.java │ │ │ ├── HelpDescriptor.java │ │ │ ├── PartitionDescriptor.java │ │ │ ├── RepeatMatchException.java │ │ │ ├── ResponseHandler.java │ │ │ ├── SelectableResponse.java │ │ │ ├── Source.java │ │ │ ├── TargetDescriptor.java │ │ │ └── Value.java │ │ └── model │ │ │ ├── Action.java │ │ │ ├── Model.java │ │ │ ├── ModelField.java │ │ │ └── 
Partition.java │ │ └── utils │ │ ├── ActionUtils.java │ │ ├── Asserts.java │ │ ├── EventUtils.java │ │ ├── HtmlUtils.java │ │ ├── IdWorker.java │ │ ├── ModuleNamedDefaultThreadFactory.java │ │ ├── Safe.java │ │ ├── StringUtils.java │ │ ├── SystemClock.java │ │ ├── UrlUtils.java │ │ └── ValidateUtils.java │ └── test │ ├── java │ └── site │ │ └── zido │ │ └── elise │ │ ├── task │ │ └── api │ │ │ └── DefaultSelectableResponseTest.java │ │ └── utils │ │ └── HtmlUtilsTest.java │ └── resources │ ├── db.properties │ ├── log4j2.xml │ └── task │ └── api │ └── model1.json ├── Elise-distributed ├── build.gradle └── src │ ├── main │ └── java │ │ └── site │ │ └── zido │ │ └── elise │ │ └── distributed │ │ └── AbstractQueueScheduler.java │ └── test │ └── resources │ └── log4j2.xml ├── Elise-jedis-support └── build.gradle ├── Elise-kafka-support ├── build.gradle └── src │ ├── main │ ├── java │ │ └── site │ │ │ └── zido │ │ │ └── elise │ │ │ └── kafka │ │ │ ├── SpringKafkaTaskScheduler.java │ │ │ └── pojo │ │ │ └── Seed.java │ └── resources │ │ └── .gitkeep │ └── test │ └── java │ └── .gitkeep ├── Elise-redis-support ├── build.gradle └── src │ └── main │ └── java │ └── site │ └── zido │ └── elise │ └── support │ └── redis │ └── scheduler │ └── RedisTaskScheduler.java ├── Elise-test-server ├── build.gradle └── src │ ├── main │ ├── java │ │ └── site │ │ │ └── zido │ │ │ └── elise │ │ │ └── test │ │ │ ├── Server.java │ │ │ ├── StaticHandler.java │ │ │ ├── handlers │ │ │ └── OneHandler.java │ │ │ └── utils │ │ │ ├── ResourcesUtils.java │ │ │ └── TemplateUtils.java │ └── resources │ │ └── log4j2.xml │ └── test │ └── java │ └── site │ └── zido │ └── elise │ └── test │ └── TestServer.java ├── LICENSE ├── build.gradle ├── docs ├── CNAME ├── CODE_OF_CONDUCT.md ├── CODE_REQUIREMENTS.md ├── CONTRIBUTING.md ├── README.md ├── ROADMAP.md ├── TUTORIAL.md ├── _config.yml └── elise.png └── settings.gradle /.editorconfig: 
-------------------------------------------------------------------------------- 1 | root = true 2 | [*] 3 | charset = utf-8 4 | indent_style = space 5 | indent_size = 4 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | end_of_line = lf 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/--elise---.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 关于Elise的疑问 3 | about: 提出任何你使用或开发的问题 4 | title: '' 5 | labels: question 6 | assignees: zidoshare 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 缺陷反馈 3 | about: 报告缺陷以帮助我们改进 4 | title: '' 5 | labels: bug 6 | assignees: zidoshare 7 | 8 | --- 9 | 10 | ### 描述问题 11 | 12 | 请尽量清晰精准地描述你碰到的问题。 13 | 14 | ### 重现步骤 15 | 16 | 请描述如何重现这个问题: 17 | 18 | 1. Go to '...' 19 | 2. use '...' 20 | 3. See error 21 | 22 | ### 期待的结果 23 | 24 | 请尽量清晰精准地描述你所期待的结果。 25 | 26 | ### 截屏或录像 27 | 28 | 如果可能,请尽量附加截图或录像来描述你遇到的问题。 29 | 30 | ### 其他信息 31 | 32 | 请提供其他附加信息帮助我们诊断问题。 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 意见或建议 3 | about: 提出你期待的功能特性或者对已有功能的意见或者建议 4 | title: '' 5 | labels: help wanted 6 | assignees: zidoshare 7 | 8 | --- 9 | 10 | ### 你在什么场景下需要该功能? 
11 | 12 | 请尽量清晰精准地描述你碰到的问题。 13 | 14 | ### 描述可能的解决方案 15 | 16 | 请尽量清晰精准地描述你期待我们要做的,描述你想到的实现方案。 17 | 18 | ### 描述你认为的候选方案 19 | 20 | 请尽量清晰精准地描述你能接受的候选解决方案。 21 | 22 | ### 其他信息 23 | 24 | 请提供关于该功能建议的其他附加信息。 25 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 首先,非常感谢您的贡献!:smile: 2 | 3 | 请确保在提交拉取请求之前检查这些复选框,谢谢! 4 | 5 | * [ ] 请确保提交pull request的分支为`master`分支 6 | * [ ] 请确保已经阅读过[关于代码](../CODE_REQUIREMENTS.md)的所有约定 7 | * [ ] 请确保代码已经经过格式化,与已有代码持有一致的代码风格 8 | * [ ] 在创建拉取请求之前使用`rebase`以保持提交历史记录清除 9 | * [ ] 添加一些描述并为您拉取请求引用相关问题。 10 | 11 | 额外清单: 12 | 13 | **if** *bug处理* **:** 14 | 15 | * [ ] 确保为已修复的错误添加至少一个单元测试 16 | 17 | **elif** *新功能添加* **:** 18 | 19 | * [ ] 文档更新 20 | * [ ] 更新/添加演示以演示新功能。 21 | * [ ] 添加功能的单元测试。 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/target/ 2 | 3 | **/.gitignore 4 | # IntelliJ project files 5 | .idea 6 | *.iml 7 | **/out 8 | html 9 | *.ipr 10 | *.iws 11 | 12 | # Eclipse project files 13 | **/.classpath 14 | **/.project 15 | **/.settings/ 16 | **/bin/ 17 | 18 | # gradle 19 | .gradle 20 | **/build 21 | 22 | # MacOS 23 | .DS_Store 24 | /repo/ 25 | gradle/ 26 | gradlew 27 | gradlew.bat -------------------------------------------------------------------------------- /Elise-client/build.gradle: -------------------------------------------------------------------------------- 1 | description = 'Elise client' 2 | dependencies { 3 | implementation project(':Elise-core') 4 | testImplementation project(':Elise-test-server') 5 | } 6 | -------------------------------------------------------------------------------- /Elise-client/src/main/java/site/zido/elise/client/scheduler/SyncTaskScheduler.java: -------------------------------------------------------------------------------- 1 | 
package site.zido.elise.client.scheduler; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.http.Response; 5 | import site.zido.elise.scheduler.AbstractScheduler; 6 | import site.zido.elise.task.Task; 7 | 8 | /** 9 | * 同步任务调度器简单实现 10 | * 11 | * @author zido 12 | */ 13 | public class SyncTaskScheduler extends AbstractScheduler { 14 | private boolean start = true; 15 | 16 | @Override 17 | protected void pushWhenNoDuplicate(Task task, Request request) { 18 | if (!start) { 19 | return; 20 | } 21 | Response response = super.onDownload(task, request); 22 | super.onProcess(task, request, response); 23 | } 24 | 25 | @Override 26 | public void cancel(boolean ifRunning) { 27 | start = false; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Elise-client/src/test/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%t] [%level] %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Elise-core/build.gradle: -------------------------------------------------------------------------------- 1 | description = 'Elise-core' 2 | dependencies { 3 | compile(group: 'com.virjar', name: 'sipsoup', version: '1.6') { 4 | exclude(module: 'logback-core') 5 | exclude(module: 'logback-classic') 6 | exclude(module: 'log4j') 7 | exclude(module: 'slf4j-log4j12') 8 | } 9 | compile group: 'org.apache.httpcomponents', name: 'httpclient', version: '4.5.3' 10 | compile group: 'org.jsoup', name: 'jsoup', version: '1.10.3' 11 | compile group: 'org.slf4j', name: 'slf4j-api', version: '1.7.25' 12 | compile group: 'net.sourceforge.htmlunit', name: 'htmlunit', version: '2.30' 13 | compile group: 'com.fasterxml.jackson.core', name: 'jackson-core', version: '2.9.8' 14 | compile group: 'com.fasterxml.jackson.core', name: 
'jackson-databind', version: '2.9.10.4' 15 | testCompile group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.11.0' 16 | testCompile group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.11.0' 17 | testCompile group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.11.0' 18 | testCompile group: 'junit', name: 'junit', version: '4.12' 19 | } 20 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/E.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise; 2 | 3 | import java.nio.charset.Charset; 4 | 5 | /** 6 | * this class provide some constants 7 | * 8 | * @author zido 9 | */ 10 | public class E { 11 | /** 12 | * The constant UTF_8. 13 | */ 14 | public static final Charset UTF_8 = Charset.forName("utf-8"); 15 | /** 16 | * The constant ASCII. 17 | */ 18 | public static final Charset ASCII = Charset.forName("US-ASCII"); 19 | /** 20 | * The constant ISO_8859_1. 21 | */ 22 | public static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1"); 23 | 24 | /** 25 | * The type Status code. 26 | * 27 | * @author zido 28 | */ 29 | public static class StatusCode { 30 | /** 31 | * The constant CODE_DOWNLOAD_ERROR. 32 | */ 33 | public static final int CODE_DOWNLOAD_ERROR = -1; 34 | /** 35 | * The constant CODE_200. 
36 | */ 37 | public static final int CODE_200 = 200; 38 | } 39 | 40 | public static class Action { 41 | public static final String MATCH_LINK = "match_link"; 42 | public static final String MATCH_NUMBER = "match_number"; 43 | public static final String LINK_SELECTOR = "select_link"; 44 | public static final String SELECT_URL = "select_url"; 45 | public static final String SELECT_ORIGIN = "select_origin"; 46 | public static final String CSS_SELECTOR = "css_selector"; 47 | public static final String XPATH_SELECTOR = "xpath_selector"; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/Operator.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise; 2 | 3 | import site.zido.elise.events.SingleEventListener; 4 | import site.zido.elise.http.Request; 5 | 6 | /** 7 | * The interface Operator. 8 | * 9 | * @author zido 10 | */ 11 | public interface Operator { 12 | /** 13 | * Cancel task of the operator.The Spider will no longer accept any new requests of the task. 14 | * 15 | * @param ifRunning If true,the task will wait until the existing request is completed before ending the crawler. and else,will end all request immediately. 16 | * @return the operator 17 | */ 18 | Operator cancel(boolean ifRunning); 19 | 20 | /** 21 | * Pause task of the operator.And Spider will no long accept any new requests of the task. 22 | * 23 | * @return the boolean 24 | */ 25 | Operator pause(); 26 | 27 | /** 28 | * recover task of the operator.And the spider will re-accept the new request of the task. 29 | * 30 | * @return the operator 31 | */ 32 | Operator recover(); 33 | 34 | /** 35 | * waiting until the task success or cancel 36 | * 37 | * @return this operator 38 | * @throws InterruptedException thread interrupted 39 | */ 40 | Operator block() throws InterruptedException; 41 | 42 | /** 43 | * Add url operator. 
44 | * 45 | * @param url the url 46 | * @return the operator 47 | */ 48 | Operator execute(String... url); 49 | 50 | /** 51 | * Execute operator. 52 | * 53 | * @param request the request 54 | * @return the operator 55 | */ 56 | Operator execute(Request request); 57 | 58 | /** 59 | * Add event listener. 60 | * 61 | * @param listener the listener 62 | * @return the operator 63 | */ 64 | Operator addEventListener(SingleEventListener listener); 65 | } 66 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/Spider.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise; 2 | 3 | import site.zido.elise.custom.Config; 4 | import site.zido.elise.events.EventSupport; 5 | import site.zido.elise.task.api.ResponseHandler; 6 | 7 | /** 8 | * The Spider Interface. 9 | *

10 | * Is the core of the operation of the entire crawler. 11 | * 12 | * @author zido 13 | */ 14 | public interface Spider extends EventSupport { 15 | 16 | /** 17 | * create a new task by response handler api. 18 | * 19 | * @param handler the handler 20 | * @param config the config 21 | * @return the operator 22 | */ 23 | Operator of(ResponseHandler handler, Config config); 24 | 25 | /** 26 | * create a new task by response handler api. 27 | * 28 | * @param handler the handler 29 | * @return the operator 30 | */ 31 | default Operator of(ResponseHandler handler) { 32 | return of(handler, null); 33 | } 34 | 35 | /** 36 | * create a new task by model class 37 | * 38 | * @param modelClass the model class 39 | * @return the operator 40 | */ 41 | default Operator of(Class modelClass) { 42 | return of(modelClass, null); 43 | } 44 | 45 | /** 46 | * create a new task by model class and config 47 | * 48 | * @param modelClass the model class 49 | * @param config the config 50 | * @return the operator 51 | */ 52 | Operator of(Class modelClass, Config config); 53 | 54 | 55 | /** 56 | * Cancel the spider.The Spider will no longer accept any new tasks/requests. 57 | * 58 | * @param ifRunning If true,the Spider will wait until the existing task is completed before ending the crawler. and else,will end all tasks immediately. 59 | */ 60 | void cancel(boolean ifRunning); 61 | 62 | /** 63 | * Pause the spider. 64 | * 65 | * @return the boolean 66 | */ 67 | boolean pause(); 68 | 69 | /** 70 | * Recover the spider. 
71 | */ 72 | void recover(); 73 | } 74 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/custom/Config.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.custom; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.Reader; 6 | import java.util.Map; 7 | 8 | /** 9 | * the config interface 10 | * 11 | * @author zido 12 | */ 13 | public interface Config extends Map { 14 | 15 | /** 16 | * parse from reader. 17 | * 18 | * @param reader the reader 19 | * @throws IOException the io exception 20 | */ 21 | void from(Reader reader) throws IOException; 22 | 23 | /** 24 | * parse from input stream 25 | * 26 | * @param in the in 27 | * @throws IOException the io exception 28 | */ 29 | void from(InputStream in) throws IOException; 30 | 31 | /** 32 | * set the key value pair 33 | * 34 | * @param key the key 35 | * @param value the value 36 | */ 37 | void set(String key, Object value); 38 | 39 | /** 40 | * get the value by key 41 | * 42 | * @param the type parameter 43 | * @param key the key 44 | * @return the t 45 | */ 46 | T get(String key); 47 | } 48 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/custom/ConfigUtils.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.custom; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | /** 7 | * The type Config utils. 8 | * 9 | * @author zido 10 | */ 11 | public class ConfigUtils { 12 | private ConfigUtils() { 13 | } 14 | 15 | /** 16 | * Merge config t. 17 | * 18 | * @param the type parameter 19 | * @param key the key 20 | * @param config the config 21 | * @return the t 22 | */ 23 | public static T mergeConfig(String key, Config... 
config) { 24 | T result = null; 25 | for (Config c : config) { 26 | final T o = c.get(key); 27 | if (o != null) { 28 | result = o; 29 | } 30 | } 31 | return result; 32 | } 33 | 34 | /** 35 | * Merge config config. 36 | * 37 | * @param config the config 38 | * @return the config 39 | */ 40 | public static Config mergeConfig(Config... config) { 41 | Map result = new HashMap<>(); 42 | for (Config c : config) { 43 | if (c != null) { 44 | for (String s : c.keySet()) { 45 | final Object value = c.get(s); 46 | if (value != null) { 47 | result.put(s, value); 48 | } 49 | } 50 | } 51 | } 52 | return new MappedConfig(result); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/custom/GlobalConfig.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.custom; 2 | 3 | import site.zido.elise.http.Header; 4 | import site.zido.elise.proxy.Proxy; 5 | 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.Objects; 10 | 11 | /** 12 | * The type Global config. 13 | * 14 | * @author zido 15 | */ 16 | public class GlobalConfig extends MappedConfig { 17 | /** 18 | * The constant KEY_USER_AGENT. 19 | */ 20 | public static final String KEY_USER_AGENT = "userAgent"; 21 | /** 22 | * The constant KEY_COOKIE. 23 | */ 24 | public static final String KEY_COOKIE = "cookie"; 25 | /** 26 | * The constant KEY_CHARSET. 27 | */ 28 | public static final String KEY_CHARSET = "charset"; 29 | /** 30 | * The constant KEY_SLEEP_TIME. 31 | */ 32 | public static final String KEY_SLEEP_TIME = "sleepTime"; 33 | /** 34 | * The number of times the request was retried when the download failed 35 | */ 36 | public static final String KEY_RETRY_TIMES = "retryTimes"; 37 | /** 38 | * The constant KEY_TIME_OUT. 
39 | */ 40 | public static final String KEY_TIME_OUT = "outTime"; 41 | /** 42 | * The constant KEY_DOWNLOAD_MODE. 43 | */ 44 | public static final String KEY_DOWNLOAD_MODE = "downloadMode"; 45 | /** 46 | * The constant KEY_SUCCESS_CODE. 47 | */ 48 | public static final String KEY_SUCCESS_CODE = "successCode"; 49 | /** 50 | * The constant KEY_DISABLE_COOKIE. 51 | */ 52 | public static final String KEY_DISABLE_COOKIE = "disableCookie"; 53 | /** 54 | * The constant KEY_HEADERS. 55 | */ 56 | public static final String KEY_HEADERS = "headers"; 57 | /** 58 | * The number of retries that were added to the task scheduler when the download failed 59 | */ 60 | public static final String KEY_SCHEDULE_RETRY_TIMES = "scheduleRetryTimes"; 61 | /** 62 | * The constant KEY_POOL_SIZE. 63 | */ 64 | public static final String KEY_POOL_SIZE = "poolSize"; 65 | /** 66 | * The constant KEY_USE_GZIP. 67 | */ 68 | public static final String KEY_USE_GZIP = "useGzip"; 69 | /** 70 | * The constant KEY_PROXY. 71 | */ 72 | public static final String KEY_PROXY = "proxy"; 73 | private static final List

EMPTY_HEADERS = new LinkedList<>(); 74 | private static final long serialVersionUID = -6234664119002484979L; 75 | 76 | /** 77 | * Instantiates a new Global config. 78 | */ 79 | public GlobalConfig() { 80 | } 81 | 82 | /** 83 | * Instantiates a new Global config. 84 | * 85 | * @param config the config 86 | */ 87 | public GlobalConfig(Map config) { 88 | super(config); 89 | } 90 | 91 | /** 92 | * Gets user agent. 93 | * 94 | * @return the user agent 95 | */ 96 | public String getUserAgent() { 97 | return get(KEY_USER_AGENT); 98 | } 99 | 100 | /** 101 | * Gets cookies. 102 | * 103 | * @return the cookies 104 | */ 105 | public Map getCookies() { 106 | return get(KEY_COOKIE); 107 | } 108 | 109 | /** 110 | * Gets charset. 111 | * 112 | * @return the charset 113 | */ 114 | public String getCharset() { 115 | return get(KEY_CHARSET); 116 | } 117 | 118 | /** 119 | * Gets disable cookie. 120 | * 121 | * @return the disable cookie 122 | */ 123 | public boolean getDisableCookie() { 124 | return (boolean) get(KEY_DISABLE_COOKIE); 125 | } 126 | 127 | @Override 128 | public boolean equals(Object o) { 129 | if (!(o instanceof Map)) { 130 | return false; 131 | } 132 | Object otherValue; 133 | for (String s : this.keySet()) { 134 | otherValue = ((Map) o).get(s); 135 | if (!Objects.equals(get(s), otherValue)) { 136 | return false; 137 | } 138 | } 139 | return true; 140 | } 141 | 142 | /** 143 | * Gets headers. 144 | * 145 | * @return the headers 146 | */ 147 | public List
getHeaders() { 148 | return get(KEY_HEADERS); 149 | } 150 | 151 | /** 152 | * Gets use gzip. 153 | * 154 | * @return the use gzip 155 | */ 156 | public boolean getUseGzip() { 157 | return (boolean) get(KEY_USE_GZIP); 158 | } 159 | 160 | /** 161 | * Gets timeout. 162 | * 163 | * @return the timeout 164 | */ 165 | public int getTimeout() { 166 | return (int) get(KEY_TIME_OUT); 167 | } 168 | 169 | /** 170 | * Gets retry times. 171 | * 172 | * @return the retry times 173 | */ 174 | public int getRetryTimes() { 175 | return (int) get(KEY_RETRY_TIMES); 176 | } 177 | 178 | /** 179 | * Gets proxy. 180 | * 181 | * @return the proxy 182 | */ 183 | public Proxy getProxy() { 184 | return get(KEY_PROXY); 185 | } 186 | 187 | } 188 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/custom/HttpClientConfig.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.custom; 2 | 3 | import java.util.Map; 4 | 5 | /** 6 | * The type Http client config. 7 | * 8 | * @author zido 9 | */ 10 | public class HttpClientConfig extends GlobalConfig { 11 | private static final long serialVersionUID = -7610251519485407931L; 12 | 13 | /** 14 | * Instantiates a new Http client config. 15 | */ 16 | public HttpClientConfig() { 17 | } 18 | 19 | /** 20 | * Instantiates a new Http client config. 
21 | * 22 | * @param config the config 23 | */ 24 | public HttpClientConfig(Map config) { 25 | super(config); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/custom/MappedConfig.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.custom; 2 | 3 | import site.zido.elise.utils.Asserts; 4 | 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.io.Reader; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | import java.util.Properties; 11 | 12 | /** 13 | * The type Mapped config. 14 | * 15 | * @author zido 16 | */ 17 | public class MappedConfig extends HashMap implements Config { 18 | private static final long serialVersionUID = 8315976702547630336L; 19 | 20 | /** 21 | * Instantiates a new Mapped config. 22 | */ 23 | public MappedConfig() { 24 | } 25 | 26 | /** 27 | * Instantiates a new Mapped config. 
28 | * 29 | * @param map the map 30 | */ 31 | public MappedConfig(Map map) { 32 | super(map); 33 | } 34 | 35 | @Override 36 | public void from(Reader reader) throws IOException { 37 | Asserts.notNull(reader, "can't read config from a null reader"); 38 | final Properties props = new Properties(); 39 | try { 40 | props.load(reader); 41 | } finally { 42 | reader.close(); 43 | } 44 | for (Entry entry : props.entrySet()) { 45 | this.put((String) entry.getKey(), entry.getValue()); 46 | } 47 | } 48 | 49 | @Override 50 | public void from(InputStream in) throws IOException { 51 | Asserts.notNull(in, "can't read config from a null input stream"); 52 | final Properties props = new Properties(); 53 | try { 54 | props.load(in); 55 | } finally { 56 | in.close(); 57 | } 58 | for (Entry entry : props.entrySet()) { 59 | this.put((String) entry.getKey(), entry.getValue()); 60 | } 61 | } 62 | 63 | @Override 64 | public void set(String key, Object value) { 65 | super.put(key, value); 66 | } 67 | 68 | @Override 69 | @SuppressWarnings("unchecked") 70 | public T get(String key) { 71 | return (T) super.get(key); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/custom/SiteConfig.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.custom; 2 | 3 | /** 4 | * The type Site config. 5 | * 6 | * @author zido 7 | */ 8 | public class SiteConfig extends GlobalConfig { 9 | /** 10 | * The constant KEY_SITE. 11 | */ 12 | public static final String KEY_SITE = "site"; 13 | private static final long serialVersionUID = 7284323147466259820L; 14 | 15 | /** 16 | * Sets site. 
17 | * 18 | * @param site the site 19 | */ 20 | public void setSite(String site) { 21 | put(KEY_SITE, site); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/AbstractDownloaderFactory.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader; 2 | 3 | import site.zido.elise.task.Task; 4 | 5 | import java.util.Map; 6 | import java.util.concurrent.ConcurrentHashMap; 7 | 8 | public abstract class AbstractDownloaderFactory implements DownloaderFactory { 9 | protected Map downloaderContainer = new ConcurrentHashMap<>(); 10 | 11 | @Override 12 | public void release(Task task) { 13 | //release downloader 14 | downloaderContainer.remove(task.getId()); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/DefaultDownloaderFactory.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader; 2 | 3 | import site.zido.elise.custom.Config; 4 | import site.zido.elise.custom.GlobalConfig; 5 | import site.zido.elise.task.Task; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import java.util.Set; 10 | 11 | /** 12 | * The type Default downloader factory. 13 | * 14 | * @author zido 15 | */ 16 | public final class DefaultDownloaderFactory extends AbstractDownloaderFactory { 17 | private Map factoryMap = new HashMap<>(); 18 | 19 | /** 20 | * Instantiates a new Default downloader factory. 
21 | */ 22 | public DefaultDownloaderFactory() { 23 | registerFactory("httpclient", new HttpClientDownloaderFactory()); 24 | } 25 | 26 | @Override 27 | public Downloader create(Task task) { 28 | final Config config = task.getConfig(); 29 | final String key = config.get(GlobalConfig.KEY_DOWNLOAD_MODE); 30 | final DownloaderFactory factory = factoryMap.get(key); 31 | if (key == null) { 32 | throw new RuntimeException("not wrap any factory"); 33 | } 34 | return factory.create(task); 35 | } 36 | 37 | /** 38 | * Register factory default downloader factory. 39 | * 40 | * @param key the key 41 | * @param factory the factory 42 | * @return the default downloader factory 43 | */ 44 | public DefaultDownloaderFactory registerFactory(String key, DownloaderFactory factory) { 45 | factoryMap.put(key, factory); 46 | return this; 47 | } 48 | 49 | /** 50 | * Key set set. 51 | * 52 | * @return the set 53 | */ 54 | public Set keySet() { 55 | return factoryMap.keySet(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/DownloadException.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader; 2 | 3 | public class DownloadException extends Exception { 4 | public DownloadException(String msg) { 5 | super(msg); 6 | } 7 | 8 | public DownloadException(Throwable e) { 9 | super(e); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/Downloader.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.http.Response; 5 | import site.zido.elise.task.Task; 6 | 7 | /** 8 | * downloader interface 9 | * 10 | * @author zido 11 | */ 12 | public interface Downloader { 13 | /** 14 | * download 
by task and request 15 | * 16 | * @param task the task 17 | * @param request the request 18 | * @return the response 19 | */ 20 | Response download(Task task, Request request); 21 | 22 | } 23 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/DownloaderFactory.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader; 2 | 3 | import site.zido.elise.task.Task; 4 | 5 | /** 6 | * The interface Downloader factory. 7 | * 8 | * @author zido 9 | */ 10 | public interface DownloaderFactory { 11 | /** 12 | * Create downloader. 13 | * 14 | * @param task the task 15 | * @return the downloader 16 | */ 17 | Downloader create(Task task); 18 | 19 | void release(Task task); 20 | } 21 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/HtmlUnitDownloader.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader; 2 | 3 | import com.gargoylesoftware.htmlunit.BrowserVersion; 4 | import com.gargoylesoftware.htmlunit.WebClient; 5 | import com.gargoylesoftware.htmlunit.WebRequest; 6 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 7 | import org.apache.http.auth.AuthScope; 8 | import org.apache.http.auth.UsernamePasswordCredentials; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import site.zido.elise.http.Request; 12 | import site.zido.elise.http.Response; 13 | import site.zido.elise.http.impl.DefaultBody; 14 | import site.zido.elise.http.impl.DefaultResponse; 15 | import site.zido.elise.proxy.Proxy; 16 | import site.zido.elise.proxy.ProxyProvider; 17 | import site.zido.elise.task.Task; 18 | import site.zido.elise.utils.ValidateUtils; 19 | 20 | import java.io.IOException; 21 | import java.net.MalformedURLException; 22 | import java.net.URL; 23 | 24 | /** 25 | 
* The Html unit downloader. 26 | * 27 | * @author zido 28 | */ 29 | public class HtmlUnitDownloader implements Downloader { 30 | private static final Logger logger = LoggerFactory.getLogger(HtmlUnitDownloader.class); 31 | private ProxyProvider proxyProvider; 32 | 33 | @Override 34 | public Response download(Task task, Request request) { 35 | WebClient webClient = null; 36 | Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; 37 | DefaultResponse response = DefaultResponse.fail(request.getUrl()); 38 | try { 39 | if (proxy != null) { 40 | webClient = new WebClient(BrowserVersion.CHROME, proxy.getHost(), proxy.getPort()); 41 | if (!ValidateUtils.isEmpty(proxy.getUsername()) && !ValidateUtils.isEmpty(proxy.getPassword())) { 42 | webClient.getCredentialsProvider().setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); 43 | } 44 | } else { 45 | webClient = new WebClient(BrowserVersion.CHROME); 46 | } 47 | String urlStr = request.getUrl(); 48 | URL url = new URL(urlStr); 49 | 50 | WebRequest webRequest = new WebRequest(url); 51 | HtmlPage htmlPage = webClient.getPage(webRequest); 52 | int statusCode = htmlPage.getWebResponse().getStatusCode(); 53 | response = new DefaultResponse(); 54 | response.setStatusCode(statusCode); 55 | response.setUrl(request.getUrl()); 56 | //TODO set body 57 | response.setBody(new DefaultBody()); 58 | response.setDownloadSuccess(true); 59 | } catch (MalformedURLException e) { 60 | logger.error(String.format("url is invalid [%s]", request.getUrl()), e); 61 | } catch (IOException e) { 62 | logger.error(String.format("download response fail [%s]", request.getUrl()), e); 63 | } finally { 64 | if (webClient != null) { 65 | webClient.close(); 66 | } 67 | } 68 | return response; 69 | } 70 | 71 | /** 72 | * Sets proxy provider. 
73 | * 74 | * @param proxyProvider the proxy provider 75 | * @return the proxy provider 76 | */ 77 | public HtmlUnitDownloader setProxyProvider(ProxyProvider proxyProvider) { 78 | this.proxyProvider = proxyProvider; 79 | return this; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/HtmlUnitDownloaderFacotory.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader; 2 | 3 | import site.zido.elise.task.Task; 4 | 5 | /** 6 | * The type Html unit downloader facotory. 7 | * 8 | * @author zido 9 | */ 10 | public class HtmlUnitDownloaderFacotory extends AbstractDownloaderFactory { 11 | @Override 12 | public Downloader create(Task task) { 13 | return null; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/ProxiableDownloader.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader; 2 | 3 | import site.zido.elise.proxy.ProxyProvider; 4 | 5 | /** 6 | * The interface Proxiable downloader. 7 | * 8 | * @author zido 9 | */ 10 | public interface ProxiableDownloader extends Downloader { 11 | /** 12 | * Sets proxy provider. 
13 | * 14 | * @param provider the provider 15 | */ 16 | void setProxyProvider(ProxyProvider provider); 17 | } 18 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/httpclient/CustomRedirectStrategy.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader.httpclient; 2 | 3 | import org.apache.http.HttpRequest; 4 | import org.apache.http.HttpResponse; 5 | import org.apache.http.ProtocolException; 6 | import org.apache.http.client.methods.HttpGet; 7 | import org.apache.http.client.methods.HttpPost; 8 | import org.apache.http.client.methods.HttpRequestWrapper; 9 | import org.apache.http.client.methods.HttpUriRequest; 10 | import org.apache.http.impl.client.LaxRedirectStrategy; 11 | import org.apache.http.protocol.HttpContext; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | import site.zido.elise.http.Http; 15 | 16 | import java.net.URI; 17 | 18 | /** 19 | * 支持post 302跳转策略实现类 20 | * HttpClient默认跳转:httpClientBuilder.setRedirectStrategy(new LaxRedirectStrategy()); 21 | * 上述代码在post/redirect/post这种情况下不会传递原有请求的数据信息。 22 | * 23 | * @author zido 24 | */ 25 | public class CustomRedirectStrategy extends LaxRedirectStrategy { 26 | private final Logger logger = LoggerFactory.getLogger(getClass()); 27 | 28 | @Override 29 | public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException { 30 | URI uri = getLocationURI(request, response, context); 31 | String method = request.getRequestLine().getMethod(); 32 | if (Http.Method.POST.equalsIgnoreCase(method)) { 33 | try { 34 | HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request; 35 | httpRequestWrapper.setURI(uri); 36 | httpRequestWrapper.removeHeaders("Content-Length"); 37 | return httpRequestWrapper; 38 | } catch (Exception e) { 39 | logger.debug("wrap error", e); 40 | } 41 | return 
new HttpPost(uri); 42 | } else { 43 | return new HttpGet(uri); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/downloader/httpclient/HttpClientHeaderWrapper.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.downloader.httpclient; 2 | 3 | import org.apache.http.Header; 4 | import org.apache.http.message.BasicHeader; 5 | 6 | /** 7 | * wrap site.zido.elise.Header to http client's header 8 | * 9 | * @author zido 10 | */ 11 | public class HttpClientHeaderWrapper extends BasicHeader implements Header { 12 | private static final long serialVersionUID = -8918531998903473871L; 13 | 14 | /** 15 | * Instantiates a new Http client header wrapper. 16 | * 17 | * @param header the header 18 | */ 19 | public HttpClientHeaderWrapper(site.zido.elise.http.Header header) { 20 | super(header.getName(), header.getValue()); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/events/EventListener.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.events; 2 | 3 | import site.zido.elise.processor.ProcessorEventListener; 4 | 5 | /** 6 | * The interface Event listener. 7 | * 8 | * @author zido 9 | */ 10 | public interface EventListener extends ProcessorEventListener, java.util.EventListener { 11 | /** 12 | * On pause. 13 | */ 14 | default void onPause() { 15 | } 16 | 17 | /** 18 | * On cancel. 19 | */ 20 | default void onCancel() { 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/events/EventSupport.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.events; 2 | 3 | /** 4 | * The interface Event support. 
5 | * 6 | * @author zido 7 | */ 8 | public interface EventSupport { 9 | /** 10 | * Add event listener. 11 | * 12 | * @param listener the listener 13 | */ 14 | void addEventListener(EventListener listener); 15 | 16 | /** 17 | * Remove event listener. 18 | * 19 | * @param listener the listener 20 | */ 21 | void removeEventListener(EventListener listener); 22 | } 23 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/events/SingleEventListener.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.events; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.http.Response; 5 | 6 | /** 7 | * The interface Single event listener. 8 | * 9 | * @author zido 10 | */ 11 | public interface SingleEventListener extends SingleProcessorEventListener { 12 | /** 13 | * On download success. 14 | * 15 | * @param request the request 16 | * @param response the response 17 | */ 18 | default void onDownloadSuccess(Request request, Response response) { 19 | } 20 | 21 | /** 22 | * On download error. 23 | * 24 | * @param request the request 25 | * @param response the response 26 | */ 27 | default void onDownloadError(Request request, Response response) { 28 | } 29 | 30 | /** 31 | * On success. 32 | */ 33 | default void onSuccess() { 34 | } 35 | 36 | /** 37 | * On recover. 38 | */ 39 | default void onRecover() { 40 | } 41 | 42 | /** 43 | * On pause. 44 | */ 45 | default void onPause() { 46 | } 47 | 48 | /** 49 | * On cancel. 
50 | */ 51 | default void onCancel() { 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/events/SingleListenerContainer.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.events; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.http.Response; 5 | import site.zido.elise.processor.ResultItem; 6 | import site.zido.elise.task.Task; 7 | import site.zido.elise.utils.EventUtils; 8 | 9 | import java.util.HashSet; 10 | import java.util.Set; 11 | 12 | /** 13 | * The type Single listener container. 14 | * 15 | * @author zido 16 | */ 17 | public final class SingleListenerContainer implements TaskEventListener { 18 | private Set listeners = new HashSet<>(); 19 | private long taskId; 20 | private RecyclingCallback callback; 21 | 22 | /** 23 | * Instantiates a new Single listener container. 24 | * 25 | * @param taskId the task id 26 | */ 27 | public SingleListenerContainer(long taskId) { 28 | this.taskId = taskId; 29 | } 30 | 31 | /** 32 | * Sets callback. 33 | * 34 | * @param callback the callback 35 | */ 36 | public void setCallback(RecyclingCallback callback) { 37 | this.callback = callback; 38 | } 39 | 40 | /** 41 | * Add listener. 
42 | * 43 | * @param listener the listener 44 | */ 45 | public void addListener(SingleEventListener listener) { 46 | listeners.add(listener); 47 | } 48 | 49 | @Override 50 | public void onDownloadSuccess(Task task, Request request, Response response) { 51 | if (this.taskId == task.getId()) { 52 | EventUtils.notifyListeners(listeners, listener -> listener.onDownloadSuccess(request, response)); 53 | } 54 | } 55 | 56 | @Override 57 | public void onDownloadError(Task task, Request request, Response response) { 58 | if (this.taskId == task.getId()) { 59 | EventUtils.notifyListeners(listeners, listener -> listener.onDownloadSuccess(request, response)); 60 | } 61 | } 62 | 63 | @Override 64 | public void onSuccess(Task task) { 65 | if (this.taskId == task.getId()) { 66 | EventUtils.notifyListeners(listeners, SingleEventListener::onSuccess); 67 | } 68 | callback.onRecycling(); 69 | } 70 | 71 | @Override 72 | public void onPause(Task task) { 73 | if (this.taskId == task.getId()) { 74 | EventUtils.notifyListeners(listeners, SingleEventListener::onPause); 75 | } 76 | } 77 | 78 | @Override 79 | public void onRecover(Task task) { 80 | if (this.taskId == task.getId()) { 81 | EventUtils.notifyListeners(listeners, SingleEventListener::onRecover); 82 | } 83 | } 84 | 85 | @Override 86 | public void onCancel(Task task) { 87 | if (this.taskId == task.getId()) { 88 | EventUtils.notifyListeners(listeners, SingleEventListener::onCancel); 89 | } 90 | callback.onRecycling(); 91 | } 92 | 93 | @Override 94 | public void onSaveSuccess(Task task, ResultItem resultItem) { 95 | if (this.taskId == task.getId()) { 96 | EventUtils.notifyListeners(listeners, listener -> listener.onSaveSuccess(resultItem)); 97 | } 98 | } 99 | 100 | /** 101 | * The interface Recycling callback. 102 | * 103 | * @author zido 104 | */ 105 | @FunctionalInterface 106 | public interface RecyclingCallback { 107 | /** 108 | * On recycling. 
109 | */ 110 | void onRecycling(); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/events/SingleProcessorEventListener.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.events; 2 | 3 | import site.zido.elise.processor.ResultItem; 4 | 5 | import java.util.EventListener; 6 | 7 | /** 8 | * The interface Single processor event listener. 9 | * 10 | * @author zido 11 | */ 12 | public interface SingleProcessorEventListener extends EventListener { 13 | /** 14 | * On save success. 15 | * 16 | * @param resultItems the result items 17 | */ 18 | default void onSaveSuccess(ResultItem resultItems) { 19 | } 20 | 21 | /** 22 | * On save error. 23 | * 24 | * @param resultItems the result items 25 | */ 26 | default void onSaveError(ResultItem resultItems) { 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/events/TaskEventListener.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.events; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.http.Response; 5 | import site.zido.elise.task.Task; 6 | 7 | /** 8 | * The interface Task event listener. 9 | * 10 | * @author zido 11 | */ 12 | public interface TaskEventListener extends EventListener { 13 | /** 14 | * On download success. 15 | * 16 | * @param task the task 17 | * @param request the request 18 | * @param response the response 19 | */ 20 | default void onDownloadSuccess(Task task, Request request, Response response) { 21 | } 22 | 23 | /** 24 | * On download error. 25 | * 26 | * @param task the task 27 | * @param request the request 28 | * @param response the response 29 | */ 30 | default void onDownloadError(Task task, Request request, Response response) { 31 | } 32 | 33 | /** 34 | * On success. 
35 | * 36 | * @param task the task 37 | */ 38 | default void onSuccess(Task task) { 39 | } 40 | 41 | /** 42 | * On pause. 43 | * 44 | * @param task the task 45 | */ 46 | default void onPause(Task task) { 47 | } 48 | 49 | /** 50 | * On recover. 51 | * 52 | * @param task the task 53 | */ 54 | default void onRecover(Task task) { 55 | } 56 | 57 | /** 58 | * On cancel. 59 | * 60 | * @param task the task 61 | */ 62 | default void onCancel(Task task) { 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/Body.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http; 2 | 3 | import java.io.Serializable; 4 | import java.nio.charset.Charset; 5 | 6 | /** 7 | * http request 8 | * 9 | * @author zido 10 | */ 11 | public interface Body extends Serializable { 12 | /** 13 | * Get bytes byte [ ]. 14 | * 15 | * @return the byte [ ] 16 | */ 17 | byte[] getBytes(); 18 | 19 | /** 20 | * Gets content type. 21 | * 22 | * @return the content type 23 | */ 24 | Http.ContentType contentType(); 25 | 26 | 27 | /** 28 | * Gets encoding. 29 | * 30 | * @return the encoding 31 | */ 32 | Charset getEncoding(); 33 | 34 | 35 | } 36 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/Cookie.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | * The interface Cookie. 7 | * 8 | * @author zido 9 | */ 10 | public interface Cookie extends Header { 11 | /** 12 | * Gets path. 13 | * 14 | * @return the path 15 | */ 16 | String getPath(); 17 | 18 | /** 19 | * Gets expiry date. 20 | * 21 | * @return the expiry date 22 | */ 23 | Date getExpiryDate(); 24 | 25 | /** 26 | * Is secure boolean. 
27 | * 28 | * @return the boolean 29 | */ 30 | boolean isSecure(); 31 | 32 | /** 33 | * Gets domain. 34 | * 35 | * @return the domain 36 | */ 37 | String getDomain(); 38 | } 39 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/Header.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * The interface Header. 7 | * 8 | * @author zido 9 | */ 10 | public interface Header extends Serializable { 11 | /** 12 | * Gets name. 13 | * 14 | * @return the name 15 | */ 16 | String getName(); 17 | 18 | /** 19 | * Gets value. 20 | * 21 | * @return the value 22 | */ 23 | String getValue(); 24 | } 25 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/HttpModel.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | /** 7 | * The interface Http model. 8 | * 9 | * @author zido 10 | */ 11 | public interface HttpModel extends Serializable { 12 | /** 13 | * Gets headers. 14 | * 15 | * @param key the key 16 | * @return the headers 17 | */ 18 | List
getHeaders(String key); 19 | 20 | /** 21 | * Gets all headers. 22 | * 23 | * @return the all headers 24 | */ 25 | List
getAllHeaders(); 26 | 27 | } 28 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/Pair.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http; 2 | 3 | /** 4 | * The type Pair. 5 | * 6 | * @author zido 7 | */ 8 | public class Pair { 9 | private String name; 10 | private String value; 11 | 12 | /** 13 | * Instantiates a new Pair. 14 | */ 15 | public Pair() { 16 | 17 | } 18 | 19 | /** 20 | * Instantiates a new Pair. 21 | * 22 | * @param name the name 23 | * @param value the value 24 | */ 25 | public Pair(String name, String value) { 26 | this.name = name; 27 | this.value = value; 28 | } 29 | 30 | /** 31 | * Gets name. 32 | * 33 | * @return the name 34 | */ 35 | public String getName() { 36 | return name; 37 | } 38 | 39 | /** 40 | * Sets name. 41 | * 42 | * @param name the name 43 | * @return the name 44 | */ 45 | public Pair setName(String name) { 46 | this.name = name; 47 | return this; 48 | } 49 | 50 | /** 51 | * Gets value. 52 | * 53 | * @return the value 54 | */ 55 | public String getValue() { 56 | return value; 57 | } 58 | 59 | /** 60 | * Sets value. 61 | * 62 | * @param value the value 63 | * @return the value 64 | */ 65 | public Pair setValue(String value) { 66 | this.value = value; 67 | return this; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/Request.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * The interface Request. 7 | * 8 | * @author zido 9 | */ 10 | public interface Request extends HttpModel { 11 | /** 12 | * The constant CYCLE_TRIED_TIMES. 13 | */ 14 | String CYCLE_TRIED_TIMES = "_cycle_tried_times"; 15 | 16 | /** 17 | * Gets extra. 
18 | * 19 | * @param key the key 20 | * @return the extra 21 | */ 22 | Object getExtra(String key); 23 | 24 | /** 25 | * Gets method. 26 | * 27 | * @return the method 28 | */ 29 | String getMethod(); 30 | 31 | /** 32 | * Gets url. 33 | * 34 | * @return the url 35 | */ 36 | String getUrl(); 37 | 38 | /** 39 | * Gets body. 40 | * 41 | * @return the body 42 | */ 43 | Body getBody(); 44 | 45 | /** 46 | * Gets headers. 47 | * 48 | * @return the headers 49 | */ 50 | List
getHeaders(); 51 | 52 | /** 53 | * Gets cookies. 54 | * 55 | * @return the cookies 56 | */ 57 | List getCookies(); 58 | 59 | /** 60 | * Put extra. 61 | * 62 | * @param key the key 63 | * @param value the value 64 | */ 65 | void putExtra(String key, Object value); 66 | } 67 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/Response.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * The interface Response. 7 | * 8 | * @author zido 9 | */ 10 | public interface Response extends HttpModel { 11 | boolean isDownloadSuccess(); 12 | 13 | /** 14 | * Gets status code. 15 | * 16 | * @return the status code 17 | */ 18 | int getStatusCode(); 19 | 20 | /** 21 | * Gets reason phrase. 22 | * 23 | * @return the reason phrase 24 | */ 25 | String getReasonPhrase(); 26 | 27 | /** 28 | * Gets url. 29 | * 30 | * @return the url 31 | */ 32 | String getUrl(); 33 | 34 | /** 35 | * get body from response 36 | * 37 | * @return body body 38 | */ 39 | Body getBody(); 40 | 41 | /** 42 | * Gets cookies. 43 | * 44 | * @return the cookies 45 | */ 46 | List getCookies(); 47 | 48 | } 49 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/impl/DefaultBody.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http.impl; 2 | 3 | import site.zido.elise.http.Body; 4 | import site.zido.elise.http.Http; 5 | 6 | import java.nio.charset.Charset; 7 | 8 | /** 9 | * The type Default request body. 
10 | * 11 | * @author zido 12 | */ 13 | public class DefaultBody implements Body { 14 | private static final long serialVersionUID = 2018040215121L; 15 | private byte[] bytes; 16 | private Http.ContentType contentType; 17 | private Charset encoding; 18 | 19 | /** 20 | * Instantiates a new Http request bytes. 21 | */ 22 | public DefaultBody() { 23 | } 24 | 25 | /** 26 | * Get bytes byte [ ]. 27 | * 28 | * @return the byte [ ] 29 | */ 30 | @Override 31 | public byte[] getBytes() { 32 | return bytes; 33 | } 34 | 35 | /** 36 | * Sets bytes. 37 | * 38 | * @param bytes the bytes 39 | */ 40 | public void setBytes(byte[] bytes) { 41 | this.bytes = bytes; 42 | } 43 | 44 | /** 45 | * Gets content type. 46 | * 47 | * @return the content type 48 | */ 49 | @Override 50 | public Http.ContentType contentType() { 51 | return contentType; 52 | } 53 | 54 | /** 55 | * Sets content type. 56 | * 57 | * @param contentType the content type 58 | */ 59 | public void setContentType(Http.ContentType contentType) { 60 | this.contentType = contentType; 61 | } 62 | 63 | /** 64 | * Gets encoding. 65 | * 66 | * @return the encoding 67 | */ 68 | @Override 69 | public Charset getEncoding() { 70 | return encoding; 71 | } 72 | 73 | /** 74 | * Sets encoding. 75 | * 76 | * @param encoding the encoding 77 | */ 78 | public void setEncoding(Charset encoding) { 79 | this.encoding = encoding; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/impl/DefaultCookie.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http.impl; 2 | 3 | import site.zido.elise.http.Cookie; 4 | 5 | import java.util.Date; 6 | 7 | /** 8 | * The type Default cookie. 
9 | * 10 | * @author zido 11 | */ 12 | public class DefaultCookie extends DefaultHeader implements Cookie { 13 | private static final long serialVersionUID = -492077641315574434L; 14 | private String domain; 15 | private String path; 16 | private Date expiryDate; 17 | private boolean secure; 18 | 19 | /** 20 | * Instantiates a new Default cookie. 21 | * 22 | * @param name the name 23 | * @param value the value 24 | */ 25 | public DefaultCookie(String name, String value) { 26 | super(name, value); 27 | } 28 | 29 | /** 30 | * Instantiates a new Default cookie. 31 | * 32 | * @param name the name 33 | * @param value the value 34 | * @param path the path 35 | */ 36 | public DefaultCookie(String name, String value, String path) { 37 | super(name, value); 38 | this.path = path; 39 | } 40 | 41 | /** 42 | * Instantiates a new Default cookie. 43 | * 44 | * @param name the name 45 | * @param value the value 46 | * @param domain the domain 47 | * @param path the path 48 | * @param expiryDate the expiry date 49 | * @param secure the secure 50 | */ 51 | public DefaultCookie(String name, String value, String domain, String path, Date expiryDate, boolean secure) { 52 | super(name, value); 53 | this.domain = domain; 54 | this.path = path; 55 | this.expiryDate = expiryDate; 56 | this.secure = secure; 57 | } 58 | 59 | @Override 60 | public String getPath() { 61 | return path; 62 | } 63 | 64 | @Override 65 | public Date getExpiryDate() { 66 | return expiryDate; 67 | } 68 | 69 | @Override 70 | public boolean isSecure() { 71 | return secure; 72 | } 73 | 74 | @Override 75 | public String getDomain() { 76 | return domain; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/impl/DefaultHeader.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http.impl; 2 | 3 | import site.zido.elise.http.Header; 4 | 5 | /** 6 | * The type Default 
header. 7 | * 8 | * @author zido 9 | */ 10 | public class DefaultHeader implements Header { 11 | private static final long serialVersionUID = -8520992620053571143L; 12 | private String name; 13 | private String value; 14 | 15 | /** 16 | * Instantiates a new Default header. 17 | * 18 | * @param name the name 19 | * @param value the value 20 | */ 21 | public DefaultHeader(String name, String value) { 22 | this.name = name; 23 | this.value = value; 24 | } 25 | 26 | @Override 27 | public String getName() { 28 | return this.name; 29 | } 30 | 31 | /** 32 | * Sets name. 33 | * 34 | * @param name the name 35 | */ 36 | public void setName(String name) { 37 | this.name = name; 38 | } 39 | 40 | @Override 41 | public String getValue() { 42 | return this.value; 43 | } 44 | 45 | /** 46 | * Sets value. 47 | * 48 | * @param value the value 49 | */ 50 | public void setValue(String value) { 51 | this.value = value; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/impl/DefaultResponse.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http.impl; 2 | 3 | import site.zido.elise.E; 4 | import site.zido.elise.http.Body; 5 | import site.zido.elise.http.Cookie; 6 | import site.zido.elise.http.Header; 7 | import site.zido.elise.http.Response; 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import java.util.Objects; 12 | import java.util.stream.Collectors; 13 | 14 | /** 15 | * 下载的响应对象 16 | * 17 | * @author zido 18 | */ 19 | public class DefaultResponse implements Response { 20 | private static final long serialVersionUID = 8652625484193923483L; 21 | private String url; 22 | private int statusCode = E.StatusCode.CODE_200; 23 | private String reasonPhrase; 24 | 25 | private Body body; 26 | 27 | private boolean downloadSuccess = true; 28 | 29 | private List
headers; 30 | 31 | private List cookies; 32 | 33 | /** 34 | * Instantiates a new Default response. 35 | */ 36 | public DefaultResponse() { 37 | this.headers = new ArrayList<>(); 38 | } 39 | 40 | public DefaultResponse(Response response) { 41 | this.url = response.getUrl(); 42 | this.statusCode = response.getStatusCode(); 43 | this.reasonPhrase = response.getReasonPhrase(); 44 | this.body = response.getBody(); 45 | this.headers = response.getAllHeaders(); 46 | this.cookies = response.getCookies(); 47 | } 48 | 49 | /** 50 | * Fail default response. 51 | * 52 | * @return the default response 53 | */ 54 | public static DefaultResponse fail(String url) { 55 | DefaultResponse response = new DefaultResponse(); 56 | response.setUrl(url); 57 | response.setDownloadSuccess(false); 58 | response.setStatusCode(E.StatusCode.CODE_DOWNLOAD_ERROR); 59 | return response; 60 | } 61 | 62 | /** 63 | * get url of current page 64 | * 65 | * @return url of current page 66 | */ 67 | @Override 68 | public String getUrl() { 69 | return url; 70 | } 71 | 72 | /** 73 | * Sets url. 74 | * 75 | * @param url the url 76 | * @return the url 77 | */ 78 | public DefaultResponse setUrl(String url) { 79 | this.url = url; 80 | return this; 81 | } 82 | 83 | @Override 84 | public int getStatusCode() { 85 | return statusCode; 86 | } 87 | 88 | /** 89 | * Sets status code. 90 | * 91 | * @param statusCode the status code 92 | */ 93 | public void setStatusCode(int statusCode) { 94 | this.statusCode = statusCode; 95 | } 96 | 97 | @Override 98 | public String getReasonPhrase() { 99 | return reasonPhrase; 100 | } 101 | 102 | /** 103 | * Sets reason phrase. 104 | * 105 | * @param reasonPhrase the reason phrase 106 | */ 107 | public void setReasonPhrase(String reasonPhrase) { 108 | this.reasonPhrase = reasonPhrase; 109 | } 110 | 111 | /** 112 | * Sets headers. 113 | * 114 | * @param headers the headers 115 | */ 116 | public void setHeaders(List
headers) { 117 | this.headers = headers; 118 | } 119 | 120 | /** 121 | * Is download success boolean. 122 | * 123 | * @return the boolean 124 | */ 125 | @Override 126 | public boolean isDownloadSuccess() { 127 | return downloadSuccess; 128 | } 129 | 130 | /** 131 | * Sets download success. 132 | * 133 | * @param downloadSuccess the download success 134 | */ 135 | public void setDownloadSuccess(boolean downloadSuccess) { 136 | this.downloadSuccess = downloadSuccess; 137 | } 138 | 139 | /** 140 | * Gets body. 141 | * 142 | * @return the body 143 | */ 144 | @Override 145 | public Body getBody() { 146 | return body; 147 | } 148 | 149 | /** 150 | * Sets body. 151 | * 152 | * @param body the body 153 | */ 154 | public void setBody(Body body) { 155 | this.body = body; 156 | } 157 | 158 | @Override 159 | public List getCookies() { 160 | return this.cookies; 161 | } 162 | 163 | /** 164 | * Sets cookies. 165 | * 166 | * @param cookies the cookies 167 | */ 168 | public void setCookies(List cookies) { 169 | this.cookies = cookies; 170 | } 171 | 172 | @Override 173 | public List
getHeaders(String key) { 174 | return headers.stream().filter(header -> Objects.equals(header.getName(), key)).collect(Collectors.toList()); 175 | } 176 | 177 | @Override 178 | public List
getAllHeaders() { 179 | return headers; 180 | } 181 | 182 | /** 183 | * Sets header. 184 | * 185 | * @param header the header 186 | */ 187 | public void setHeader(Header header) { 188 | headers.add(header); 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/http/impl/HttpClientBodyWrapper.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.http.impl; 2 | 3 | import org.apache.http.Header; 4 | import org.apache.http.HttpEntity; 5 | import org.apache.http.entity.HttpEntityWrapper; 6 | import org.apache.http.util.EntityUtils; 7 | import site.zido.elise.http.Body; 8 | import site.zido.elise.http.Http; 9 | 10 | import java.io.IOException; 11 | import java.nio.charset.Charset; 12 | 13 | /** 14 | * wrapper httpclient body to elise body 15 | * 16 | * @author zido 17 | */ 18 | public class HttpClientBodyWrapper implements Body { 19 | private static final byte[] EMPTY_BYTES = new byte[0]; 20 | private static final long serialVersionUID = -7276549416780682765L; 21 | private HttpEntityWrapper entity; 22 | private Http.ContentType contentType; 23 | private byte[] bytes; 24 | 25 | /** 26 | * Instantiates a new Http client body wrapper. 
27 | * 28 | * @param entity the entity 29 | */ 30 | public HttpClientBodyWrapper(HttpEntity entity) { 31 | this.entity = new HttpEntityWrapper(entity); 32 | this.contentType = Http.ContentType.parse(entity.getContentType().getValue()); 33 | try { 34 | bytes = EntityUtils.toByteArray(entity); 35 | } catch (IOException e) { 36 | bytes = EMPTY_BYTES; 37 | } 38 | } 39 | 40 | @Override 41 | public byte[] getBytes() { 42 | return bytes; 43 | } 44 | 45 | @Override 46 | public Http.ContentType contentType() { 47 | return contentType; 48 | } 49 | 50 | @Override 51 | public Charset getEncoding() { 52 | Header encoding = entity.getContentEncoding(); 53 | if (encoding != null) { 54 | return Charset.forName(encoding.getValue()); 55 | } 56 | if (contentType != null && contentType.getCharset() != null && Charset.isSupported(contentType.getCharset())) { 57 | return Charset.forName(contentType.getCharset()); 58 | } 59 | return null; 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/BlankSaver.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | import site.zido.elise.task.Task; 4 | 5 | import java.io.OutputStream; 6 | import java.io.PrintStream; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | /** 11 | * The type Blank saver. 
12 | * 13 | * @author zido 14 | */ 15 | public class BlankSaver implements Saver { 16 | private PrintStream stream; 17 | 18 | public BlankSaver() { 19 | stream = new PrintStream(new OutputStream() { 20 | @Override 21 | public void write(int b) { 22 | //do nothing 23 | } 24 | }); 25 | } 26 | 27 | public BlankSaver(PrintStream stream) { 28 | this.stream = stream; 29 | } 30 | 31 | @Override 32 | public void save(ResultItem resultItem, Task task) { 33 | Map> all = resultItem.getAll(); 34 | for (Map.Entry> entry : all.entrySet()) { 35 | if (entry.getValue().size() == 1) { 36 | stream.println(entry.getKey() + ":\t" + entry.getValue().get(0)); 37 | } else { 38 | stream.println(entry.getKey() + ":\t" + entry.getValue()); 39 | } 40 | //LOGGER.debug(entry.getKey() + ":\t" + entry.getValue()); 41 | } 42 | } 43 | 44 | @Override 45 | public ResultItem next(Task task, ResultItem item) { 46 | return null; 47 | } 48 | 49 | @Override 50 | public boolean hasNext(Task task, ResultItem item) { 51 | return false; 52 | } 53 | 54 | @Override 55 | public ResultItem first(Task task) { 56 | return null; 57 | } 58 | 59 | @Override 60 | public int size(Task task) { 61 | return 0; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/ListenableResponseProcessor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | /** 4 | * The interface Listenable response handler. 5 | * 6 | * @author zido 7 | */ 8 | public interface ListenableResponseProcessor extends ResponseProcessor { 9 | /** 10 | * Add event listener. 11 | * 12 | * @param listener the listener 13 | */ 14 | void addEventListener(ProcessorEventListener listener); 15 | 16 | /** 17 | * Remove event listener. 
18 | * 19 | * @param listener the listener 20 | */ 21 | void removeEventListener(ProcessorEventListener listener); 22 | } 23 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/MemorySaver.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | import site.zido.elise.task.Task; 4 | import site.zido.elise.utils.ValidateUtils; 5 | 6 | import java.io.PrintStream; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.concurrent.ConcurrentHashMap; 11 | 12 | /** 13 | * Saver that keeps every saved result item in memory, grouped by task id, in addition to the output inherited from BlankSaver. 14 | * 15 | * @author zido 16 | */ 17 | public class MemorySaver extends BlankSaver implements Saver { 18 | private Map> cup = new ConcurrentHashMap<>(); 19 | 20 | public MemorySaver() { 21 | 22 | } 23 | 24 | public MemorySaver(PrintStream stream) { 25 | super(stream); 26 | } 27 | 28 | /** 29 | * Gets the backing map of saved result items. 
30 | * 31 | * @return the cup 32 | */ 33 | public Map> getCup() { 34 | return cup; 35 | } 36 | 37 | @Override 38 | public void save(ResultItem resultItem, Task task) { 39 | List resultItems = cup.computeIfAbsent(task.getId(), k -> new ArrayList<>()); 40 | resultItems.add(resultItem); 41 | super.save(resultItem, task); 42 | } 43 | 44 | @Override 45 | public ResultItem next(Task task, ResultItem item) { 46 | List resultItems = cup.get(task.getId()); 47 | if (ValidateUtils.isEmpty(resultItems)) { 48 | return null; 49 | } 50 | if (item == null) { 51 | return resultItems.get(0); 52 | } 53 | int i = resultItems.indexOf(item); 54 | if (i >= 0 && resultItems.size() - 1 > i) { 55 | return resultItems.get(i + 1); 56 | } 57 | return null; 58 | } 59 | 60 | @Override 61 | public boolean hasNext(Task task, ResultItem item) { 62 | return next(task, item) != null; 63 | } 64 | 65 | @Override 66 | public ResultItem first(Task task) { 67 | return next(task, null); 68 | } 69 | 70 | @Override 71 | public int size(Task task) { 72 | return cup.computeIfAbsent(task.getId(), k -> new ArrayList<>()).size(); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/ProcessorEventListener.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | import site.zido.elise.task.Task; 4 | 5 | import java.util.EventListener; 6 | 7 | /** 8 | * The interface Processor event listener. 9 | * 10 | * @author zido 11 | */ 12 | public interface ProcessorEventListener extends EventListener { 13 | /** 14 | * On save success. 15 | * 16 | * @param task the task 17 | * @param resultItem the result items 18 | */ 19 | default void onSaveSuccess(Task task, ResultItem resultItem) { 20 | } 21 | 22 | /** 23 | * On save error. 
24 | * 25 | * @param task the task 26 | * @param resultItem the result items 27 | */ 28 | default void onSaveError(Task task, ResultItem resultItem) { 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/ResponseContextHolder.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import site.zido.elise.custom.Config; 6 | import site.zido.elise.custom.GlobalConfig; 7 | import site.zido.elise.http.Body; 8 | import site.zido.elise.http.Response; 9 | import site.zido.elise.http.impl.DefaultResponse; 10 | 11 | import java.nio.charset.Charset; 12 | 13 | public class ResponseContextHolder extends DefaultResponse { 14 | private String html; 15 | private Document document; 16 | private Charset charset; 17 | 18 | public ResponseContextHolder(Response response, Config config) { 19 | super(response); 20 | Body body = response.getBody(); 21 | if (body != null) { 22 | Charset encoding = body.getEncoding(); 23 | String configCharset = config.get(GlobalConfig.KEY_CHARSET); 24 | this.charset = encoding == null ? 
Charset.forName(configCharset) : encoding; 25 | } 26 | } 27 | 28 | public String getHtml() { 29 | if (getBody() == null) { 30 | this.html = ""; 31 | return ""; 32 | } 33 | if (html == null) { 34 | html = new String(getBody().getBytes(), charset); 35 | } 36 | return html; 37 | } 38 | 39 | public Document getDocument() { 40 | if (html == null) { 41 | getHtml(); 42 | } 43 | if (document == null) { 44 | String url = getUrl(); 45 | document = Jsoup.parse(html, url); 46 | } 47 | return document; 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/ResponseHolder.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import site.zido.elise.custom.Config; 6 | import site.zido.elise.http.Response; 7 | 8 | import java.nio.charset.Charset; 9 | 10 | public class ResponseHolder { 11 | private String html; 12 | private Document document; 13 | private String url; 14 | private byte[] bytes; 15 | private Charset charset; 16 | 17 | public static ResponseHolder create(String url, byte[] bytes, Charset charset) { 18 | ResponseHolder holder = new ResponseHolder(); 19 | holder.url = url; 20 | holder.bytes = bytes; 21 | holder.charset = charset; 22 | return holder; 23 | } 24 | 25 | public static ResponseHolder create(Response response, Config config) { 26 | return null; 27 | } 28 | 29 | public String getHtml() { 30 | if (html == null) { 31 | html = new String(bytes, charset); 32 | } 33 | return html; 34 | } 35 | 36 | public void setHtml(String html) { 37 | this.html = html; 38 | } 39 | 40 | public Document getDocument() { 41 | if (document == null) { 42 | document = Jsoup.parse(getHtml(), url); 43 | } 44 | return document; 45 | } 46 | 47 | public void setDocument(Document document) { 48 | this.document = document; 49 | } 50 | 51 | public String 
getUrl() { 52 | return url; 53 | } 54 | 55 | public void setUrl(String url) { 56 | this.url = url; 57 | } 58 | 59 | public byte[] getBytes() { 60 | return bytes; 61 | } 62 | 63 | public void setBytes(byte[] bytes) { 64 | this.bytes = bytes; 65 | } 66 | 67 | public Charset getCharset() { 68 | return charset; 69 | } 70 | 71 | public void setCharset(Charset charset) { 72 | this.charset = charset; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/ResponseProcessor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | import site.zido.elise.http.Response; 4 | import site.zido.elise.select.SelectorMatchException; 5 | import site.zido.elise.task.Task; 6 | 7 | import java.util.Set; 8 | 9 | /** 10 | * the page processor 11 | * 12 | * @author zido 13 | */ 14 | public interface ResponseProcessor { 15 | /** 16 | * process the response, extract urls to fetch, extract the data and store. 17 | * 18 | * @param task task. 19 | * @param response response. 
20 | * @return results set 21 | * @throws SelectorMatchException when selector matcher error 22 | */ 23 | Set process(Task task, Response response) throws SelectorMatchException; 24 | } 25 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/ResultItem.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | import site.zido.elise.select.FieldType; 4 | 5 | import java.util.LinkedHashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | /** 10 | * 结果集 11 | * 12 | * @author zido 13 | */ 14 | public class ResultItem { 15 | private String name; 16 | private Map> fields = new LinkedHashMap<>(); 17 | private Map fieldTypeMap = new LinkedHashMap<>(); 18 | 19 | /** 20 | * Instantiates a new Result item. 21 | */ 22 | public ResultItem() { 23 | } 24 | 25 | /** 26 | * Get object. 27 | * 28 | * @param key the key 29 | * @return the object 30 | */ 31 | public Object get(String key) { 32 | Object o = fields.get(key); 33 | if (o == null) { 34 | return null; 35 | } 36 | return fields.get(key); 37 | } 38 | 39 | /** 40 | * Gets type. 41 | * 42 | * @param key the key 43 | * @return the type 44 | */ 45 | public FieldType getType(String key) { 46 | return fieldTypeMap.get(key); 47 | } 48 | 49 | /** 50 | * Sets type. 51 | * 52 | * @param key the key 53 | * @param type the type 54 | * @return the type 55 | */ 56 | public ResultItem setType(String key, FieldType type) { 57 | fieldTypeMap.put(key, type); 58 | return this; 59 | } 60 | 61 | /** 62 | * Gets all. 63 | * 64 | * @return the all 65 | */ 66 | public Map> getAll() { 67 | return fields; 68 | } 69 | 70 | /** 71 | * Put result item. 
72 | * 73 | * @param key the key 74 | * @param value the value 75 | * @return the result item 76 | */ 77 | public ResultItem put(String key, List value) { 78 | fields.put(key, value); 79 | return this; 80 | } 81 | 82 | /** 83 | * Put result item. 84 | * 85 | * @param key the key 86 | * @param value the value 87 | * @param type the type 88 | * @return the result item 89 | */ 90 | public ResultItem put(String key, List value, FieldType type) { 91 | put(key, value); 92 | setType(key, type); 93 | return this; 94 | } 95 | 96 | 97 | /** 98 | * Gets name. 99 | * 100 | * @return the name 101 | */ 102 | public String getName() { 103 | return name; 104 | } 105 | 106 | /** 107 | * Sets name. 108 | * 109 | * @param name the name 110 | */ 111 | public void setName(String name) { 112 | this.name = name; 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/Saver.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | import site.zido.elise.task.Task; 4 | 5 | /** 6 | * Result output handler. 7 | * 8 | * @author zido 9 | */ 10 | public interface Saver { 11 | /** 12 | * Handles a result. 13 | * 14 | * @param resultItem the result set 15 | * @param task the task 16 | */ 17 | void save(ResultItem resultItem, Task task); 18 | 19 | /** 20 | * Next result item. 21 | * 22 | * @param task the task 23 | * @param item the item 24 | * @return the result item 25 | */ 26 | ResultItem next(Task task, ResultItem item); 27 | 28 | /** 29 | * Has next boolean. 30 | * 31 | * @param task the task 32 | * @param item the item 33 | * @return the boolean 34 | */ 35 | boolean hasNext(Task task, ResultItem item); 36 | 37 | /** 38 | * First result item. 39 | * 40 | * @param task the task 41 | * @return the result item 42 | */ 43 | ResultItem first(Task task); 44 | 45 | /** 46 | * Size int. 
47 | * 48 | * @param task the task 49 | * @return the int 50 | */ 51 | int size(Task task); 52 | } 53 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/processor/SaverHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.processor; 2 | 3 | /** 4 | * The interface Saver handler. 5 | * 6 | * @author zido 7 | */ 8 | public interface SaverHandler { 9 | /** 10 | * Gets name. 11 | * 12 | * @return the name 13 | */ 14 | String getName(); 15 | } 16 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/proxy/Proxy.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.proxy; 2 | 3 | /** 4 | * http proxy 5 | * 6 | * @author zido 7 | */ 8 | public class Proxy { 9 | 10 | private String host; 11 | private int port; 12 | private String username; 13 | private String password; 14 | 15 | private Proxy() { 16 | 17 | } 18 | 19 | /** 20 | * Instantiates a new Proxy. 21 | * 22 | * @param host the host 23 | * @param port the port 24 | */ 25 | public Proxy(String host, int port) { 26 | this.host = host; 27 | this.port = port; 28 | } 29 | 30 | /** 31 | * Instantiates a new Proxy. 32 | * 33 | * @param host the host 34 | * @param port the port 35 | * @param username the username 36 | * @param password the password 37 | */ 38 | public Proxy(String host, int port, String username, String password) { 39 | this.host = host; 40 | this.port = port; 41 | this.username = username; 42 | this.password = password; 43 | } 44 | 45 | /** 46 | * Gets host. 47 | * 48 | * @return the host 49 | */ 50 | public String getHost() { 51 | return host; 52 | } 53 | 54 | /** 55 | * Gets port. 56 | * 57 | * @return the port 58 | */ 59 | public int getPort() { 60 | return port; 61 | } 62 | 63 | /** 64 | * Gets username. 
65 | * 66 | * @return the username 67 | */ 68 | public String getUsername() { 69 | return username; 70 | } 71 | 72 | /** 73 | * Gets password. 74 | * 75 | * @return the password 76 | */ 77 | public String getPassword() { 78 | return password; 79 | } 80 | 81 | @Override 82 | public boolean equals(Object o) { 83 | if (this == o) { 84 | return true; 85 | } 86 | if (o == null || getClass() != o.getClass()) { 87 | return false; 88 | } 89 | 90 | Proxy proxy = (Proxy) o; 91 | 92 | if (port != proxy.port) { 93 | return false; 94 | } 95 | if (host != null ? !host.equals(proxy.host) : proxy.host != null) { 96 | return false; 97 | } 98 | if (username != null ? !username.equals(proxy.username) : proxy.username != null) { 99 | return false; 100 | } 101 | return password != null ? password.equals(proxy.password) : proxy.password == null; 102 | } 103 | 104 | @Override 105 | public int hashCode() { 106 | int result = host != null ? host.hashCode() : 0; 107 | result = 31 * result + port; 108 | result = 31 * result + (username != null ? username.hashCode() : 0); 109 | result = 31 * result + (password != null ? password.hashCode() : 0); 110 | return result; 111 | } 112 | 113 | /** 114 | * Describes the proxy for logs and debugging; a present password is masked so credentials never end up in log output. 115 | */ 116 | @Override 117 | public String toString() { 118 | return "Proxy{" + 119 | "host='" + host + '\'' + 120 | ", port=" + port + 121 | ", username='" + username + '\'' + 122 | ", password='" + (password == null ? null : "******") + '\'' + 123 | '}'; 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/proxy/ProxyProvider.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.proxy; 2 | 3 | import site.zido.elise.http.impl.DefaultResponse; 4 | import site.zido.elise.task.Task; 5 | 6 | /** 7 | * Proxy provider. 8 | * 9 | * @author zido 10 | */ 11 | public interface ProxyProvider { 12 | 13 | /** 14 | * Return proxy to Provider when complete a download. 
15 | * 16 | * @param proxy the proxy config contains host,port and identify info 17 | * @param response the download result 18 | * @param task the download task 19 | */ 20 | void returnProxy(Proxy proxy, DefaultResponse response, Task task); 21 | 22 | /** 23 | * Get a proxy for task by some strategy. 24 | * 25 | * @param task the download task 26 | * @return proxy proxy 27 | */ 28 | Proxy getProxy(Task task); 29 | 30 | } 31 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/proxy/SimpleProxyProvider.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.proxy; 2 | 3 | import site.zido.elise.http.impl.DefaultResponse; 4 | import site.zido.elise.task.Task; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Collections; 8 | import java.util.List; 9 | import java.util.concurrent.atomic.AtomicInteger; 10 | 11 | /** 12 | * A simple ProxyProvider. Provide proxy as round-robin without heartbeat and error check. It can be used when all proxies are stable. 13 | * 14 | * @author zido 15 | */ 16 | public class SimpleProxyProvider implements ProxyProvider { 17 | 18 | private final List proxies; 19 | 20 | private final AtomicInteger pointer; 21 | 22 | /** 23 | * Instantiates a new Simple proxy provider. 24 | * 25 | * @param proxies the proxies 26 | */ 27 | public SimpleProxyProvider(List proxies) { 28 | this(proxies, new AtomicInteger(-1)); 29 | } 30 | 31 | private SimpleProxyProvider(List proxies, AtomicInteger pointer) { 32 | this.proxies = proxies; 33 | this.pointer = pointer; 34 | } 35 | 36 | /** 37 | * From simple proxy provider. 38 | * 39 | * @param proxies the proxies 40 | * @return the simple proxy provider 41 | */ 42 | public static SimpleProxyProvider from(Proxy... 
proxies) { 43 | List proxiesTemp = new ArrayList<>(proxies.length); 44 | Collections.addAll(proxiesTemp, proxies); 45 | return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp)); 46 | } 47 | 48 | @Override 49 | public void returnProxy(Proxy proxy, DefaultResponse response, Task task) { 50 | } 51 | 52 | /** 53 | * Picks the next proxy in round-robin order. 54 | * 55 | * @param task the download task (not used by this implementation) 56 | * @return the next proxy 57 | * @throws IllegalStateException if no proxies were configured 58 | */ 59 | @Override 60 | public Proxy getProxy(Task task) { 61 | if (proxies.isEmpty()) { 62 | // Fail with a clear message instead of a division by zero in the index computation. 63 | throw new IllegalStateException("no proxy available"); 64 | } 65 | return proxies.get(incrForLoop()); 66 | } 67 | 68 | /** 69 | * Atomically advances the pointer and wraps it around the list size. 70 | * 71 | * @return the index of the proxy to hand out, always within [0, size) 72 | */ 73 | private int incrForLoop() { 74 | final int size = proxies.size(); 75 | // One atomic step, so concurrent callers can neither skip nor repeat an index while wrapping. 76 | return pointer.updateAndGet(current -> (current + 1) % size); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/CountManager.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import site.zido.elise.task.Task; 4 | 5 | import java.util.EventListener; 6 | 7 | /** 8 | * The interface Count manager. 9 | * 10 | * @author zido 11 | */ 12 | public interface CountManager { 13 | /** 14 | * Count int. 15 | * 16 | * @param task the task 17 | * @return the int 18 | */ 19 | int count(Task task); 20 | 21 | /** 22 | * Incr. 23 | * 24 | * @param task the task 25 | * @param num the num 26 | * @param listener the listener 27 | */ 28 | void incr(Task task, int num, CountListener listener); 29 | 30 | /** 31 | * Incr. 32 | * 33 | * @param task the task 34 | * @param num the num 35 | */ 36 | default void incr(Task task, int num) { 37 | incr(task, num, null); 38 | } 39 | 40 | /** 41 | * The interface Count listener. 42 | * 43 | * @author zido 44 | */ 45 | @FunctionalInterface 46 | interface CountListener extends EventListener { 47 | /** 48 | * Result. 
49 | * 50 | * @param num the num 51 | */ 52 | void result(int num); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/DefaultMemoryCountManager.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | import site.zido.elise.task.Task; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | 10 | /** 11 | * The type Default memory count manager. 12 | * 13 | * @author zido 14 | */ 15 | public class DefaultMemoryCountManager implements CountManager { 16 | private static final Logger LOGGER = LoggerFactory.getLogger(DefaultMemoryCountManager.class); 17 | private Map container = new HashMap<>(); 18 | 19 | /** 20 | * Reads the current count of a task. 21 | * 22 | * Synchronized on the same monitor as incr, so the plain HashMap is never read while it is being modified. 23 | * 24 | * @param task the task 25 | * @return the accumulated count, or 0 if nothing has been counted for the task 26 | */ 27 | @Override 28 | public synchronized int count(Task task) { 29 | final Integer number = container.get(task.getId()); 30 | return number == null ? 0 : number; 31 | } 32 | 33 | /** 34 | * Adds num to the count of a task and reports the new total. 35 | * 36 | * @param task the task 37 | * @param num the amount to add 38 | * @param listener receives the new total; may be null 39 | */ 40 | @Override 41 | public synchronized void incr(Task task, int num, CountListener listener) { 42 | final int i = container.getOrDefault(task.getId(), 0) + num; 43 | container.put(task.getId(), i); 44 | LOGGER.debug("count result:" + i); 45 | if (listener != null) { 46 | listener.result(i); 47 | } 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/DefaultOperator.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import site.zido.elise.Operator; 4 | import site.zido.elise.events.SingleEventListener; 5 | import site.zido.elise.events.SingleListenerContainer; 6 | import site.zido.elise.http.Request; 7 | import site.zido.elise.http.RequestBuilder; 8 | import site.zido.elise.task.Task; 9 | import site.zido.elise.utils.Asserts; 10 | 11 | import 
java.util.concurrent.locks.Condition; 12 | import java.util.concurrent.locks.Lock; 13 | import java.util.concurrent.locks.ReentrantLock; 14 | 15 | /** 16 | * default operator. 17 | * 18 | * @author zido 19 | */ 20 | public class DefaultOperator implements Operator, SingleListenerContainer.RecyclingCallback { 21 | private final Task task; 22 | private final AbstractScheduler scheduler; 23 | private final SingleListenerContainer container; 24 | private Lock lock = new ReentrantLock(); 25 | private Condition condition = lock.newCondition(); 26 | // Set once onRecycling() has run; guarded by lock. Lets block() tell a real wake-up from a spurious one. 27 | private boolean recycled; 28 | 29 | /** 30 | * Instantiates a new Default operator. 31 | * 32 | * @param task the task 33 | * @param scheduler the scheduler 34 | */ 35 | public DefaultOperator(Task task, AbstractScheduler scheduler) { 36 | Asserts.notNull(task); 37 | Asserts.notNull(scheduler); 38 | this.scheduler = scheduler; 39 | this.task = task; 40 | container = new SingleListenerContainer(task.getId()); 41 | container.setCallback(this); 42 | scheduler.addEventListener(container); 43 | 44 | } 45 | 46 | @Override 47 | public Operator cancel(boolean ifRunning) { 48 | scheduler.cancel(task, ifRunning); 49 | return this; 50 | } 51 | 52 | @Override 53 | public Operator pause() { 54 | scheduler.pause(task); 55 | return this; 56 | } 57 | 58 | @Override 59 | public Operator recover() { 60 | scheduler.recover(task); 61 | return this; 62 | } 63 | 64 | /** 65 | * Blocks the calling thread until onRecycling() has been invoked. 66 | * 67 | * Returns immediately if that has already happened. 68 | * 69 | * @return this operator 70 | * @throws InterruptedException if the waiting thread is interrupted 71 | */ 72 | @Override 73 | public Operator block() throws InterruptedException { 74 | lock.lock(); 75 | try { 76 | // Loop on the flag: await() may wake spuriously, and the signal may have been sent before this thread started waiting. 77 | while (!recycled) { 78 | condition.await(); 79 | } 80 | } finally { 81 | lock.unlock(); 82 | } 83 | return this; 84 | } 85 | 86 | @Override 87 | public Operator execute(String... 
url) { 88 | for (String s : url) { 89 | scheduler.pushRequest(task, RequestBuilder.get(s).build()); 90 | } 91 | return this; 92 | } 93 | 94 | @Override 95 | public Operator execute(Request request) { 96 | scheduler.pushRequest(task, request); 97 | return this; 98 | } 99 | 100 | @Override 101 | public Operator addEventListener(SingleEventListener listener) { 102 | container.addListener(listener); 103 | return this; 104 | } 105 | 106 | /** 107 | * Detaches the listener container from the scheduler and releases every thread waiting in block(). 108 | */ 109 | @Override 110 | public void onRecycling() { 111 | lock.lock(); 112 | try { 113 | scheduler.removeEventListener(container); 114 | recycled = true; 115 | // Wake all waiters, not just one: several threads may be parked in block(). 116 | condition.signalAll(); 117 | } finally { 118 | lock.unlock(); 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/DuplicationProcessor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.task.Task; 5 | 6 | /** 7 | * duplication processor,It is an extension of the scheduler{@link TaskScheduler}. 8 | * Implementing this interface allows Task Scheduler to handle duplicate data 9 | * 10 | * @author zido 11 | */ 12 | public interface DuplicationProcessor { 13 | 14 | /** 15 | * Determine if this request is duplicate 16 | * 17 | * @param task the task 18 | * @param request request 19 | * @return true /false 20 | */ 21 | boolean isDuplicate(Task task, Request request); 22 | 23 | /** 24 | * Reset all non-repeating sets of this task so that the task can send the same request as before 25 | * 26 | * @param task This is the task ID that needs to be cleared this time. It can be set according to its internal rules. try to ensure that this clearing is only relevant to the current task. 
27 | */ 28 | void resetDuplicateCheck(Task task); 29 | 30 | /** 31 | * Get the number of all requests provided under the current task 32 | * 33 | * @param task task 34 | * @return downloaderSize of all request under the current task 35 | */ 36 | int getTotalRequestsCount(Task task); 37 | } 38 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/HashSetDeduplicationProcessor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.task.Task; 5 | 6 | import java.util.Collections; 7 | import java.util.Set; 8 | import java.util.concurrent.ConcurrentHashMap; 9 | 10 | /** 11 | * Remove duplicate task processor using hash set. 12 | *

13 | * It is process isolation and thread-safe{@link ConcurrentHashMap} 14 | * 15 | * @author zido 16 | */ 17 | public class HashSetDeduplicationProcessor implements DuplicationProcessor { 18 | private Set urls = Collections.newSetFromMap(new ConcurrentHashMap<>()); 19 | 20 | @Override 21 | public boolean isDuplicate(Task task, Request request) { 22 | return !urls.add(getUrl(request)); 23 | } 24 | 25 | /** 26 | * Gets url. 27 | * 28 | * @param request the request 29 | * @return the url 30 | */ 31 | protected String getUrl(Request request) { 32 | return request.getUrl(); 33 | } 34 | 35 | @Override 36 | public void resetDuplicateCheck(Task task) { 37 | urls.clear(); 38 | } 39 | 40 | @Override 41 | public int getTotalRequestsCount(Task task) { 42 | return urls.size(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/MonitorableScheduler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | /** 4 | * Monitorable Manager 5 | * 6 | * @author zido 7 | */ 8 | public interface MonitorableScheduler { 9 | 10 | /** 11 | * See how many messages are in the message container 12 | * 13 | * @return the size of message container 14 | */ 15 | int blockSize(); 16 | 17 | /** 18 | * Check if the message container is empty 19 | * 20 | * @return true /false 21 | */ 22 | default boolean empty() { 23 | return blockSize() == 0; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/NoDepuplicationProcessor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.task.Task; 5 | 6 | /** 7 | * NoDepuplicationProcessor 8 | * 9 | * @author zido 10 | */ 11 | public class 
NoDepuplicationProcessor implements DuplicationProcessor { 12 | @Override 13 | public boolean isDuplicate(Task task, Request request) { 14 | return false; 15 | } 16 | 17 | @Override 18 | public void resetDuplicateCheck(Task task) { 19 | 20 | } 21 | 22 | @Override 23 | public int getTotalRequestsCount(Task task) { 24 | return 0; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/OperationalTaskScheduler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import site.zido.elise.task.Task; 4 | 5 | /** 6 | * The interface Operational task scheduler. 7 | * 8 | * @author zido 9 | */ 10 | public interface OperationalTaskScheduler extends TaskScheduler { 11 | /** 12 | * Cancel. 13 | * 14 | * @param ifRunning the if running 15 | */ 16 | void cancel(boolean ifRunning); 17 | 18 | /** 19 | * Cancel boolean. 20 | * 21 | * @param task the task 22 | * @param ifRunning the if running 23 | * @return the boolean 24 | */ 25 | boolean cancel(Task task, boolean ifRunning); 26 | 27 | /** 28 | * Pause boolean. 29 | * 30 | * @param task the task 31 | */ 32 | void pause(Task task); 33 | 34 | /** 35 | * Recover. 36 | * 37 | * @param task the task 38 | */ 39 | void recover(Task task); 40 | } 41 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/Seed.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.http.Response; 5 | import site.zido.elise.http.impl.DefaultResponse; 6 | import site.zido.elise.task.Task; 7 | 8 | import java.io.Serializable; 9 | import java.util.Objects; 10 | 11 | /** 12 | * The type Seed. 
13 | * 14 | * @author zido 15 | */ 16 | public class Seed implements Serializable { 17 | private static final long serialVersionUID = 6615813166213363435L; 18 | private Task task; 19 | private Request request; 20 | private Response response; 21 | 22 | /** 23 | * Instantiates a new Seed. 24 | * 25 | * @param task the task 26 | * @param request the request 27 | * @param response the response 28 | */ 29 | public Seed(Task task, Request request, Response response) { 30 | this.task = task; 31 | this.request = request; 32 | this.response = response; 33 | } 34 | 35 | /** 36 | * Instantiates a new Seed. 37 | * 38 | * @param task the task 39 | * @param request the request 40 | */ 41 | public Seed(Task task, Request request) { 42 | this.task = task; 43 | this.request = request; 44 | } 45 | 46 | /** 47 | * Gets task. 48 | * 49 | * @return the task 50 | */ 51 | public Task getTask() { 52 | return task; 53 | } 54 | 55 | /** 56 | * Sets task. 57 | * 58 | * @param task the task 59 | */ 60 | public void setTask(Task task) { 61 | this.task = task; 62 | } 63 | 64 | /** 65 | * Gets request. 66 | * 67 | * @return the request 68 | */ 69 | public Request getRequest() { 70 | return request; 71 | } 72 | 73 | /** 74 | * Sets request. 75 | * 76 | * @param request the request 77 | */ 78 | public void setRequest(Request request) { 79 | this.request = request; 80 | } 81 | 82 | /** 83 | * Gets response. 84 | * 85 | * @return the response 86 | */ 87 | public Response getResponse() { 88 | return response; 89 | } 90 | 91 | /** 92 | * Sets response. 
93 | * 94 | * @param response the response 95 | */ 96 | public void setResponse(DefaultResponse response) { 97 | this.response = response; 98 | } 99 | 100 | @Override 101 | public boolean equals(Object o) { 102 | if (this == o) { 103 | return true; 104 | } 105 | if (o == null || getClass() != o.getClass()) { 106 | return false; 107 | } 108 | Seed seed = (Seed) o; 109 | return Objects.equals(task, seed.task) && 110 | Objects.equals(request, seed.request) && 111 | Objects.equals(response, seed.response); 112 | } 113 | 114 | @Override 115 | public int hashCode() { 116 | return Objects.hash(task, request, response); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/scheduler/TaskScheduler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.scheduler; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.task.Task; 5 | 6 | /** 7 | * the interface of message manager,it provide message service. 8 | *

9 | * Each client corresponds to a messageManager, and multiple clients should instantiate multiple message managers. 10 | *

11 | * In theory, the client is based on statelessness, either as a download client, an analytics client, or both. 12 | * Just simply register as the appropriate module 13 | * 14 | * @author zido 15 | */ 16 | public interface TaskScheduler { 17 | 18 | /** 19 | * If you need to download, you can call this method (usually after the analysis is completed) 20 | * 21 | * @param task the task 22 | * @param request the request 23 | */ 24 | void pushRequest(Task task, Request request); 25 | } 26 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/CssSelectHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import org.jsoup.nodes.Element; 4 | import org.jsoup.nodes.Node; 5 | import org.jsoup.select.Elements; 6 | import site.zido.elise.processor.ResponseContextHolder; 7 | import site.zido.elise.task.api.Source; 8 | import site.zido.elise.task.model.Action; 9 | import site.zido.elise.utils.Safe; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | import java.util.stream.Collectors; 14 | 15 | /** 16 | * The type Css selector. 
17 | * 18 | * @author zido 19 | */ 20 | public class CssSelectHandler implements SelectHandler { 21 | @Override 22 | public List select(ResponseContextHolder response, Object partition, Action action) throws SelectorMatchException { 23 | Object[] extras = action.getExtras(); 24 | String express = Safe.getStrFromArray(extras, 0); 25 | if ("".equals(express)) { 26 | throw new SelectorMatchException(String.format("the action: [%s] need a string css express but get %s", action.getToken(), extras[0])); 27 | } 28 | Element document = null; 29 | if (Source.matchSource(action.getSource(), Source.PARTITION)) { 30 | List results = new ArrayList<>(); 31 | if (partition instanceof Element) { 32 | document = (Element) partition; 33 | } else if (partition instanceof List) { 34 | for (Object str : (List) partition) { 35 | if (str instanceof Node) { 36 | Elements elements = ((Element) str).select(express); 37 | results.addAll(elements); 38 | } 39 | } 40 | return results; 41 | } 42 | } else if (Source.matchSource(action.getSource(), Source.BODY, Source.HTML)) { 43 | document = response.getDocument(); 44 | } 45 | if (document == null) { 46 | return null; 47 | } 48 | return document.select(express).stream().map(element -> (Object) element).collect(Collectors.toList()); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/CssSelector.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.E; 4 | 5 | public class CssSelector extends ElementSelector { 6 | public CssSelector(String express) { 7 | super(E.Action.CSS_SELECTOR); 8 | super.setExtras(new Object[]{express}); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/ElementSelector.java: 
-------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.task.model.Action; 4 | 5 | /** 6 | * The interface Element selector. 7 | * 8 | * @author zido 9 | */ 10 | public abstract class ElementSelector extends Action { 11 | public ElementSelector(String token) { 12 | super(); 13 | super.setToken(token); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/FieldType.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | /** 4 | * The enum Field type. 5 | * 6 | * @author zido 7 | */ 8 | public enum FieldType { 9 | /** 10 | * Number field type. 11 | */ 12 | NUMBER, 13 | /** 14 | * Text field type. 15 | */ 16 | TEXT, 17 | /** 18 | * Rich field type. 19 | */ 20 | RICH, 21 | /** 22 | * Origin field type. 23 | */ 24 | ORIGIN, 25 | /** 26 | * Xml field type. 27 | */ 28 | XML, 29 | /** 30 | * Html field type. 31 | */ 32 | HTML, 33 | /** 34 | * Bytes field type. 35 | */ 36 | BYTES 37 | } 38 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/Fragment.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import org.jsoup.nodes.Node; 4 | 5 | import java.io.Serializable; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * contains a range of content that may be included in addition to text. 11 | *

12 | * such as images, videos, etc. 13 | * 14 | * @author zido 15 | */ 16 | public class Fragment implements Serializable { 17 | private static final long serialVersionUID = -630439618508226636L; 18 | private List contents = new ArrayList<>(); 19 | 20 | /** 21 | * Instantiates a new Fragment. 22 | */ 23 | public Fragment() { 24 | 25 | } 26 | 27 | /** 28 | * Instantiates a new Fragment. 29 | * 30 | * @param text the text 31 | */ 32 | public Fragment(String text) { 33 | this.add(text, RichType.TEXT); 34 | } 35 | 36 | /** 37 | * Add. 38 | * 39 | * @param node the node 40 | */ 41 | public void add(Node node) { 42 | switch (node.nodeName()) { 43 | case "#text": 44 | add(node.attr("text"), RichType.TEXT); 45 | break; 46 | case "#comment": 47 | add(node.attr("comment"), RichType.TEXT); 48 | break; 49 | case "#data": 50 | add(node.attr("data"), RichType.TEXT); 51 | break; 52 | case "img": 53 | add(node.attr("src"), RichType.IMAGE); 54 | break; 55 | case "audio": 56 | add(node.attr("src"), RichType.AUDIO); 57 | break; 58 | case "video": 59 | add(node.attr("src"), RichType.VIDEO); 60 | break; 61 | case "#doctype": 62 | case "#declaration": 63 | break;//can't support 64 | case "a": 65 | //mark like this:

somethings

other things 66 | //and then,the result like this:START_TAG|href=http://www.baidu.com|something|other things|END_TAG 67 | add("", RichType.CONTENT_START); 68 | add(node.attr("abs:href"), RichType.LINK); 69 | List linkNodes = node.childNodes(); 70 | for (Node linkNode : linkNodes) { 71 | add(linkNode); 72 | } 73 | add("", RichType.CONTENT_END); 74 | break; 75 | default: 76 | List nodes = node.childNodes(); 77 | for (Node child : nodes) { 78 | add(child); 79 | } 80 | } 81 | } 82 | 83 | /** 84 | * Add. 85 | * 86 | * @param raw the raw 87 | * @param type the type 88 | */ 89 | public void add(String raw, RichType type) { 90 | this.contents.add(new Paragraph(raw, type)); 91 | } 92 | 93 | /** 94 | * Gets contents. 95 | * 96 | * @return the contents 97 | */ 98 | public List getContents() { 99 | return contents; 100 | } 101 | 102 | /** 103 | * Text string. 104 | * 105 | * @return the string 106 | */ 107 | public String text() { 108 | StringBuilder sb = new StringBuilder(); 109 | for (Paragraph content : contents) { 110 | if (content.getType() == RichType.TEXT) { 111 | sb.append(content.getRaw()); 112 | } 113 | } 114 | return sb.toString(); 115 | } 116 | 117 | @Override 118 | public String toString() { 119 | if (contents.size() == 0) { 120 | return ""; 121 | } 122 | StringBuilder sb = new StringBuilder(); 123 | for (Paragraph content : contents) { 124 | sb.append(content).append(","); 125 | } 126 | sb.deleteCharAt(sb.length() - 1); 127 | return sb.toString(); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/HtmlLinkSelector.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.E; 4 | 5 | public class HtmlLinkSelector extends ElementSelector { 6 | public HtmlLinkSelector(String regex) { 7 | super(E.Action.LINK_SELECTOR); 8 | super.setExtras(new String[]{regex}); 9 | } 10 | } 
11 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/LinkSelectHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import org.jsoup.nodes.Document; 4 | import org.jsoup.nodes.Element; 5 | import org.jsoup.select.Elements; 6 | import site.zido.elise.processor.ResponseContextHolder; 7 | import site.zido.elise.task.api.Source; 8 | import site.zido.elise.task.model.Action; 9 | import site.zido.elise.utils.Safe; 10 | 11 | import java.util.ArrayList; 12 | import java.util.Arrays; 13 | import java.util.List; 14 | import java.util.regex.Pattern; 15 | 16 | /** 17 | * link selector 18 | * 19 | * @author zido 20 | */ 21 | public class LinkSelectHandler implements SelectHandler { 22 | private String[] defaultLinkProps = new String[0]; 23 | 24 | public LinkSelectHandler() { 25 | } 26 | 27 | public LinkSelectHandler(String... defaultLinkProps) { 28 | this.defaultLinkProps = defaultLinkProps; 29 | } 30 | 31 | @Override 32 | public List select(ResponseContextHolder response, Object partition, Action action) throws SelectorMatchException { 33 | Object[] extras = action.getExtras(); 34 | String express = Safe.getStrFromArray(extras, 0); 35 | if ("".equals(express)) { 36 | throw new SelectorMatchException(String.format("the action: [%s] need a string express but get %s", action.getToken(), extras[0])); 37 | } 38 | Pattern pattern = Pattern.compile(express); 39 | Document document = null; 40 | if (Source.matchSource(action.getSource(), Source.PARTITION)) { 41 | List results = new ArrayList<>(); 42 | if (partition instanceof Document) { 43 | document = (Document) partition; 44 | } else if (partition instanceof List) { 45 | for (Object str : (List) partition) { 46 | if (str instanceof String) { 47 | if (pattern.matcher((CharSequence) str).find()) { 48 | results.add(str); 49 | } 50 | } 51 | } 52 | return results; 53 | } 
54 | } else if (Source.matchSource(action.getSource(), Source.BODY, Source.HTML)) { 55 | document = response.getDocument(); 56 | } 57 | if (document == null) { 58 | return null; 59 | } 60 | List results = new ArrayList<>(); 61 | Object[] tmp; 62 | if (extras.length == 1) { 63 | tmp = defaultLinkProps; 64 | } else { 65 | tmp = Arrays.copyOfRange(extras, 1, extras.length); 66 | } 67 | for (int i = 0; i < tmp.length; i++) { 68 | if (!(tmp[i] instanceof String)) { 69 | throw new SelectorMatchException(String.format("the action: [%s] need param like [a:href] but get %s", action.getToken(), tmp[i])); 70 | } 71 | String linkProp = (String) tmp[i]; 72 | String[] split = linkProp.split(":"); 73 | if (split.length != 2) { 74 | throw new SelectorMatchException(String.format("the action: [%s] need param like [a:href] but get %s", action.getToken(), linkProp)); 75 | } 76 | String tagName = split[0]; 77 | String attr = split[1]; 78 | Elements elements = document.select(tagName + "[" + attr + "]"); 79 | for (Element element : elements) { 80 | String href = element.attr("abs:" + attr); 81 | if (pattern.matcher(href).find()) { 82 | results.add(href); 83 | } 84 | } 85 | } 86 | return results; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/ModelExtractor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.processor.ProcessorEventListener; 4 | import site.zido.elise.processor.Saver; 5 | import site.zido.elise.task.api.SelectableResponse; 6 | 7 | import java.util.List; 8 | import java.util.Set; 9 | 10 | /** 11 | * model extractor 12 | * 13 | * @author zido 14 | */ 15 | public interface ModelExtractor { 16 | 17 | /** 18 | * Extract result item. 
19 | * 20 | * @param response the response 21 | * @param saver the saver 22 | * @param listeners the listeners 23 | * @return the result item 24 | */ 25 | List extract(SelectableResponse response, Saver saver, Set listeners); 26 | 27 | } 28 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/OriginSelectorHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.processor.ResponseContextHolder; 4 | import site.zido.elise.task.api.Source; 5 | import site.zido.elise.task.model.Action; 6 | 7 | import java.util.Collections; 8 | import java.util.List; 9 | 10 | public class OriginSelectorHandler implements SelectHandler { 11 | @Override 12 | public List select(ResponseContextHolder response, Object partition, Action action) throws SelectorMatchException { 13 | String source = action.getSource(); 14 | if (Source.matchSource(source, Source.URL)) { 15 | return Collections.singletonList(response.getUrl()); 16 | } else if (Source.matchSource(source, Source.CODE)) { 17 | return Collections.singletonList(response.getStatusCode()); 18 | } else if (Source.matchSource(source, Source.HTML, Source.TEXT)) { 19 | return Collections.singletonList(response.getHtml()); 20 | } 21 | return null; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/Paragraph.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * The type Paragraph. 7 | * 8 | * @author zido 9 | */ 10 | class Paragraph implements Serializable { 11 | private static final long serialVersionUID = 2429211411138531885L; 12 | private RichType type; 13 | private String raw; 14 | 15 | /** 16 | * Instantiates a new Paragraph. 
17 | */ 18 | public Paragraph() { 19 | 20 | } 21 | 22 | /** 23 | * Instantiates a new Paragraph. 24 | * 25 | * @param raw the raw 26 | * @param type the type 27 | */ 28 | public Paragraph(String raw, RichType type) { 29 | this.type = type; 30 | this.raw = raw; 31 | } 32 | 33 | /** 34 | * Gets type. 35 | * 36 | * @return the type 37 | */ 38 | public RichType getType() { 39 | return type; 40 | } 41 | 42 | /** 43 | * Sets type. 44 | * 45 | * @param type the type 46 | */ 47 | public void setType(RichType type) { 48 | this.type = type; 49 | } 50 | 51 | /** 52 | * Gets raw. 53 | * 54 | * @return the raw 55 | */ 56 | public String getRaw() { 57 | return raw; 58 | } 59 | 60 | /** 61 | * Sets raw. 62 | * 63 | * @param raw the raw 64 | */ 65 | public void setRaw(String raw) { 66 | this.raw = raw; 67 | } 68 | 69 | @Override 70 | public String toString() { 71 | return type.name() + ":" + raw; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/RegexSelectHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.processor.ResponseContextHolder; 4 | import site.zido.elise.task.api.Source; 5 | import site.zido.elise.task.model.Action; 6 | import site.zido.elise.utils.Safe; 7 | 8 | import java.util.LinkedList; 9 | import java.util.List; 10 | import java.util.regex.Matcher; 11 | import java.util.regex.Pattern; 12 | 13 | /** 14 | * The type Regex selector. 
15 | * 16 | * @author zido 17 | */ 18 | public class RegexSelectHandler implements SelectHandler { 19 | @Override 20 | public List select(ResponseContextHolder response, Object partition, Action action) throws SelectorMatchException { 21 | String source = action.getSource(); 22 | Object[] extras = action.getExtras(); 23 | String regex = Safe.getStrFromArray(extras, 0); 24 | if ("".equalsIgnoreCase(regex)) { 25 | throw new SelectorMatchException(String.format("the action: [%s] need regex express but get %s", action.getToken(), regex)); 26 | } 27 | int flags = Safe.getIntFromArray(extras, 1, 0); 28 | int group = Safe.getIntFromArray(extras, 2, 0); 29 | Pattern pattern = Pattern.compile(regex, flags); 30 | if (Source.matchSource(source, Source.BODY, Source.HTML, Source.TEXT)) { 31 | Matcher matcher = pattern.matcher(response.getHtml()); 32 | List results = new LinkedList<>(); 33 | while (matcher.find()) { 34 | results.add(matcher.group(group)); 35 | } 36 | return results; 37 | } else if (Source.matchSource(source, Source.URL)) { 38 | Matcher matcher = pattern.matcher(response.getUrl()); 39 | List results = new LinkedList<>(); 40 | if (matcher.find()) { 41 | results.add(matcher.group(group)); 42 | } 43 | return results; 44 | } 45 | return null; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/RichType.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | /** 4 | * The enum Rich type. 5 | * 6 | * @author zido 7 | */ 8 | public enum RichType { 9 | /** 10 | * Text rich type. 11 | */ 12 | TEXT, 13 | /** 14 | * Image rich type. 15 | */ 16 | IMAGE, 17 | /** 18 | * Audio rich type. 19 | */ 20 | AUDIO, 21 | /** 22 | * Video rich type. 23 | */ 24 | VIDEO, 25 | /** 26 | * Link rich type. 27 | */ 28 | LINK, 29 | /** 30 | * Content start rich type. 
31 | */ 32 | CONTENT_START, 33 | /** 34 | * Content end rich type. 35 | */ 36 | CONTENT_END 37 | } 38 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/SelectHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.processor.ResponseContextHolder; 4 | import site.zido.elise.task.model.Action; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * The interface SelectHandler. 10 | * 11 | * @author zido 12 | */ 13 | public interface SelectHandler { 14 | List select(ResponseContextHolder response, Object partition, Action action) throws SelectorMatchException; 15 | } 16 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/SelectorMatchException.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | public class SelectorMatchException extends Exception { 4 | private static final long serialVersionUID = -6677452217814605384L; 5 | 6 | public SelectorMatchException(String msg) { 7 | super(msg); 8 | } 9 | 10 | public SelectorMatchException(Throwable e) { 11 | super(e); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/SiteMatcherSelectHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.processor.ResponseContextHolder; 4 | import site.zido.elise.task.api.Source; 5 | import site.zido.elise.task.model.Action; 6 | import site.zido.elise.utils.Safe; 7 | 8 | import java.util.Collections; 9 | import java.util.List; 10 | 11 | /** 12 | * The type Site matcher. 
13 | * 14 | * @author zido 15 | */ 16 | public class SiteMatcherSelectHandler implements SelectHandler { 17 | /** 18 | * a character can match any single character 19 | */ 20 | private static final char SINGLE_MATCH_CHAR = '?'; 21 | /** 22 | * a character can match any number of characters 23 | */ 24 | private static final char MORE_MATCH_CHAR = '*'; 25 | 26 | private boolean match(char[] origin, char[] express, int originIndex, int expressIndex) { 27 | if (originIndex == origin.length && expressIndex == express.length) { 28 | return true; 29 | } 30 | if (expressIndex == express.length) { 31 | return false; 32 | } 33 | if (express[expressIndex] == MORE_MATCH_CHAR && express[expressIndex] + 1 != express.length && originIndex == origin.length) { 34 | return false; 35 | } 36 | if (originIndex == origin.length) { 37 | return false; 38 | } 39 | if (express[expressIndex] == SINGLE_MATCH_CHAR || express[expressIndex] == origin[originIndex]) { 40 | return match(origin, express, originIndex + 1, expressIndex + 1); 41 | } 42 | if (express[expressIndex] == MORE_MATCH_CHAR) { 43 | return match(origin, express, originIndex + 1, expressIndex) || match(origin, express, originIndex, expressIndex + 1); 44 | } 45 | return false; 46 | } 47 | 48 | @Override 49 | public List select(ResponseContextHolder response, Object partition, Action action) throws SelectorMatchException { 50 | Object[] extras = action.getExtras(); 51 | String express = Safe.getStrFromArray(extras, 0); 52 | if ("".equals(express)) { 53 | throw new SelectorMatchException(String.format("the action: [%s] need a string express like [site.zido.*] but get %s", action.getToken(), extras[0])); 54 | } 55 | String source = action.getSource(); 56 | String target; 57 | if (Source.matchSource(source, Source.URL)) { 58 | target = response.getUrl(); 59 | } else { 60 | throw new SelectorMatchException("match site just support response url"); 61 | } 62 | if (match(target.toCharArray(), express.toCharArray(), 0, 0)) { 63 | return 
Collections.singletonList(target); 64 | } 65 | return null; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/XpathSelectHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import com.virjar.sipsoup.exception.XpathSyntaxErrorException; 4 | import com.virjar.sipsoup.model.XpathEvaluator; 5 | import com.virjar.sipsoup.parse.XpathParser; 6 | import org.jsoup.nodes.Element; 7 | import org.jsoup.nodes.Node; 8 | import org.jsoup.select.Elements; 9 | import site.zido.elise.processor.ResponseContextHolder; 10 | import site.zido.elise.task.api.Source; 11 | import site.zido.elise.task.model.Action; 12 | import site.zido.elise.utils.Safe; 13 | 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | import java.util.stream.Collectors; 17 | 18 | /** 19 | * The type X path selector. 20 | * 21 | * @author zido 22 | */ 23 | public class XpathSelectHandler implements SelectHandler { 24 | @Override 25 | public List select(ResponseContextHolder response, Object partition, Action action) throws SelectorMatchException { 26 | Object[] extras = action.getExtras(); 27 | String express = Safe.getStrFromArray(extras, 0); 28 | if ("".equals(express)) { 29 | throw new SelectorMatchException(String.format("the action: [%s] need a xpath string express but get %s", action.getToken(), extras[0])); 30 | } 31 | XpathEvaluator evaluator; 32 | try { 33 | evaluator = XpathParser.compile(express); 34 | } catch (XpathSyntaxErrorException e) { 35 | throw new SelectorMatchException(e); 36 | } 37 | Element document = null; 38 | if (Source.matchSource(action.getSource(), Source.PARTITION)) { 39 | List results = new ArrayList<>(); 40 | if (partition instanceof Element) { 41 | document = (Element) partition; 42 | } else if (partition instanceof List) { 43 | for (Object str : (List) partition) { 44 | if (str 
instanceof Node) { 45 | Elements elements = evaluator.evaluateToElements((Element) str); 46 | results.addAll(elements); 47 | } 48 | } 49 | return results; 50 | } 51 | } else if (Source.matchSource(action.getSource(), Source.BODY, Source.HTML)) { 52 | document = response.getDocument(); 53 | } 54 | if (document == null) { 55 | return null; 56 | } 57 | return evaluator.evaluateToElement(document).stream().map(element -> (Object) element).collect(Collectors.toList()); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/select/XpathSelector.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.select; 2 | 3 | import site.zido.elise.E; 4 | 5 | public class XpathSelector extends ElementSelector { 6 | public XpathSelector(String xpath) { 7 | super(E.Action.XPATH_SELECTOR); 8 | super.setExtras(new Object[]{xpath}); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/DefaultTask.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task; 2 | 3 | import site.zido.elise.custom.Config; 4 | import site.zido.elise.task.model.Model; 5 | 6 | /** 7 | * default extractor task 8 | *
9 | * 10 | * @author zido 11 | */ 12 | public class DefaultTask implements Task { 13 | private Long id; 14 | private Config config; 15 | private Model model; 16 | 17 | /** 18 | * Instantiates a new Default task. 19 | */ 20 | public DefaultTask() { 21 | 22 | } 23 | 24 | /** 25 | * Instantiates a new Default task. 26 | * 27 | * @param id the id 28 | * @param model the extractor model 29 | * @param config the config 30 | */ 31 | public DefaultTask(Long id, Model model, Config config) { 32 | this.id = id; 33 | this.model = model; 34 | this.config = config; 35 | } 36 | 37 | @Override 38 | public long getId() { 39 | return id; 40 | } 41 | 42 | /** 43 | * Sets id. 44 | * 45 | * @param id the id 46 | * @return the id 47 | */ 48 | public DefaultTask setId(Long id) { 49 | this.id = id; 50 | return this; 51 | } 52 | 53 | @Override 54 | public Model getModel() { 55 | return this.model; 56 | } 57 | 58 | public void setModel(Model model) { 59 | this.model = model; 60 | } 61 | 62 | @Override 63 | public Config getConfig() { 64 | return config; 65 | } 66 | 67 | /** 68 | * Sets config. 69 | * 70 | * @param config the config 71 | * @return the config 72 | */ 73 | public DefaultTask setConfig(Config config) { 74 | this.config = config; 75 | return this; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/Task.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task; 2 | 3 | import site.zido.elise.custom.Config; 4 | import site.zido.elise.task.model.Model; 5 | 6 | /** 7 | * Task interface 8 | * 9 | * @author zido 10 | */ 11 | public interface Task { 12 | /** 13 | * Get task id 14 | * 15 | * @return id id 16 | */ 17 | long getId(); 18 | 19 | /** 20 | * get the model extractor 21 | * 22 | * @return extractors model 23 | */ 24 | Model getModel(); 25 | 26 | /** 27 | * Gets config. 
28 | * 29 | * @return the config 30 | */ 31 | Config getConfig(); 32 | } 33 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/annotations/EliseHelper.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.annotations; 2 | 3 | import java.lang.annotation.*; 4 | 5 | @Target(ElementType.TYPE) 6 | @Documented 7 | @Retention(RetentionPolicy.CLASS) 8 | public @interface EliseHelper { 9 | String regex() default ""; 10 | } 11 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/annotations/EliseModel.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.annotations; 2 | 3 | import java.lang.annotation.*; 4 | 5 | @Target(ElementType.TYPE) 6 | @Documented 7 | @Retention(RetentionPolicy.CLASS) 8 | public @interface EliseModel { 9 | String name(); 10 | 11 | boolean nullable() default false; 12 | 13 | 14 | } 15 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/annotations/ElisePartition.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.annotations; 2 | 3 | import java.lang.annotation.*; 4 | 5 | @Target(ElementType.TYPE) 6 | @Documented 7 | @Retention(RetentionPolicy.RUNTIME) 8 | public @interface ElisePartition { 9 | } 10 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/annotations/EliseTarget.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.annotations; 2 | 3 | import java.lang.annotation.*; 4 | 5 | @Target(ElementType.TYPE) 6 | @Documented 7 | @Retention(RetentionPolicy.RUNTIME) 8 | public 
@interface EliseTarget { 9 | String matchUrl() default ""; 10 | 11 | String matchStatusCode() default "200"; 12 | } 13 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/api/DataDescriptor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.api; 2 | 3 | import site.zido.elise.E; 4 | import site.zido.elise.select.FieldType; 5 | import site.zido.elise.task.model.Action; 6 | import site.zido.elise.task.model.ModelField; 7 | 8 | import java.util.Collections; 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | 12 | /** 13 | * Data descriptor. 14 | * describe how to extract data. 15 | * 16 | * @author zido 17 | */ 18 | public class DataDescriptor { 19 | private List fields; 20 | 21 | public DataDescriptor(List fields) { 22 | this.fields = fields; 23 | } 24 | 25 | /** 26 | * Html element selectable. 27 | * 28 | * @return the element selectable 29 | */ 30 | public ElementSelectable html() { 31 | ModelField field = new ModelField(); 32 | fields.add(field); 33 | List actions = new LinkedList<>(); 34 | field.setActions(actions); 35 | return new ElementSelectable(Source.HTML, field, actions); 36 | } 37 | 38 | /** 39 | * get data from url 40 | * 41 | * @return the value descriptor 42 | */ 43 | public Value url() { 44 | return getValue(Source.URL, FieldType.TEXT); 45 | } 46 | 47 | /** 48 | * get data from status code 49 | * 50 | * @return the value descriptor 51 | */ 52 | public Value statusCode() { 53 | return getValue(Source.CODE, FieldType.NUMBER); 54 | } 55 | 56 | private Value getValue(String source, FieldType type) { 57 | ModelField field = new ModelField(); 58 | fields.add(field); 59 | Action action = new Action(); 60 | action.setSource(source); 61 | action.setToken(E.Action.SELECT_ORIGIN); 62 | field.setActions(Collections.singletonList(action)); 63 | field.setValueType(type); 64 | return new Value(field); 65 | } 
66 | } 67 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/api/DefaultSelectableResponse.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.api; 2 | 3 | import site.zido.elise.select.ElementSelector; 4 | import site.zido.elise.task.model.Action; 5 | import site.zido.elise.task.model.Model; 6 | import site.zido.elise.task.model.ModelField; 7 | import site.zido.elise.task.model.Partition; 8 | 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | 12 | public class DefaultSelectableResponse implements SelectableResponse { 13 | private final Model model = new Model(); 14 | 15 | @Override 16 | public SelectableResponse modelName(String name) { 17 | model.setName(name); 18 | return this; 19 | } 20 | 21 | @Override 22 | public TargetDescriptor asTarget() { 23 | List targets = model.getTargets(); 24 | if (targets == null) { 25 | targets = new LinkedList<>(); 26 | model.setTargets(targets); 27 | } 28 | return new TargetDescriptor(targets); 29 | } 30 | 31 | @Override 32 | public HelpDescriptor asHelper() { 33 | List helpers = model.getHelpers(); 34 | if (helpers == null) { 35 | helpers = new LinkedList<>(); 36 | model.setHelpers(helpers); 37 | } 38 | return new HelpDescriptor(helpers); 39 | } 40 | 41 | @Override 42 | public DataDescriptor asContent() { 43 | List fields = model.getFields(); 44 | if (fields == null) { 45 | fields = new LinkedList<>(); 46 | model.setFields(fields); 47 | } 48 | return new DataDescriptor(fields); 49 | } 50 | 51 | @Override 52 | public PartitionDescriptor asPartition(ElementSelector selector) { 53 | Partition partition = new Partition(); 54 | model.setPartition(partition); 55 | partition.setAction(selector); 56 | selector.setSource(Source.HTML); 57 | selector.setChildren(null); 58 | return new PartitionDescriptor(partition); 59 | } 60 | 61 | public Model getModel() { 62 | return 
this.model; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/api/ElementSelectable.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.api; 2 | 3 | import site.zido.elise.select.CssSelector; 4 | import site.zido.elise.select.ElementSelector; 5 | import site.zido.elise.select.XpathSelector; 6 | import site.zido.elise.task.model.Action; 7 | import site.zido.elise.task.model.ModelField; 8 | 9 | import java.util.List; 10 | 11 | /** 12 | * The interface Element selectable. 13 | * 14 | * @author zido 15 | */ 16 | public class ElementSelectable { 17 | private String source; 18 | private ModelField field; 19 | private List actions; 20 | 21 | public ElementSelectable(String source, ModelField field, List actions) { 22 | this.source = source; 23 | this.field = field; 24 | this.actions = actions; 25 | } 26 | 27 | 28 | /** 29 | * Select element value. 30 | * 31 | * @param selector the selector 32 | * @return the element value 33 | */ 34 | public ElementValue select(ElementSelector selector) { 35 | selector.setSource(source); 36 | actions.add(selector); 37 | return new ElementValue(this, field); 38 | } 39 | 40 | /** 41 | * Css element value. 42 | * 43 | * @param css the css 44 | * @return the element value 45 | */ 46 | public ElementValue css(String css) { 47 | return select(new CssSelector(css)); 48 | } 49 | 50 | /** 51 | * Xpath element value. 
52 | * 53 | * @param xpath the xpath 54 | * @return the element value 55 | */ 56 | public ElementValue xpath(String xpath) { 57 | return select(new XpathSelector(xpath)); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/api/ElementValue.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.api; 2 | 3 | import site.zido.elise.select.FieldType; 4 | import site.zido.elise.task.model.ModelField; 5 | 6 | /** 7 | * The interface Element value. 8 | * 9 | * @author zido 10 | */ 11 | public class ElementValue { 12 | private ModelField field; 13 | private ElementSelectable top; 14 | 15 | public ElementValue(ElementSelectable top, ModelField field) { 16 | this.field = field; 17 | this.top = top; 18 | } 19 | 20 | /** 21 | * Text value. 22 | * 23 | * @return the value 24 | */ 25 | public Value text() { 26 | field.setValueType(FieldType.TEXT); 27 | return new Value(field); 28 | } 29 | 30 | /** 31 | * Rich value. 32 | * 33 | * @return the value 34 | */ 35 | public Value rich() { 36 | field.setValueType(FieldType.RICH); 37 | return new Value(field); 38 | } 39 | 40 | /** 41 | * Xml value. 42 | * 43 | * @return the value 44 | */ 45 | public Value xml() { 46 | field.setValueType(FieldType.XML); 47 | return new Value(field); 48 | } 49 | 50 | public ElementSelectable or() { 51 | return top; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/api/HelpDescriptor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.api; 2 | 3 | import site.zido.elise.E; 4 | import site.zido.elise.task.model.Action; 5 | 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | 9 | /** 10 | * The interface Help descriptor. 
11 | * 12 | * @author zido 13 | */ 14 | public class HelpDescriptor { 15 | private List helpActions; 16 | 17 | public HelpDescriptor(List action) { 18 | this.helpActions = action; 19 | } 20 | 21 | /** 22 | * Filter help descriptor. 23 | * 24 | * @param regex the regex express 25 | * @return the help descriptor 26 | */ 27 | public HelpDescriptor regex(String regex) { 28 | Action action = new Action(); 29 | action.setToken(E.Action.LINK_SELECTOR); 30 | action.setExtras(new Object[]{regex}); 31 | action.setSource(Source.HTML); 32 | helpActions.add(action); 33 | return this; 34 | } 35 | 36 | public HelpDescriptor and() { 37 | if (helpActions.isEmpty()) { 38 | return this; 39 | } 40 | List children = new LinkedList<>(); 41 | Action action = helpActions.get(helpActions.size() - 1); 42 | action.setChildren(children); 43 | return new HelpDescriptor(children); 44 | } 45 | 46 | public HelpDescriptor or() { 47 | return this; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/api/PartitionDescriptor.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.api; 2 | 3 | import site.zido.elise.task.model.ModelField; 4 | import site.zido.elise.task.model.Partition; 5 | 6 | import java.util.LinkedList; 7 | 8 | public class PartitionDescriptor { 9 | private Partition partition; 10 | 11 | public PartitionDescriptor(Partition partition) { 12 | this.partition = partition; 13 | } 14 | 15 | public ElementSelectable field() { 16 | if (partition.getFields() == null) { 17 | partition.setFields(new LinkedList<>()); 18 | } 19 | ModelField field = new ModelField(); 20 | field.setActions(new LinkedList<>()); 21 | partition.getFields().add(field); 22 | return new ElementSelectable(Source.PARTITION, field, field.getActions()); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- 
/**
 * Unchecked exception with a fixed message about repeating a match after the
 * target has been built.
 */
public class RepeatMatchException extends RuntimeException {
    /** The only message this exception is ever created with. */
    private static final String MESSAGE =
            "Cannot repeat matches when the target has been built,you may need call the [and] function";

    /**
     * Creates the exception with its fixed message and no cause.
     */
    public RepeatMatchException() {
        super(MESSAGE);
    }
}
/**
 * Extraction scope: the names used as the source of an action.
 *
 * @author zido
 */
public class Source {
    public static final String URL = "url";
    public static final String CODE = "code";
    public static final String BODY = "body";
    public static final String HTML = "html";
    public static final String TEXT = "text";
    public static final String PARTITION = "partition";

    private Source() {
    }

    /**
     * Checks, ignoring case, whether {@code target} equals any of the candidates.
     *
     * @param target the source name to look for; {@code null} or empty never matches
     * @param source the candidate names; a {@code null} array, an empty array and
     *               {@code null} elements never match
     * @return {@code true} if at least one candidate equals {@code target} ignoring case
     */
    public static boolean matchSource(String target, String... source) {
        if (source == null || source.length == 0) {
            return false;
        }
        if (target == null || target.isEmpty()) {
            return false;
        }
        for (String candidate : source) {
            // Invoked on the non-null target so that a null candidate is a plain
            // non-match instead of a NullPointerException.
            if (target.equalsIgnoreCase(candidate)) {
                return true;
            }
        }
        return false;
    }
}
23 | * 24 | * @return the target descriptor 25 | */ 26 | public TargetDescriptor matchUrl(String regex) { 27 | Action action = new Action(); 28 | action.setToken(E.Action.MATCH_LINK); 29 | action.setExtras(new Object[]{regex}); 30 | action.setSource(Source.URL); 31 | targetActions.add(action); 32 | return this; 33 | } 34 | 35 | /** 36 | * Status code target descriptor. 37 | * 38 | * @param numberMatchExpress the matcher 39 | * @return the target descriptor 40 | */ 41 | public TargetDescriptor matchStatusCode(String numberMatchExpress) { 42 | Action action = new Action(); 43 | action.setToken(E.Action.MATCH_NUMBER); 44 | action.setExtras(new Object[]{numberMatchExpress}); 45 | action.setSource(Source.CODE); 46 | targetActions.add(action); 47 | return this; 48 | } 49 | 50 | public TargetDescriptor and() { 51 | if (targetActions.isEmpty()) { 52 | return this; 53 | } 54 | List children = new LinkedList<>(); 55 | Action action = targetActions.get(targetActions.size() - 1); 56 | action.setChildren(children); 57 | return new TargetDescriptor(children); 58 | } 59 | 60 | public TargetDescriptor or() { 61 | return this; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/api/Value.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.api; 2 | 3 | import site.zido.elise.task.model.ModelField; 4 | 5 | /** 6 | * The interface Value. 7 | * 8 | * @author zido 9 | */ 10 | public class Value { 11 | private ModelField field; 12 | 13 | public Value(ModelField field) { 14 | this.field = field; 15 | } 16 | 17 | /** 18 | * Save value. 19 | * 20 | * @param name the name 21 | * @return the value 22 | */ 23 | public Value save(String name) { 24 | field.setName(name); 25 | return this; 26 | } 27 | 28 | /** 29 | * Nullable value. 
30 | * 31 | * @param nullable the nullable 32 | * @return the value 33 | */ 34 | public Value nullable(boolean nullable) { 35 | field.setNullable(nullable); 36 | return this; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/model/Action.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.model; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | import java.util.Objects; 6 | 7 | public class Action { 8 | private String token; 9 | private Object[] extras; 10 | private String source; 11 | private List children; 12 | 13 | public String getToken() { 14 | return token; 15 | } 16 | 17 | public void setToken(String token) { 18 | this.token = token; 19 | } 20 | 21 | public Object[] getExtras() { 22 | return extras; 23 | } 24 | 25 | public void setExtras(Object[] extras) { 26 | this.extras = extras; 27 | } 28 | 29 | public List getChildren() { 30 | return children; 31 | } 32 | 33 | public void setChildren(List children) { 34 | this.children = children; 35 | } 36 | 37 | public String getSource() { 38 | return source; 39 | } 40 | 41 | public void setSource(String source) { 42 | this.source = source; 43 | } 44 | 45 | @Override 46 | public boolean equals(Object o) { 47 | if (this == o) return true; 48 | if (o == null || getClass() != o.getClass()) return false; 49 | Action action = (Action) o; 50 | return Objects.equals(token, action.token) && 51 | Arrays.equals(extras, action.extras) && 52 | Objects.equals(source, action.source) && 53 | Objects.equals(children, action.children); 54 | } 55 | 56 | @Override 57 | public int hashCode() { 58 | int result = Objects.hash(token, source, children); 59 | result = 31 * result + Arrays.hashCode(extras); 60 | return result; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- 
/Elise-core/src/main/java/site/zido/elise/task/model/Model.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.model; 2 | 3 | import java.util.List; 4 | import java.util.Objects; 5 | 6 | public final class Model { 7 | private String name; 8 | private List targets; 9 | private List helpers; 10 | private Partition partition; 11 | private List fields; 12 | 13 | public String getName() { 14 | return name; 15 | } 16 | 17 | public void setName(String name) { 18 | this.name = name; 19 | } 20 | 21 | public List getTargets() { 22 | return targets; 23 | } 24 | 25 | public void setTargets(List targets) { 26 | this.targets = targets; 27 | } 28 | 29 | public List getHelpers() { 30 | return helpers; 31 | } 32 | 33 | public void setHelpers(List helpers) { 34 | this.helpers = helpers; 35 | } 36 | 37 | public Partition getPartition() { 38 | return partition; 39 | } 40 | 41 | public void setPartition(Partition partition) { 42 | this.partition = partition; 43 | } 44 | 45 | public List getFields() { 46 | return fields; 47 | } 48 | 49 | public void setFields(List fields) { 50 | this.fields = fields; 51 | } 52 | 53 | @Override 54 | public boolean equals(Object o) { 55 | if (this == o) { 56 | return true; 57 | } 58 | if (o == null || getClass() != o.getClass()) { 59 | return false; 60 | } 61 | Model model = (Model) o; 62 | return Objects.equals(name, model.name) && 63 | Objects.equals(targets, model.targets) && 64 | Objects.equals(helpers, model.helpers) && 65 | Objects.equals(partition, model.partition) && 66 | Objects.equals(fields, model.fields); 67 | } 68 | 69 | @Override 70 | public int hashCode() { 71 | return Objects.hash(name, targets, helpers, partition, fields); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/model/ModelField.java: -------------------------------------------------------------------------------- 1 
| package site.zido.elise.task.model; 2 | 3 | import site.zido.elise.select.FieldType; 4 | 5 | import java.util.List; 6 | import java.util.Objects; 7 | 8 | /** 9 | * The Model field. 10 | * 11 | * @author zido 12 | */ 13 | public final class ModelField { 14 | /** 15 | * field name 16 | */ 17 | private String name; 18 | 19 | private boolean nullable; 20 | 21 | private FieldType valueType; 22 | 23 | private List actions; 24 | 25 | public String getName() { 26 | return name; 27 | } 28 | 29 | public void setName(String name) { 30 | this.name = name; 31 | } 32 | 33 | public boolean isNullable() { 34 | return nullable; 35 | } 36 | 37 | public void setNullable(boolean nullable) { 38 | this.nullable = nullable; 39 | } 40 | 41 | public List getActions() { 42 | return actions; 43 | } 44 | 45 | public void setActions(List actions) { 46 | this.actions = actions; 47 | } 48 | 49 | public FieldType getValueType() { 50 | return valueType; 51 | } 52 | 53 | public void setValueType(FieldType valueType) { 54 | this.valueType = valueType; 55 | } 56 | 57 | @Override 58 | public boolean equals(Object o) { 59 | if (this == o) return true; 60 | if (o == null || getClass() != o.getClass()) return false; 61 | ModelField that = (ModelField) o; 62 | return nullable == that.nullable && 63 | Objects.equals(name, that.name) && 64 | valueType == that.valueType && 65 | Objects.equals(actions, that.actions); 66 | } 67 | 68 | @Override 69 | public int hashCode() { 70 | return Objects.hash(name, nullable, valueType, actions); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/task/model/Partition.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.task.model; 2 | 3 | import java.util.List; 4 | import java.util.Objects; 5 | 6 | public final class Partition { 7 | private Action action; 8 | private List fields; 9 | 10 | public Action getAction() { 11 | 
return action; 12 | } 13 | 14 | public void setAction(Action action) { 15 | this.action = action; 16 | } 17 | 18 | public List getFields() { 19 | return fields; 20 | } 21 | 22 | public void setFields(List fields) { 23 | this.fields = fields; 24 | } 25 | 26 | @Override 27 | public boolean equals(Object o) { 28 | if (this == o) return true; 29 | if (o == null || getClass() != o.getClass()) return false; 30 | Partition partition = (Partition) o; 31 | return Objects.equals(action, partition.action) && 32 | Objects.equals(fields, partition.fields); 33 | } 34 | 35 | @Override 36 | public int hashCode() { 37 | return Objects.hash(action, fields); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/utils/ActionUtils.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.utils; 2 | 3 | import site.zido.elise.task.model.Action; 4 | 5 | public class ActionUtils { 6 | public static void traversing(Action action) { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/utils/Asserts.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.utils; 2 | 3 | import java.util.Collection; 4 | 5 | /** 6 | * Assertion tool. 7 | * 8 | * @author zido 9 | */ 10 | public class Asserts { 11 | /** 12 | * Not null. 13 | * 14 | * @param obj the obj 15 | * @param message the message 16 | */ 17 | public static void notNull(Object obj, String message) { 18 | if (obj == null) { 19 | throw new IllegalArgumentException(message); 20 | } 21 | } 22 | 23 | /** 24 | * Not null. 25 | * 26 | * @param obj the obj 27 | */ 28 | public static void notNull(Object obj) { 29 | notNull(obj, "[Assertion failed] - the object argument must not be null"); 30 | } 31 | 32 | /** 33 | * Is null. 
34 | * 35 | * @param obj the obj 36 | * @param message the message 37 | */ 38 | public static void isNull(Object obj, String message) { 39 | if (obj != null) { 40 | throw new IllegalArgumentException(message); 41 | } 42 | } 43 | 44 | /** 45 | * Is null. 46 | * 47 | * @param obj the obj 48 | */ 49 | public static void isNull(Object obj) { 50 | isNull(obj, "[Assertion failed] - the object argument must be null"); 51 | } 52 | 53 | /** 54 | * Has length. 55 | * 56 | * @param text the text 57 | * @param message the message 58 | */ 59 | public static void hasLength(String text, String message) { 60 | if (!StringUtils.hasLength(text)) { 61 | throw new IllegalArgumentException(message); 62 | } 63 | } 64 | 65 | /** 66 | * Has length. 67 | * 68 | * @param text the text 69 | */ 70 | public static void hasLength(String text) { 71 | hasLength(text, "[Assertion failed] - this String argument must have length; it must not be null or empty"); 72 | } 73 | 74 | /** 75 | * Not empty. 76 | * 77 | * @param collection the collection 78 | */ 79 | public static void notEmpty(Collection collection) { 80 | notEmpty(collection, "[Assertion failed] - this collection argument must be not null or empty"); 81 | } 82 | 83 | /** 84 | * Not empty. 85 | * 86 | * @param collection the collection 87 | * @param message the message 88 | */ 89 | public static void notEmpty(Collection collection, String message) { 90 | if (ValidateUtils.isEmpty(collection)) { 91 | throw new IllegalArgumentException(message); 92 | } 93 | } 94 | 95 | /** 96 | * Not empty. 97 | * 98 | * @param array the array 99 | * @param message the message 100 | */ 101 | public static void notEmpty(Object[] array, String message) { 102 | if (ValidateUtils.isEmpty(array)) { 103 | throw new IllegalArgumentException(message); 104 | } 105 | } 106 | 107 | /** 108 | * Not empty. 
109 | * 110 | * @param array the array 111 | */ 112 | public static void notEmpty(Object[] array) { 113 | notEmpty(array, "[Assertion failed] - this array must not be empty: it must contain at least 1 element"); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/utils/EventUtils.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.utils; 2 | 3 | 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import java.util.EventListener; 8 | import java.util.Set; 9 | import java.util.function.Consumer; 10 | 11 | /** 12 | * Event utils. 13 | * 14 | * @author zido 15 | */ 16 | public class EventUtils { 17 | private final static Logger LOGGER = LoggerFactory.getLogger(EventUtils.class); 18 | 19 | private EventUtils() { 20 | } 21 | 22 | /** 23 | * Must notify listeners. 24 | * 25 | * @param the type parameter 26 | * @param listeners the listeners 27 | * @param callback the callback 28 | */ 29 | public static void mustNotifyListeners(Set listeners, Consumer callback) { 30 | for (T listener : listeners) { 31 | try { 32 | callback.accept(listener); 33 | } catch (Throwable e) { 34 | LOGGER.error("listener callback error", e); 35 | } 36 | } 37 | } 38 | 39 | /** 40 | * Notify listeners. 
41 | * 42 | * @param the type parameter 43 | * @param listeners the listeners 44 | * @param callback the callback 45 | */ 46 | public static void notifyListeners(Set listeners, Consumer callback) { 47 | for (T listener : listeners) { 48 | callback.accept(listener); 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/utils/HtmlUtils.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.utils; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | 8 | import java.nio.charset.Charset; 9 | import java.util.regex.Matcher; 10 | import java.util.regex.Pattern; 11 | 12 | /** 13 | * html utils 14 | * 15 | * @author zido 16 | */ 17 | public class HtmlUtils { 18 | /** 19 | * the meta charset pattern 20 | */ 21 | private static final Pattern META_CHARSET_PATTERN = Pattern.compile("<[mM][eE][tT][aA][^>]*([cC][Hh][Aa][Rr][Ss][Ee][Tt][\\s]*=[\\s\\\"']*)([\\w\\d-_]*)[^>]*>"); 22 | /** 23 | * the key words of head 24 | */ 25 | private static final char[] KEYWORDS = {'h', 'e', 'a', 'd', '>'}; 26 | 27 | /** 28 | * get the charset from html 29 | * 30 | * @param htmlContent html bytes 31 | * @return charset name 32 | */ 33 | public static String getHtmlCharset(byte[] htmlContent) { 34 | String charset = Charset.defaultCharset().name(); 35 | return getHtmlCharset(htmlContent, charset); 36 | } 37 | 38 | /** 39 | * get the charset from html 40 | * 41 | * @param htmlContent html bytes 42 | * @param defaultCharset default charset 43 | * @return charset name 44 | */ 45 | public static String getHtmlCharset(byte[] htmlContent, String defaultCharset) { 46 | String content = new String(htmlContent); 47 | Matcher matcher = META_CHARSET_PATTERN.matcher(content); 48 | if (matcher.find()) { 49 | return matcher.group(2); 50 | } else { 51 | String headStr 
= getHeadStr(content); 52 | if (headStr == null) { 53 | return defaultCharset; 54 | } 55 | Document head = Jsoup.parse(headStr); 56 | Elements links = head.select("meta"); 57 | for (Element link : links) { 58 | String metaContent = link.attr("content"); 59 | if (metaContent.contains("charset")) { 60 | metaContent = metaContent.substring(metaContent.indexOf("charset")); 61 | return metaContent.split("=")[1]; 62 | } 63 | } 64 | } 65 | return StringUtils.getEncode(htmlContent, defaultCharset); 66 | } 67 | 68 | /** 69 | * get the head string from html 70 | * 71 | * @param html the html str that contains the head tag 72 | * @return head string 73 | */ 74 | public static String getHeadStr(String html) { 75 | char[] chars = html.toCharArray(); 76 | boolean isStartTag = true; 77 | int startIndex = -1, endIndex = -1; 78 | for (int i = 0; i < chars.length; i++) { 79 | if (chars[i] == '<' && i < chars.length - 1) { 80 | if (isStartTag) { 81 | int keyIndex = 0; 82 | for (int j = i + 1; j < chars.length; j++) { 83 | if (keyIndex == 0 && chars[j] == ' ') { 84 | continue; 85 | } else if (keyIndex == 4 && chars[j] == ' ') { 86 | continue; 87 | } 88 | if (chars[j] == KEYWORDS[keyIndex]) { 89 | keyIndex++; 90 | if (keyIndex == 5) { 91 | isStartTag = false; 92 | startIndex = i; 93 | break; 94 | } 95 | } else { 96 | break; 97 | } 98 | } 99 | } else if (chars[i + 1] == '/') { 100 | int keyIndex = 0; 101 | for (int j = i + 2; j < chars.length; j++) { 102 | if (keyIndex == 0 && chars[j] == ' ') { 103 | continue; 104 | } else if ((keyIndex == 4) && chars[j] == ' ') { 105 | continue; 106 | } 107 | if (chars[j] == KEYWORDS[keyIndex]) { 108 | keyIndex++; 109 | if (keyIndex == 5) { 110 | endIndex = j + 1; 111 | break; 112 | } 113 | } 114 | } 115 | } 116 | } 117 | } 118 | if (startIndex != -1 && endIndex != -1) { 119 | return html.substring(startIndex, endIndex); 120 | } 121 | return null; 122 | } 123 | } -------------------------------------------------------------------------------- 
/Elise-core/src/main/java/site/zido/elise/utils/ModuleNamedDefaultThreadFactory.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.utils; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.util.concurrent.ThreadFactory; 7 | import java.util.concurrent.atomic.AtomicInteger; 8 | 9 | /** 10 | * the default thread factory for elise.It can receive the module name to form a more appropriate thread name. 11 | * 12 | * @author zido 13 | */ 14 | public class ModuleNamedDefaultThreadFactory implements ThreadFactory { 15 | 16 | private static final Logger LOGGER = LoggerFactory.getLogger(ModuleNamedDefaultThreadFactory.class); 17 | private static final AtomicInteger POOL_NUMBER = new AtomicInteger(1); 18 | private final ThreadGroup group; 19 | private final String prefix; 20 | private volatile int threadNumber = 1; 21 | private boolean daemon; 22 | private String moduleName; 23 | 24 | /** 25 | * Instantiates a new Module named default thread factory. 26 | * 27 | * @param moduleName the module name 28 | */ 29 | public ModuleNamedDefaultThreadFactory(String moduleName) { 30 | this(moduleName, false); 31 | } 32 | 33 | /** 34 | * Instantiates a new Module named default thread factory. 35 | * 36 | * @param moduleName the module name 37 | * @param daemon the daemon 38 | */ 39 | public ModuleNamedDefaultThreadFactory(String moduleName, boolean daemon) { 40 | SecurityManager s = System.getSecurityManager(); 41 | group = (s != null) ? 
/**
 * Array accessors that return a fallback value instead of throwing when the
 * requested element is missing or has an unexpected type.
 */
public class Safe {
    private Safe() {
        throw new AssertionError("No Safe instances for you!");
    }

    /**
     * Reads an element without throwing.
     *
     * @param objs  the array; may be {@code null}
     * @param index the position to read
     * @return the element, or {@code null} when the array is {@code null} or the
     * index is outside {@code [0, objs.length)}
     */
    public static Object getFromArray(Object[] objs, int index) {
        // Negative indexes and a null array are out of range too; previously only
        // the upper bound was checked, so they threw.
        if (objs == null || index < 0 || index >= objs.length) {
            return null;
        }
        return objs[index];
    }

    /**
     * Reads an {@link Integer} element, falling back to {@code 0}.
     *
     * @param objs  the array
     * @param index the position to read
     * @return the int value, or {@code 0} when missing or not an {@code Integer}
     */
    public static int getIntFromArray(Object[] objs, int index) {
        return getIntFromArray(objs, index, 0);
    }

    /**
     * Reads an {@link Integer} element.
     *
     * @param objs         the array
     * @param index        the position to read
     * @param defaultValue the fallback
     * @return the int value, or {@code defaultValue} when missing or not an {@code Integer}
     */
    public static int getIntFromArray(Object[] objs, int index, int defaultValue) {
        Object obj = getFromArray(objs, index);
        if (obj instanceof Integer) {
            return (int) obj;
        }
        return defaultValue;
    }

    /**
     * Reads a {@link String} element, falling back to the empty string.
     *
     * @param objs  the array
     * @param index the position to read
     * @return the string, or {@code ""} when missing or not a {@code String}
     */
    public static String getStrFromArray(Object[] objs, int index) {
        return getStrFromArray(objs, index, "");
    }

    /**
     * Reads a {@link String} element.
     *
     * @param objs         the array
     * @param index        the position to read
     * @param defaultValue the fallback
     * @return the string, or {@code defaultValue} when missing or not a {@code String}
     */
    public static String getStrFromArray(Object[] objs, int index, String defaultValue) {
        Object obj = getFromArray(objs, index);
        if (obj instanceof String) {
            return (String) obj;
        }
        return defaultValue;
    }

    /**
     * Reads a {@link Character} element.
     *
     * @param objs         the array
     * @param index        the position to read
     * @param defaultValue the fallback
     * @return the char, or {@code defaultValue} when missing or not a {@code Character}
     */
    public static char getCharFromArray(Object[] objs, int index, char defaultValue) {
        Object obj = getFromArray(objs, index);
        if (obj instanceof Character) {
            return (char) obj;
        }
        return defaultValue;
    }
}
33 | * 34 | * @param bytes the bytes 35 | * @param defaultCharset the default charset 36 | * @return the encode 37 | */ 38 | public static String getEncode(byte[] bytes, String defaultCharset) { 39 | String code; 40 | if (bytes == null || bytes.length < 2) { 41 | return defaultCharset; 42 | } 43 | int p = ((int) bytes[0] & 0x00ff) << 8 | ((int) bytes[1] & 0x00ff); 44 | switch (p) { 45 | case 0xefbb: 46 | code = "UTF-8"; 47 | break; 48 | case 0xfffe: 49 | code = "Unicode"; 50 | break; 51 | case 0xfeff: 52 | code = "UTF-16BE"; 53 | break; 54 | default: 55 | code = defaultCharset; 56 | } 57 | return code; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /Elise-core/src/main/java/site/zido/elise/utils/SystemClock.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.utils; 2 | 3 | import java.util.concurrent.ScheduledExecutorService; 4 | import java.util.concurrent.ScheduledThreadPoolExecutor; 5 | import java.util.concurrent.TimeUnit; 6 | import java.util.concurrent.atomic.AtomicLong; 7 | 8 | /** 9 | * High concurrency performance significantly improves performance (space Exchange time) through memory cache current time. 10 | * 11 | * @author zido 12 | */ 13 | public class SystemClock { 14 | private final long period; 15 | private final AtomicLong now; 16 | 17 | private SystemClock(long period) { 18 | this.period = period; 19 | this.now = new AtomicLong(System.currentTimeMillis()); 20 | scheduleClockUpdating(); 21 | } 22 | 23 | private static SystemClock instance() { 24 | return InstanceHolder.INSTANCE; 25 | } 26 | 27 | /** 28 | * Now long. 
/**
 * Null-safe emptiness checks for arrays, collections, maps, strings and boxed
 * values, plus a small suffix-matching helper.
 *
 * @author zido
 * @since 2017/5/25
 */
public class ValidateUtils {

    /**
     * Tells whether an object array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(Object[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a long array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(long[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether an int array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(int[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a short array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(short[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a char array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(char[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a byte array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(byte[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a double array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(double[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a float array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(float[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a boolean array is {@code null} or has no elements.
     *
     * @param array the array, may be {@code null}
     * @return {@code true} when the array is {@code null} or of length zero
     */
    public static boolean isEmpty(boolean[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a collection is {@code null} or contains no elements.
     *
     * @param list the collection, may be {@code null}
     * @return {@code true} when the collection is {@code null} or empty
     */
    public static boolean isEmpty(Collection<?> list) {
        return list == null || list.isEmpty();
    }

    /**
     * Negation of {@link #isEmpty(Collection)}.
     *
     * @param list the collection, may be {@code null}
     * @return {@code true} when the collection is non-null and has at least one element
     */
    public static boolean isNotEmpty(Collection<?> list) {
        return !isEmpty(list);
    }

    /**
     * Tells whether a string is {@code null} or has length zero.
     * Whitespace-only strings are NOT considered empty.
     *
     * @param s the string, may be {@code null}
     * @return {@code true} when the string is {@code null} or {@code ""}
     */
    public static boolean isEmpty(String s) {
        return s == null || s.isEmpty();
    }

    /**
     * Tells whether a boxed integer is absent.
     *
     * @param i the value, may be {@code null}
     * @return {@code true} only when {@code i} is {@code null}
     */
    public static boolean isEmpty(Integer i) {
        return i == null;
    }

    /**
     * Tells whether a boxed long is absent.
     *
     * @param l the value, may be {@code null}
     * @return {@code true} only when {@code l} is {@code null}
     */
    public static boolean isEmpty(Long l) {
        return l == null;
    }

    /**
     * Tells whether a boxed boolean is absent.
     *
     * @param b the value, may be {@code null}
     * @return {@code true} only when {@code b} is {@code null}
     */
    public static boolean isEmpty(Boolean b) {
        return b == null;
    }

    /**
     * Tells whether every given list is {@code null} or empty.
     *
     * @param lists the lists to inspect; the array itself and any element may be {@code null}
     * @return {@code false} as soon as one list has an element, {@code true} otherwise
     * (including when no lists are given)
     */
    public static boolean isAllEmpty(List<?>... lists) {
        // An explicit null varargs array is treated like "no lists at all".
        if (lists == null) {
            return true;
        }
        for (List<?> list : lists) {
            if (isNotEmpty(list)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether {@code target} ends with any of the allowed suffixes.
     *
     * @param target        the string whose suffix is checked, may be {@code null}
     * @param allowSuffixes the accepted suffixes; {@code null} entries are ignored
     * @return {@code true} when {@code target} is non-empty and ends with at least one suffix
     */
    public static boolean mapSuffix(String target, String... allowSuffixes) {
        if (isEmpty(target) || allowSuffixes == null) {
            return false;
        }
        for (String suffix : allowSuffixes) {
            // String.endsWith(null) throws, so skip null entries explicitly.
            if (suffix != null && target.endsWith(suffix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a map is {@code null} or contains no entries.
     *
     * @param cookies the map, may be {@code null}
     * @return {@code true} when the map is {@code null} or empty
     */
    public static boolean isEmpty(Map<?, ?> cookies) {
        return cookies == null || cookies.isEmpty();
    }
}
StringBuilder(); 31 | String line; 32 | while ((line = reader.readLine()) != null) { 33 | builder.append(line.trim()); 34 | } 35 | //格式化一次 36 | Assert.assertEquals(mapper.writeValueAsString(mapper.readTree(builder.toString())), mapper.writeValueAsString(model)); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /Elise-core/src/test/java/site/zido/elise/utils/HtmlUtilsTest.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.utils; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.io.UnsupportedEncodingException; 7 | 8 | public class HtmlUtilsTest { 9 | private static final String[] charsets = {"UTF-8", "GB2312", "GBK"}; 10 | 11 | @Test 12 | public void testGetCharset() throws UnsupportedEncodingException { 13 | for (String currentCharset : charsets) { 14 | String html = ""; 15 | String charset = HtmlUtils.getHtmlCharset(html.getBytes(currentCharset)); 16 | Assert.assertEquals(currentCharset, charset); 17 | } 18 | } 19 | 20 | @Test 21 | public void testGetHtmlHead() { 22 | String head = " adwadefe"; 23 | String html = "" + head + " dwfrewg"; 24 | String headStr = HtmlUtils.getHeadStr(html); 25 | Assert.assertEquals(head, headStr); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Elise-core/src/test/resources/db.properties: -------------------------------------------------------------------------------- 1 | driver=com.mysql.jdbc.Driver 2 | url=jdbc:mysql://127.0.0.1:3306/test?useSSL=false 3 | username=root 4 | password=123456 -------------------------------------------------------------------------------- /Elise-core/src/test/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%t] [%level] %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 
-------------------------------------------------------------------------------- /Elise-core/src/test/resources/task/api/model1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test_model", 3 | "targets": [ 4 | { 5 | "token": "match_number", 6 | "extras": [ 7 | "200<300" 8 | ], 9 | "source": "code", 10 | "children": [ 11 | { 12 | "token": "match_link", 13 | "extras": [ 14 | "http://xxx.yyy" 15 | ], 16 | "source": "url", 17 | "children": null 18 | } 19 | ] 20 | }, 21 | { 22 | "token": "match_link", 23 | "extras": [ 24 | "http://aaa.bbb" 25 | ], 26 | "source": "url", 27 | "children": null 28 | } 29 | ], 30 | "helpers": [ 31 | { 32 | "token": "select_link", 33 | "extras": [ 34 | "ddd$" 35 | ], 36 | "source": "html", 37 | "children": [ 38 | { 39 | "token": "select_link", 40 | "extras": [ 41 | "^aaa" 42 | ], 43 | "source": "html", 44 | "children": null 45 | }, 46 | { 47 | "token": "select_link", 48 | "extras": [ 49 | "^ccc" 50 | ], 51 | "source": "html", 52 | "children": null 53 | } 54 | ] 55 | } 56 | ], 57 | "partition": { 58 | "action": { 59 | "token": "xpath_selector", 60 | "extras": [ 61 | "//div[@class='profile']" 62 | ], 63 | "source": "html", 64 | "children": null 65 | }, 66 | "fields": [ 67 | { 68 | "name": "content", 69 | "nullable": true, 70 | "valueType": "RICH", 71 | "actions": [ 72 | { 73 | "token": "css_selector", 74 | "extras": [ 75 | ".text" 76 | ], 77 | "source": "partition", 78 | "children": null 79 | } 80 | ] 81 | }, 82 | { 83 | "name": "description", 84 | "nullable": false, 85 | "valueType": "TEXT", 86 | "actions": [ 87 | { 88 | "token": "xpath_selector", 89 | "extras": [ 90 | ".description" 91 | ], 92 | "source": "partition", 93 | "children": null 94 | } 95 | ] 96 | } 97 | ] 98 | }, 99 | "fields": [ 100 | { 101 | "name": "code", 102 | "nullable": false, 103 | "valueType": "NUMBER", 104 | "actions": [ 105 | { 106 | "token": "select_origin", 107 | "extras": null, 108 | "source": "code", 109 | 
"children": null 110 | } 111 | ] 112 | }, 113 | { 114 | "name": "url", 115 | "nullable": false, 116 | "valueType": "TEXT", 117 | "actions": [ 118 | { 119 | "token": "select_origin", 120 | "extras": null, 121 | "source": "url", 122 | "children": null 123 | } 124 | ] 125 | }, 126 | { 127 | "name": "author", 128 | "nullable": false, 129 | "valueType": "TEXT", 130 | "actions": [ 131 | { 132 | "token": "css_selector", 133 | "extras": [ 134 | ".author" 135 | ], 136 | "source": "html", 137 | "children": null 138 | } 139 | ] 140 | } 141 | ] 142 | } 143 | -------------------------------------------------------------------------------- /Elise-distributed/build.gradle: -------------------------------------------------------------------------------- 1 | description = 'Elise-distributed' 2 | dependencies { 3 | compile group: 'org.slf4j', name: 'slf4j-api', version: '1.7.25' 4 | compile project(':Elise-core') 5 | testCompile group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.11.0' 6 | testCompile group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.11.0' 7 | testCompile group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.11.0' 8 | testCompile group: 'junit', name: 'junit', version: '4.12' 9 | } 10 | -------------------------------------------------------------------------------- /Elise-distributed/src/main/java/site/zido/elise/distributed/AbstractQueueScheduler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.distributed; 2 | 3 | import site.zido.elise.scheduler.AbstractScheduler; 4 | import site.zido.elise.scheduler.Seed; 5 | 6 | /** 7 | * 使用消息队列实现的分布式任务调度器基类 8 | * 9 | * @author zido 10 | */ 11 | public abstract class AbstractQueueScheduler extends AbstractScheduler implements Runnable { 12 | @Override 13 | public void run() { 14 | 15 | } 16 | 17 | protected abstract Seed readSeedFromQueue(); 18 | } 19 | 
-------------------------------------------------------------------------------- /Elise-distributed/src/test/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%t] [%level] %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Elise-jedis-support/build.gradle: -------------------------------------------------------------------------------- 1 | description = '' 2 | -------------------------------------------------------------------------------- /Elise-kafka-support/build.gradle: -------------------------------------------------------------------------------- 1 | description = '' 2 | dependencies { 3 | compile group: 'org.springframework.kafka', name: 'spring-kafka', version: '2.1.4.RELEASE' 4 | compile project(':Elise-distributed') 5 | compile group: 'com.fasterxml.jackson.core', name: 'jackson-core', version: '2.9.8' 6 | compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.9.10.4' 7 | } 8 | -------------------------------------------------------------------------------- /Elise-kafka-support/src/main/java/site/zido/elise/kafka/pojo/Seed.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.kafka.pojo; 2 | 3 | import site.zido.elise.http.Request; 4 | import site.zido.elise.http.Response; 5 | import site.zido.elise.http.impl.DefaultResponse; 6 | import site.zido.elise.task.DefaultTask; 7 | 8 | /** 9 | * 种子,信息载体 10 | * 11 | * @author zido 12 | */ 13 | public class Seed { 14 | private DefaultTask task; 15 | private Request request; 16 | private Response response; 17 | 18 | public Seed(DefaultTask task, Request request, Response response) { 19 | this.task = task; 20 | this.request = request; 21 | this.response = response; 22 | } 23 | 24 | public Seed() { 25 | 26 | } 27 | 28 
| public DefaultTask getTask() { 29 | return task; 30 | } 31 | 32 | public Seed setTask(DefaultTask task) { 33 | this.task = task; 34 | return this; 35 | } 36 | 37 | public Request getRequest() { 38 | return request; 39 | } 40 | 41 | public Seed setRequest(Request request) { 42 | this.request = request; 43 | return this; 44 | } 45 | 46 | public Response getResponse() { 47 | return response; 48 | } 49 | 50 | public Seed setResponse(DefaultResponse response) { 51 | this.response = response; 52 | return this; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /Elise-kafka-support/src/main/resources/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Programming-With-Love/Elise/18fb37a465ffa5bf97b25a891bf4fac9749ec909/Elise-kafka-support/src/main/resources/.gitkeep -------------------------------------------------------------------------------- /Elise-kafka-support/src/test/java/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Programming-With-Love/Elise/18fb37a465ffa5bf97b25a891bf4fac9749ec909/Elise-kafka-support/src/test/java/.gitkeep -------------------------------------------------------------------------------- /Elise-redis-support/build.gradle: -------------------------------------------------------------------------------- 1 | description = '' 2 | dependencies { 3 | compile group: 'io.lettuce', name: 'lettuce-core', version: '5.1.3.RELEASE' 4 | compile project(':Elise-core') 5 | compile project(':Elise-distributed') 6 | } 7 | -------------------------------------------------------------------------------- /Elise-redis-support/src/main/java/site/zido/elise/support/redis/scheduler/RedisTaskScheduler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.support.redis.scheduler; 2 | 3 | import 
com.fasterxml.jackson.core.JsonProcessingException; 4 | import com.fasterxml.jackson.databind.ObjectMapper; 5 | import io.lettuce.core.RedisClient; 6 | import io.lettuce.core.api.StatefulRedisConnection; 7 | import io.lettuce.core.api.sync.RedisCommands; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | import site.zido.elise.distributed.AbstractQueueScheduler; 11 | import site.zido.elise.http.Request; 12 | import site.zido.elise.scheduler.Seed; 13 | import site.zido.elise.task.Task; 14 | 15 | import java.util.concurrent.atomic.AtomicBoolean; 16 | 17 | /** 18 | * 使用redis实现的任务调度器 19 | * 20 | * @author zido 21 | */ 22 | public class RedisTaskScheduler extends AbstractQueueScheduler { 23 | private static Logger LOGGER = LoggerFactory.getLogger(RedisTaskScheduler.class); 24 | private RedisClient redisClient; 25 | private StatefulRedisConnection connection; 26 | private String key = "elise:seed:queue"; 27 | private AtomicBoolean STATE = new AtomicBoolean(false); 28 | private ObjectMapper mapper; 29 | 30 | /** 31 | * redis url like "redis://password@localhost:6379/0" 32 | * 33 | * @param url redis server url 34 | */ 35 | public RedisTaskScheduler(String url) { 36 | redisClient = RedisClient.create(url); 37 | } 38 | 39 | @Override 40 | protected Seed readSeedFromQueue() { 41 | //TODO read from queue 42 | return null; 43 | } 44 | 45 | /** 46 | * Pre start. 
47 | */ 48 | private void preStart() { 49 | if (STATE.compareAndSet(false, true)) { 50 | connection = redisClient.connect(); 51 | if (mapper == null) { 52 | mapper = new ObjectMapper(); 53 | } 54 | } 55 | } 56 | 57 | @Override 58 | protected void pushWhenNoDuplicate(Task task, Request request) { 59 | preStart(); 60 | RedisCommands syncCommands = connection.sync(); 61 | try { 62 | String value = mapper.writeValueAsString(new Seed(task, request)); 63 | syncCommands.rpush(key, value); 64 | } catch (JsonProcessingException e) { 65 | LOGGER.error("serialize seed to json error", e); 66 | } 67 | } 68 | 69 | @Override 70 | public void cancel(boolean ifRunning) { 71 | connection.close(); 72 | redisClient.shutdown(); 73 | } 74 | 75 | public void setMapper(ObjectMapper mapper) { 76 | this.mapper = mapper; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /Elise-test-server/build.gradle: -------------------------------------------------------------------------------- 1 | description = '' 2 | dependencies { 3 | compile group: 'junit', name: 'junit', version: '4.12' 4 | compile group: 'org.slf4j', name: 'slf4j-api', version: '1.7.25' 5 | compile group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.11.0' 6 | compile group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.11.0' 7 | compile group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.11.0' 8 | } 9 | -------------------------------------------------------------------------------- /Elise-test-server/src/main/java/site/zido/elise/test/Server.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.test; 2 | 3 | import com.sun.net.httpserver.HttpServer; 4 | import com.sun.net.httpserver.spi.HttpServerProvider; 5 | import site.zido.elise.test.handlers.OneHandler; 6 | 7 | import java.io.IOException; 8 | import java.net.InetSocketAddress; 9 | 10 | public class Server { 11 | public 
static final String ONE_PATH; 12 | public static final String MULTI_PATH_ENTRY; 13 | private static final int PORT = 8080; 14 | private static final String DOMAIN = "http://127.0.0.1"; 15 | 16 | static { 17 | ONE_PATH = wrapPath("/one"); 18 | MULTI_PATH_ENTRY = wrapPath("/multi/entry"); 19 | } 20 | 21 | private HttpServer httpServer; 22 | 23 | public static String wrapPath(String path) { 24 | return DOMAIN + ":" + PORT + path; 25 | } 26 | 27 | public static void main(String[] args) { 28 | new Server().start(); 29 | } 30 | 31 | public void start() { 32 | HttpServerProvider provider = HttpServerProvider.provider(); 33 | try { 34 | httpServer = provider.createHttpServer(new InetSocketAddress(8080), 100); 35 | } catch (IOException e) { 36 | throw new RuntimeException(e); 37 | } 38 | httpServer.createContext("/one", new OneHandler()); 39 | httpServer.createContext("/multi/entry", StaticHandler.render("/multi/entry.html")); 40 | httpServer.createContext("/multi/one", StaticHandler.render("/multi/one.html")); 41 | httpServer.createContext("/multi/two", StaticHandler.render("/multi/two.html")); 42 | httpServer.createContext("/multi/three", StaticHandler.render("/multi/three.html")); 43 | httpServer.start(); 44 | } 45 | 46 | public void stop() { 47 | httpServer.stop(0); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /Elise-test-server/src/main/java/site/zido/elise/test/StaticHandler.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.test; 2 | 3 | import com.sun.net.httpserver.HttpExchange; 4 | import com.sun.net.httpserver.HttpHandler; 5 | import site.zido.elise.test.utils.TemplateUtils; 6 | 7 | import java.io.File; 8 | import java.io.IOException; 9 | import java.nio.charset.StandardCharsets; 10 | 11 | public abstract class StaticHandler implements HttpHandler { 12 | public static HttpHandler render(String filename) { 13 | return new 
/**
 * Helper for opening classpath resources of the test server.
 */
public class ResourcesUtils {
    /**
     * Opens a classpath resource as a stream.
     *
     * <p>The resource is opened through the class loader rather than through
     * {@code new FileInputStream(url.getFile())}. The file-based approach fails
     * when the resource lives inside a jar or when its URL contains escaped
     * characters (for example spaces), and it throws a bare
     * {@link NullPointerException} when the resource does not exist.
     *
     * @param path the resource path, resolved by this class's class loader
     * @return an open stream; the caller is responsible for closing it
     * @throws RuntimeException wrapping a {@link FileNotFoundException} when the resource is missing
     */
    public static InputStream get(String path) {
        InputStream stream = ResourcesUtils.class.getClassLoader().getResourceAsStream(path);
        if (stream == null) {
            throw new RuntimeException(new FileNotFoundException("classpath resource not found: " + path));
        }
        return stream;
    }
}
-------------------------------------------------------------------------------- /Elise-test-server/src/main/java/site/zido/elise/test/utils/TemplateUtils.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.test.utils; 2 | 3 | import java.io.*; 4 | 5 | import static java.nio.charset.StandardCharsets.UTF_8; 6 | 7 | public class TemplateUtils { 8 | public static String createHtml(String filename, Object... params) { 9 | InputStream inputStream = ResourcesUtils.get("html" + File.separator + filename); 10 | BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, UTF_8)); 11 | String line; 12 | StringBuilder sb = new StringBuilder(); 13 | while (true) { 14 | try { 15 | if ((line = reader.readLine()) == null) { 16 | break; 17 | } 18 | sb.append(line); 19 | } catch (IOException e) { 20 | e.printStackTrace(); 21 | return ""; 22 | } 23 | } 24 | if (params.length == 0) { 25 | return sb.toString(); 26 | } 27 | return String.format(sb.toString(), params); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Elise-test-server/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%t] [%level] %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Elise-test-server/src/test/java/site/zido/elise/test/TestServer.java: -------------------------------------------------------------------------------- 1 | package site.zido.elise.test; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | import site.zido.elise.test.utils.TemplateUtils; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.InputStream; 9 | import java.io.InputStreamReader; 10 | import java.net.HttpURLConnection; 11 | import java.net.MalformedURLException; 12 | 
import java.net.URL; 13 | 14 | public class TestServer { 15 | @Test 16 | public void testStart() throws MalformedURLException { 17 | Server server = new Server(); 18 | server.start(); 19 | HttpURLConnection currentConnection = null; 20 | URL url = new URL("http://127.0.0.1:8080/one"); 21 | boolean success = false; 22 | try { 23 | currentConnection = (HttpURLConnection) url.openConnection(); 24 | currentConnection.setUseCaches(false); 25 | InputStream actualStream = currentConnection.getInputStream(); 26 | success = true; 27 | String html = TemplateUtils.createHtml("one.html"); 28 | BufferedReader reader = new BufferedReader(new InputStreamReader(actualStream)); 29 | StringBuilder sb = new StringBuilder(); 30 | String line; 31 | while ((line = reader.readLine()) != null) { 32 | sb.append(line); 33 | } 34 | Assert.assertEquals(html, sb.toString()); 35 | } catch (Exception ignore) { 36 | } finally { 37 | if (currentConnection != null) { 38 | currentConnection.disconnect(); 39 | } 40 | server.stop(); 41 | } 42 | Assert.assertTrue("server can't start", success); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 zido 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
// Root build script: shared coordinates, Java level, and publishing/signing
// configuration applied to every Elise sub-project.
allprojects {
    group = 'site.zido'
    version = '1.0.0-SNAPSHOT'
}
description = "Elise - 简单/强大/流畅的爬虫框架"

buildscript {
    repositories {
        maven { url "https://maven.aliyun.com/nexus/content/groups/public" }
        maven { url "https://maven.aliyun.com/nexus/content/repositories/gradle-plugin" }
        mavenCentral()
        maven { url "https://plugins.gradle.org/m2/" }
    }
}

ext {
    // Previously `javaVersion` only came into existence as a side effect of an
    // assignment nested inside the list literal; declare it explicitly and keep
    // `configuration` holding the same single-element list.
    javaVersion = JavaVersion.VERSION_1_8
    configuration = [javaVersion]
}

subprojects {
    apply plugin: 'java-library'
    apply plugin: 'maven-publish'
    apply plugin: 'signing'

    sourceCompatibility = "${javaVersion}"
    targetCompatibility = "${javaVersion}"

    tasks.withType(JavaCompile) {
        options.encoding = 'UTF-8'
        options.deprecation = true
        // Keep parameter names in class files.
        options.compilerArgs += ["-parameters"]
    }

    task sourcesJar(type: Jar) {
        from sourceSets.main.allJava
        archiveClassifier.set("sources")
    }

    task javadocJar(type: Jar) {
        from javadoc
        archiveClassifier.set("javadoc")
    }

    repositories {
        mavenLocal()
        maven { url "https://maven.aliyun.com/repository/public" }
        maven { url "https://oss.sonatype.org/content/repositories/snapshots/" }
        jcenter()
    }

    publishing {
        publications {
            mavenJava(MavenPublication) {
//                groupId = project.group
//                artifactId = project.name
//                version = project.version
                from components.java
                artifact sourcesJar
                artifact javadocJar
                pom {
                    name = 'Elise-builder'
                    description = 'A Simple Spider Framework'
                    url = 'https://elise.zido.site'
                    licenses {
                        license {
                            name = 'The MIT License (MIT)'
                            url = 'https://raw.githubusercontent.com/zidoshare/Elise/master/LICENSE'
                        }
                    }
                    developers {
                        developer {
                            name = 'zido'
                            email = 'wuhongxu1208@gmail.com'
                        }
                    }
                    scm {
                        url = 'https://github.com/zidoshare/Elise'
                        connection = 'scm:git:git://github.com/zidoshare/Elise.git'
                        developerConnection = 'scm:git:ssh://git@github.com:zidoshare/Elise.git'
                    }
                }
            }
        }
        repositories {
            maven {
                name "oss"
                def releasesRepoUrl = "https://oss.sonatype.org/service/local/staging/deploy/maven2"
                def snapshotsRepoUrl = "https://oss.sonatype.org/content/repositories/snapshots"
                // Snapshot versions go to the snapshot repository, everything else to staging.
                url = version.endsWith('SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl
                credentials {
                    // findProperty yields null instead of throwing when the Sonatype
                    // credentials are not defined, so contributors who never publish
                    // can still configure and build the project.
                    username = findProperty('sonatypeUsername')
                    password = findProperty('sonatypePassword')
                }
            }
        }
    }

    signing {
        sign publishing.publications.mavenJava
    }

    javadoc {
        description = "Generates project-level javadoc for use in -javadoc jar"

        options.memberLevel = JavadocMemberLevel.PROTECTED
        options.author = true
        options.version = true
        options.header = project.name
        // Silence doclint so incomplete javadoc does not fail the build.
        options.addStringOption('Xdoclint:none', '-quiet')

        logging.captureStandardError LogLevel.INFO
        logging.captureStandardOutput LogLevel.INFO
        options.encoding = "UTF-8"
        options.charSet = 'UTF-8'
    }
}
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at wuhongxu1208@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /docs/CODE_REQUIREMENTS.md: -------------------------------------------------------------------------------- 1 | # 关于代码 2 | 3 | 关于代码编写,有以下准则: 4 | 5 | ## 使用体验很重要,常规使用者绝不接触任何多余api 6 | 7 | 尽可能让api更流畅。 8 | 9 | Spider,Operator均使用接口的方式屏蔽多余的方法,仅留下用户使用的方法。 10 | 11 | 参考以下逻辑: 12 | 13 | * 构造Spider类:`Spider spider = SpiderBuilder.defaults();` 14 | 15 | * 用户仅需要关注构造成功的类所提供的方法,分别为: 16 | 17 | 爬虫启动类方法: 18 | 19 | `of(ModelExtractor extractor, Config config)`: 能够是用户明白此处需要抓取器和配置 20 | 21 | `of(ModelExtractor extractor)`: 此处不需要任何配置,只需要提供抓取器 22 | 23 | 爬虫操作类方法: 24 | 25 | `cancel(boolean ifRunning)`: 取消爬虫 26 | 27 | `boolean pause()`: 暂停爬虫 28 | 29 | `void recover()`: 恢复爬虫 30 | 31 | 爬虫事件监听类方法: 32 | 33 | `addEventListener(EventListener listener)`: 添加事件监听器 34 | 35 | `removeEventListener(EventListener listener)`: 删除事件监听器 36 | 37 | 实际类的构造由构造器或者实现类内部完成。 38 | 39 | * 用户调用Spider.of之后将返回一个Operator接口。他是一个针对任务的操作句柄,并且提供不一样的方法供用户选择调用 40 | 41 | 
执行类方法: 42 | 43 | `Operator execute(String... url)`: 添加url执行 44 | 45 | `Operator execute(Request request)`: 构建请求执行 46 | 47 | 任务操作类方法: 48 | 49 | `cancel(boolean ifRunning)`: 取消本任务 50 | 51 | `boolean pause()`: 暂停本任务 52 | 53 | `void recover()`: 恢复本任务 54 | 55 | 爬虫事件监听类方法: 56 | 57 | `addEventListener(EventListener listener)`: 添加事件监听器 58 | 59 | 额外提供一个阻塞直到任务完成的方法(有待商榷): 60 | `block() throws InterruptedException`:阻塞直到任务完成。 61 | 62 | 综合以后便得到如下的api使用场景: 63 | 64 | ```java 65 | //构建Spider实例,用户可通过此实例享有统一配置的爬虫 66 | SpiderBuilder.defaults() 67 | //构建目标爬虫的规则 68 | .of(ExtractorBuilder.create("login").build()) 69 | //添加爬虫入口 也可以直接使用execute(url),这里为了尽可能详细展示,使用了RequestBuilder来构建更具体的请求 70 | .execute(RequestBuilder.post("http://xxx").bodyForm("username=xxx&password=xxx").build()) 71 | //阻塞任务直到完成 72 | .block(); 73 | 74 | ``` 75 | 76 | ## 编写代码时必须考虑分布式下的场景 77 | 78 | 本框架在设计时必定遵循的规则是分布式下拥有统一的使用体验并且完全可以向各个方向衍生。 79 | 80 | 使用ModelExtractor的方式爬去暂时不说,因为是pojo传输可以任意扩展到redis,接下来演示对于将要实现的ResponseHandle api的示例 81 | 82 | 因暂未实现ResponseHandle api。仅给出以下构想: 83 | 84 | ```java 85 | //构建分布式爬虫实例 86 | DistributedSpiderBuilder.defaults() 87 | .of("taskName",new ResponseHandler(){ 88 | void onHandle(Response response){ 89 | //url必须匹配到目标表达式才能被确定为是需要采集的页面 90 | response.url().regex("https://a.b.c").asTarget() 91 | //url必须匹配到目标表达式才能被确定为是辅助采集的页面 92 | response.url().regex("https://a.b.c").asHelp() 93 | //从body中选择 94 | response.body() 95 | //使用xpath匹配 96 | .select(new XpathSelector("xxx")) 97 | //使用name作为属性名 98 | .as("name") 99 | //选中文字作为内容 100 | .text(); 101 | //从url中选择 102 | response.url() 103 | //使用regex匹配 104 | .select(new RegexSelector("xxx")) 105 | .nullable(false) 106 | //使用url作为属性名 107 | .as("url") 108 | //选中文字作为内容 109 | .text(); 110 | //从body中选择 111 | response.body() 112 | //使用xpath匹配 113 | .select(new XpathSelector("xxx")) 114 | //使用name作为属性名 115 | .as("description") 116 | //选中富文本作为内容 117 | .richText(); 118 | } 119 | }) 120 | //添加入口 121 | .execute("https://xxx"); 122 | ``` 123 | 124 | 
对于这部分的实现,是使用模拟真实情况的方式,预计的实现方式是:response其实是一个代理对象, 125 | 通过response的各种api最终会生成一个ModelExtractor包含在代理Response内部。然后接下来由框架内部获取这个modelExtractor并实现最终爬取,得到与既定单机实现一样的结果。 126 | 127 | ## 代码约定 128 | 129 | 尽可能参照`alibaba开发手册`进行。但是框架本身而言,存在更多的不确定性,所以暂时做出以下两个例外: 130 | 131 | * 抽象类不必使用Abstract/Base开头,尽可能的参考本抽象类的功能。例如在httpclient中出现的抽象类`CloseableHttpClient`。 132 | 133 | * 字符串不要求提升为`static final`。也可以直接出现代码中,`alibaba`开发手册称之为`魔法字符串`。但是仍然推荐提升。 134 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # 贡献说明 2 | 3 | > 注意:非常非常欢迎任何意见或者建议在[issues](https://github.com/zidoshare/Elise/issues)中提出,也可以[发送email给我](mailto:wuhongxu1208@gmail.com),谢谢 4 | 5 | * 开发路线参考 [ROADMAP.md](./ROADMAP.md)。 6 | 7 | ## 介绍 8 | 9 | 欢迎任何类型的贡献,而不仅仅是代码。你可以做出的贡献包括但不限于以下条目: 10 | 11 | * bug反馈:请尽可能提供详细的错误报告(例如:控制台输出、异常堆栈等信息) 12 | 13 | * 撰写文档:所有任何相关文档、博客等,甚至包括对于logo的提议以及制作 14 | 15 | * 交流:在讨论区(包括但不限于issues/论坛/email等)提出任何相关意见或建议 16 | 17 | * 代码:看看未解决的问题。即使你不能编写代码,对它们进行评论,表明你关心某个特定问题也很重要。这能帮助我们对它们进行分类。 18 | 19 | ## 说明、帮助 20 | 21 | 您可以从这个免费系列中了解[如何在GitHub上贡献一个开源项目](https://egghead.io/series/how-to-contribute-to-an-open-source-project-on-github)。 22 | 23 | ## 提交代码 24 | 25 | 任何代码更改都应作为pull request提交。 26 | 27 | 你需要描述解释代码的作用并给出执行它的步骤。 28 | 29 | 你提交的代码应当尽可能的包含测试。 30 | 31 | > 请务必注意,项目开发分支为master分支。进行开发时务必确保你的master分支为最新 32 | 33 | 仅接受github的[pull request](https://help.github.com/articles/about-pull-requests/)工作流程。 34 | 35 | 因作者本人能力水平原因,近期会尽可能的学习github开发指南。也希望能有人提出意见或建议。 36 | 37 | ## 代码审查 38 | 39 | 拉取请求越大,审核和合并所需的时间就越长。你可以从一个大的需求中提取尽可能少的内容来进行实现,这样更易于查看和合并。你还应该描述:你的目的是什么? 40 | 41 | ## FAQ 42 | 43 | 如果您有任何疑问,请创建一个[issue](https://github.com/zidoshare/Elise/issues/new)(提示:请先搜索看看其他人之前是否已经问过同一个问题!)。 44 | 45 | 您也可以通过[wuhongxu1208@gmail.com](mailto:wuhongxu1208@gmail.com)与我们联系。 46 | 47 | > 如果想要贡献代码,请继续阅读下一篇文档[关于代码](./CODE_REQUIREMENTS.md),如有问题欢迎讨论,谢谢!
-------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Elise 伊莉丝 爬虫框架 2 | 3 | ![elise](./elise.png) 4 | 5 | > 名字取材于游戏《英雄联盟》中的一名英雄-蜘蛛女皇。 6 | 7 | ## 简介 8 | 9 | Elise是一个开源的商业友好的java爬虫框架。他的目标是建立一个强大的可配置分布式爬虫框架,能够囊括绝大多数使用场景,通过大量的基础组件构建,任何人都能够构建一个复杂的傻瓜式爬虫平台 10 | 11 | Elise的优势: 12 | 13 | * 傻瓜式爬虫支持,语义化api,流畅/优雅 14 | 15 | * 多线程,速度快 16 | 17 | * 自动cookie和会话处理 18 | 19 | * 组件化,提供了大量基础组件,扩展性高 20 | 21 | * 完全支持分布式 22 | 23 | * 提供生命周期回调,任务完成,任务取消,下载成功,下载失败等等一系列回调支持 24 | 25 | * 支持任务暂停、取消、恢复等功能,并且配套提供了相应的生命周期回调方法 26 | 27 | * 对于爬取结果相较于纯文字或者纯html,增加一种富文本结果支持。富文本能将爬取结果段以文本(带链接文本)+图片+音频+视频等内容通过一个结果呈现,这对于开发者而言也许能更好的做内容爬取,而不再需要费劲解析,完整的文章就完整的保存下来。 28 | 29 | * 支持xpath/regex/css多模式组合爬取 30 | 31 | * html unit模拟浏览器 32 | 33 | * 商业友好(采用 MIT 协议) 34 | 35 | ... 36 | 37 | ## 模块设计 38 | 39 | | 模块名 | 功能说明 | 40 | |---------------------------|-------------------------------------------------------------------------------------------------------------------------------| 41 | | Elise-core | 爬虫框架核心 | 42 | | Elise-client | 单实例爬虫扩展 | 43 | | Elise-distributed | 分布式爬虫扩展,但是不涉及任何具体分布式依赖(例如redis)。仅提供基础类以及相关的接口,其他分布式扩展包必须引用此模块进行扩展 | 44 | | Elise-jedis-support | 分布式redis支持组件,具体使用jedis | 45 | | Elise-redis-support | 分布式redis支持组件,具体使用lettuce | 46 | | Elise-kafka-support | 分布式kafka支持组件 | 47 | | spring-boot-elise-starter | 爬虫的spring boot自动配置组件,用于配合spring boot框架 | 48 | | ....
| 更多模块实现,欢迎讨论 | 49 | 50 | ## maven仓库 51 | 52 | ``` 53 | <dependency> 54 | <groupId>site.zido</groupId> 55 | <artifactId>Elise-client</artifactId> 56 | <version>{version}</version> 57 | </dependency> 58 | ``` 59 | 60 | 暂无稳定版本,你可以使用`1.0.0-SNAPSHOT`作为版本号来预先体验开发版本 61 | 62 | > 请注意:不稳定且许多api暂未实现,请持续关注[ROADMAP.md](./ROADMAP.md)开发路线文档,期待第一个版本的诞生,一定会惊艳到你 63 | 64 | ## 快速试用 65 | 66 | 轻松利用response回调句柄,语义化api,像说话一样简单的爬取一个网站 67 | 68 | > 尝试爬取我的github仓库: 69 | 70 | ```java 71 | SpiderBuilder.defaults().of(response -> { 72 | response.modelName("project"); 73 | response.asTarget().matchUrl("github\\.com/zidoshare/[^/]*$"); 74 | response.asHelper().regex("github\\.com/zidoshare/[^/]*$"); 75 | response.asContent().html().xpath("//*[@id=\"js-repo-pjax-container\"]/div[1]/div/h1/strong/a").text().save("title"); 76 | response.asContent().html().xpath("//span[@class=\"text-gray-dark mr-2\"]").text().save("description"); 77 | response.asContent().html().xpath("//*[@id=\"readme\"]/div[2]").text().save("readme"); 78 | }).execute("http://github.com/zidoshare").block(); 79 | ``` 80 | 81 | 框架中需要编写的核心逻辑仅在response的回调中。response提供了url/html等供你快速的匹配内容 82 | 83 | 依靠高度封装的api,试着写下`response.`你能轻松的知道接下来可以做什么。 84 | 85 | 或者转到[使用文档](./TUTORIAL.md),详细的看看Elise的使用吧 86 | 87 | ## 构建指北 88 | 89 | Elise框架使用gradle构建,并使用大量jdk8特性,请保证你的jdk版本为8及以上。 90 | 91 | 获取源码: 92 | 93 | `git clone https://github.com/zidoshare/Elise.git` 94 | 95 | `cd Elise` 96 | 97 | 因为作者是使用idea进行开发,所以推荐的开发编辑器为IntelliJ IDEA。 98 | 99 | 如果你在其他编辑器中开发,请确保你的编辑器中的配置文件/构建文件/缓存等不会出现在提交目录中。这可以在[.gitignore](https://git-scm.com/gitignore)文件中进行设置,也欢迎提交类似的pr 100 | 101 | ## 状态 102 | 103 | 进行中(每天都在努力的编码中)...一个人的力量有限,希望有人能加入:smile: 104 | 105 | 目前还没能达到正式版本的状态。不过已经能基本保证正常运行,可以自行clone代码构建运行。 106 | 107 | 开发路线参考 [ROADMAP.md](./ROADMAP.md)。 108 | 109 | ## 贡献 110 | 111 | 行为准则请参阅[CODE_OF_CONDUCT.md](./CODE_OF_CONDUCT.md) 112 | 113 | 请查看[贡献说明](./CONTRIBUTING.md)。 114 | 115 | ## 交流反馈 116 | 117 | 问题和建议反馈: 118 | 119 | [Github issues](https://github.com/zidoshare/Elise/issues) 120 | 121 | 邮箱: [wuhongxu1208@gmail.com](mailto:wuhongxu1208@gmail.com) 122 | 123 | ## 感谢 124 |
125 | 本项目的开发离不开前辈的探索,许多思想理念也来自于其他框架: 126 | 127 | * [webmagic](https://github.com/code4craft/webmagic):一个开源的Java垂直爬虫框架. 128 | 129 | * [Spiderman2](https://gitee.com/l-weiwei/Spiderman2):简单的说,这是一个网页爬虫工具,专门对网页内容进行抓取和解析 130 | 131 | * [colly](https://github.com/gocolly/colly):优雅快速的go语言爬虫框架 132 | -------------------------------------------------------------------------------- /docs/ROADMAP.md: -------------------------------------------------------------------------------- 1 | # 开发路线 2 | 3 | API和功能请求应作为PR提交到本文档。 4 | 5 | ## 框架核心 6 | 7 | * (基本完成)完成使用者会接触的各项复杂类的构建类(Builder模式) 8 | 9 | * Operator提供`pre(Request)`方法,该方法执行预先请求,主要用于类似登录的操作。提供`follow()`方法提示爬虫将跳转的界面作为入口。 10 | 11 | * HtmlUnitDownloader下载器适配以及HtmlUnitDownloaderFactory构建工厂类 12 | 13 | * (已完成)Spider提供的`pause`/`recover`功能实现,提供暂停/恢复功能(已写接口,但并未实现,可查看//TODO) 14 | 15 | * proxy代理。目前对于proxy并没有比较好的想法,因为代理的情况很多,考虑很多代理提供商会有`http隧道`和`直接ip`等多种代理方式。目前代理配置了并不会起作用 16 | 17 | * 注解配置支持。`spider.of(xxx.class)`方法,读取类注解构建ModelExtractor 18 | 19 | * 爬取结果的处理,目前使用Saver的机制,并不能很好的配合使用者,需要进行讨论。 20 | 21 | * (已完成 X 2)ResponseHandler支持。 22 | 23 | ```java 24 | SpiderBuilder.defaults().of(response -> { 25 | response.modelName("project"); 26 | response.asTarget().matchUrl(new LinkSelector("github\\.com/zidoshare/[^/]*$")); 27 | response.asHelper().filter(new LinkSelector("github\\.com/zidoshare/[^/]*$")); 28 | response.asContent().html().xpath("//*[@id=\"js-repo-pjax-container\"]/div[1]/div/h1/strong/a").text().save("title"); 29 | response.asContent().html().xpath("//span[@class=\"text-gray-dark mr-2\"]").text().save("description"); 30 | response.asContent().html().xpath("//*[@id=\"readme\"]/div[2]").text().save("readme"); 31 | }).execute("http://github.com/zidoshare").block(); 32 | ``` 33 | api化的构建抓取器 34 | 35 | * sleeptime支持,目前的sleeptime仅仅是最简单的实现,实际场景中没有意义,多线程下无法做到等待xxs后执行下一次爬去的场景。所以考虑加入一个全局钟摆实现等待。 36 | 37 | * 多线程下异常梳理,有些异常并不能让任务停下来,这个得等到代码进入一定规模后好好梳理一下。 38 | 39 | ## 分布式 40 | 41 | 分布式支持还早... 
-------------------------------------------------------------------------------- /docs/TUTORIAL.md: -------------------------------------------------------------------------------- 1 | # Elise 使用文档 2 | 3 | 欢迎使用Elise! 4 | 5 | 本文档的目标是帮助你快速的使用它执行第一次爬取。 6 | 7 | 在此之前你可能需要稍微理解一下Elise对于数据爬取目标的建模思想。 8 | 9 | ## 建模思想 10 | 11 | > 要把大象装冰箱,总共分两步!冰箱门就不用你关啦~ 12 | 13 | ### 对于目标的寻路策略(Target and Help) 14 | 15 | 对于定向的数据爬取,我们采用的是路径描述的模式: 16 | 17 | 其中有两个关键的类型,分别为**目标节点**(`target`)和**辅助节点**(`helpUrl`)。 18 | 19 | 一个网站多个页面的互相单向连接组成了一个复杂的有向图模型。为了能够**尽可能简单的**成功从其中抽取到我们想要的数据。 20 | 21 | 以此,通过目标节点和辅助节点两个元素,我们将复杂的过程以以下步骤精简的展示框架内部如何获取到最终的数据: 22 | 23 | > 实际上内部还包含了很多的优化处理,当然这些就不用使用者操心啦,放心地交给框架来搞定吧。 24 | 25 | 1. 添加入口,并通过入口获取到响应。 26 | 27 | 2. 判断响应体是否是**目标节点**,如果是进入第三步,否则进入第四步 28 | 29 | 3. 从响应体中进行内容匹配提取,如果能够成功的抓取到内容,则进行持久化,之后进入第四步 30 | 31 | 4. 提取其中的能够被**辅助节点**匹配的所有链接 32 | 33 | 5. 获取辅助链接下的响应,进入第二步 34 | 35 | 这是一个非常理想化的有向图遍历模型,通过这样的规范化步骤,我们将能得到我们梦寐以求的数据 36 | 37 | > 努力写出更好的爬取规则能大大减少爬虫需要遍历的页面,显著提高爬取速度哦! 38 | 39 | ### 数据内容模型(Content) 40 | 41 | 在寻路策略中我们提到了**从响应体中进行内容匹配提取**,对于实际的数据抽取,其情况要比想象中的复杂得多,必须得考虑一下情况: 42 | 43 | * 从响应体中的什么位置获取到数据,html/url/header还是cookie? 44 | 45 | * 从一个响应体中我们需要的内容模型完全可能不止一条,例如在[https://github.com/zidoshare](https://github.com/zidoshare)页面中,我们想要抽取项目名+描述,完全不需要进入到每个项目的具体页面就能执行,只需要外面展示的多个Pinned repositories即可。当然,如果你需要获取到readme的简述,那还是必须的进入页面才行啦~ 46 | 47 | * 我们需要的某个属性也可能不止一条,例如还是在[https://github.com/zidoshare](https://github.com/zidoshare)页面中,我们需要抽取到此用户以及它固定的6个项目的名称,我们不需要存储6个数据模型,直接整合到一起(当然也可能直接抽取出6条数据,那是再正常不过的啦)。 48 | 49 | * 每个属性具体结果的复杂性,比如你想爬取一篇文章,那么你可能需要获取纯文本,也可能想在纯文本基础上保留里面的图片视频这些多媒体内容或者直接保存html自行进行处理。 50 | 51 | ...当然还有更多更复杂的需求,就不一一叙述了 52 | 53 | 综合了各种可能出现的情况,最终我们通过`语义化api`实现了hold住分布式的优雅的通用解决方案。 54 | 55 | 1. 你能通过语义化api从response中的任何地方获取数据 56 | 57 | 2. response的html/xml中(仅支持html/xml),我们提供了**分块**(`parition`),通过使用分块,你能将网页分割成数个小块,他们可能拥有类似或者不同的内容,这将由你的抓取器自行决定,他们最终会组合成为多个数据内容结果。 58 | 59 | 3. 对于每一个内容结果的属性,我们默认它就是多条的,所以你能够随意进行任何的选择 60 | 61 | 4. 
我们为内容提供了多种持久化选项,除了纯文本、html等元素外,还额外的提供了富文本类型,他剔除了多余的样式,保留了原来的文本信息,并且还把多媒体内容信息也截留了下来。为使用者尽可能的提供方便。 62 | 63 | 好啦~简单的了解了思想之后,我们快开始吧! 64 | 65 | ....未完待续 -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/elise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Programming-With-Love/Elise/18fb37a465ffa5bf97b25a891bf4fac9749ec909/docs/elise.png -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'Elise-build' 2 | include ':Elise-core' 3 | include ':Elise-distributed' 4 | include ':Elise-redis-support' 5 | include ':Elise-jedis-support' 6 | include ':Elise-kafka-support' 7 | include ':Elise-client' 8 | include ':Elise-test-server' 9 | 10 | --------------------------------------------------------------------------------