├── .classpath ├── .gitignore ├── .project ├── .settings ├── .jsdtscope ├── org.eclipse.core.resources.prefs ├── org.eclipse.jdt.core.prefs ├── org.eclipse.m2e.core.prefs ├── org.eclipse.wst.common.component ├── org.eclipse.wst.common.project.facet.core.xml ├── org.eclipse.wst.jsdt.ui.superType.container ├── org.eclipse.wst.jsdt.ui.superType.name └── org.eclipse.wst.validation.prefs ├── .travis.yml ├── README.md ├── ajaxDownloader └── phantomjs_fetcher.js ├── doc ├── README-en.md └── imgs │ ├── domainList.png │ ├── dynamic.png │ ├── editSpiderInfo.png │ ├── home.png │ ├── need.png │ ├── search.png │ ├── show.gif │ ├── showRelatedInfo.png │ ├── showWebpageById.png │ ├── spiderList.png │ ├── spiderinfo.png │ └── testSpiderinfo.png ├── examples ├── README.md ├── news.163.com.json ├── news.qq.com.json ├── news.qq.com_time_autodetect.json ├── news.sohu.com.json ├── phantomjs_fetcher.js ├── qq-体育.json ├── qq-军事.json ├── qq-娱乐.json ├── qq-房产.json ├── qq-文化.json ├── qq-新闻.json ├── qq-旅游.json ├── qq-时尚.json ├── qq-汽车.json ├── qq-社会.json ├── qq-社会生活.json ├── qq-科技.json ├── qq-财经.json ├── www.chinanews.com.json ├── www.oschina.net.json └── www.shicimingju.com.json ├── log └── ship.log ├── pay ├── alipay.jpeg └── wechat.jpeg ├── pom.xml └── src └── main ├── java └── com │ └── gs │ └── spider │ ├── controller │ ├── AsyncGatherBaseController.java │ ├── BaseController.java │ ├── commons │ │ ├── spider │ │ │ └── CommonsSpiderController.java │ │ ├── spiderinfo │ │ │ └── SpiderInfoController.java │ │ └── webpage │ │ │ └── CommonWebpageController.java │ ├── home │ │ └── HomeController.java │ └── panel │ │ └── commons │ │ └── CommonsSpiderPanel.java │ ├── dao │ ├── CommonWebpageDAO.java │ ├── CommonWebpagePipeline.java │ ├── CommonWebpageRedisPipeline.java │ ├── ESClient.java │ ├── ESPipeline.java │ ├── IDAO.java │ ├── JsonFilePipeline.java │ └── SpiderInfoDAO.java │ ├── gather │ ├── async │ │ ├── AsyncGather.java │ │ ├── TaskManager.java │ │ └── quartz │ │ │ ├── QuartzManager.java │ 
│ │ └── WebpageSpiderJob.java │ └── commons │ │ ├── Casperjs.java │ │ ├── CasperjsDownloader.java │ │ ├── CommonSpider.java │ │ ├── ContentLengthLimitHttpClientDownloader.java │ │ └── PageConsumer.java │ ├── model │ ├── async │ │ ├── BaseMsg.java │ │ ├── CallbackMsg.java │ │ ├── CallbackReplyMsg.java │ │ ├── InfoMsg.java │ │ ├── LoginMsg.java │ │ ├── MsgType.java │ │ ├── PingMsg.java │ │ ├── State.java │ │ └── Task.java │ ├── commons │ │ ├── LoginInfo.java │ │ ├── Page.java │ │ ├── Request.java │ │ ├── SpiderInfo.java │ │ └── Webpage.java │ └── utils │ │ ├── MySupplier.java │ │ ├── ResultBundle.java │ │ ├── ResultBundleBuilder.java │ │ ├── ResultBundleResolver.java │ │ └── ResultListBundle.java │ ├── service │ ├── AsyncGatherService.java │ └── commons │ │ ├── spider │ │ └── CommonsSpiderService.java │ │ ├── spiderinfo │ │ └── SpiderInfoService.java │ │ └── webpage │ │ └── CommonWebpageService.java │ └── utils │ ├── AppInfo.java │ ├── HANLPExtractor.java │ ├── HttpClientUtil.java │ ├── NLPExtractor.java │ ├── StaticValue.java │ └── TablePage.java ├── resources ├── appinfo ├── commonIndex.json ├── datePattern.txt ├── ignoredUrls.txt ├── log4j2.xml ├── mvc-dispatcher-servlet.xml ├── spiderinfo.json ├── staticvalue.json └── webpage.json └── webapp ├── WEB-INF └── web.xml ├── css ├── bootstrap.css ├── bootstrap.css.map ├── bootstrap.min.css └── bootstrap.min.css.map ├── imgs └── logos │ └── logo_without_char_48X48.ico ├── js ├── bootstrap.js ├── bootstrap.min.js ├── jquery.form.js ├── jquery.min.js ├── jquery.validate.min.js ├── messages_zh.min.js ├── my.js ├── npm.js └── tether.min.js └── pages ├── commons ├── allScript.jsp ├── head.jsp ├── header.jsp ├── minScript.jsp └── tablePage.jsp └── panel ├── commons ├── createQuartz.jsp ├── domainList.jsp ├── editSpiderInfo.jsp ├── list.jsp ├── listQuartz.jsp ├── listSpiderInfo.jsp ├── listTasks.jsp ├── showRelatedInfo.jsp ├── showWebpageById.jsp └── updateBySpiderInfoID.jsp └── welcome └── welcome.jsp /.classpath: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .idea 3 | spider.iml 4 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | java-spider 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.wst.common.project.facet.core.builder 15 | 16 | 17 | 18 | 19 | org.eclipse.wst.validation.validationbuilder 20 | 21 | 22 | 23 | 24 | org.eclipse.m2e.core.maven2Builder 25 | 26 | 27 | 28 | 29 | 30 | org.eclipse.jem.workbench.JavaEMFNature 31 | org.eclipse.wst.common.modulecore.ModuleCoreNature 32 | org.eclipse.jdt.core.javanature 33 | org.eclipse.m2e.core.maven2Nature 34 | org.eclipse.wst.common.project.facet.core.nature 35 | org.eclipse.wst.jsdt.core.jsNature 36 | 37 | 38 | -------------------------------------------------------------------------------- /.settings/.jsdtscope: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/main/resources=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: 
-------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 4 | org.eclipse.jdt.core.compiler.compliance=1.8 5 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 6 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 7 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 8 | org.eclipse.jdt.core.compiler.source=1.8 9 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.common.component: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.common.project.facet.core.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.jsdt.ui.superType.container: -------------------------------------------------------------------------------- 1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.jsdt.ui.superType.name: -------------------------------------------------------------------------------- 1 | Window -------------------------------------------------------------------------------- /.settings/org.eclipse.wst.validation.prefs: 
-------------------------------------------------------------------------------- 1 | disabled=06target 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JAVA爬虫框架实战 2 | 3 | 基于webmagic框架二次开发的java爬虫框架实战,已实现能爬取腾讯,搜狐,今日头条(单独集成功能,[教程学习地址](https://blog.csdn.net/hemin1003/article/details/104615208))等资讯内容,配合elasticsearch框架用法,实现了自动爬虫,已投入生产试用中。 4 | 5 | [后台管理统计系统源码](https://github.com/hemin1003/aylson-parent) 6 | 7 | 体验系统地址:http://182.92.82.188:8280/manage/login.jsp 8 | 9 | 体验账号/密码,test1001/a12345678 10 | 11 | 后台系统源码:[https://github.com/hemin1003/aylson-parent](https://github.com/hemin1003/aylson-parent) 12 | 13 | ## [关于我](http://heminit.com/about/) 14 | 15 | 欢迎交流问题,可加我的个人QQ 469580884,或群号 751925591,一起探讨交流问题 16 | 17 | [我的博客地址](http://blog.csdn.net/hemin1003) 18 | 19 | [个人域名](http://heminit.com) 20 | 21 | ## 感谢 22 | 如果觉得内容赞,您可以请我喝一杯咖啡: 23 |
24 |      25 | 26 | 27 | 28 |

29 | 参考项目资料如下: 30 | 31 | 32 | # 欢迎使用 Gather Platform 数据采集与分析平台 33 | 34 | ------ 35 | 36 | [Readme in English](https://github.com/gsh199449/spider/tree/master/doc/README-en.md) 37 | 38 | **详细使用方法请参考 [在线文档](https://gsh199449.github.io/gather_platform_pages/)** 39 | 40 | [![Build Status](https://travis-ci.org/gsh199449/spider.svg?branch=master)](https://travis-ci.org/gsh199449/spider) 41 | 42 | Gather Platform 数据抓取平台是一套基于[Webmagic](https://github.com/code4craft/webmagic)内核的,具有Web任务配置和任务管理界面的数据采集与搜索平台.具有以下功能 43 | 44 | > * 根据配置的模板进行数据采集,支持**Ajax网页采集** 45 | > * 在不配置采集模板的情况下自动检测网页正文,自动抽取文章发布时间 46 | > * 动态字段抽取与静态字段植入 47 | > * 已抓取数据的管理,包括:搜索,增删改查,按照新的数据模板重新抽取数据 48 | > * 对采集的数据进行NLP处理,包括:抽取关键词,抽取摘要,抽取实体词 49 | > * 含有相关文章推荐,文章中人物、地点之间的关联关系分析 50 | 51 | 5分钟即可部署完毕,半分钟即可完成一个爬虫,开始数据采集. 52 | 53 | 不需要进行任何编码就可以完成一个功能强大的爬虫. 54 | 55 | show 56 | 57 | ## Windows/Mac/Linux 全平台支持 58 | 59 | 本系统需要如下依赖: 60 | 61 | - JDK 8 及以上 62 | - Tomcat 8.3 及以上 63 | 64 | 可选依赖组件: 65 | 66 |  - Elasticsearch 5.0 67 | 68 | ## 部署、使用方法、二次开发手册、常见问题等全部迁移至[在线文档](https://gsh199449.github.io/gather_platform_pages/) 69 | 70 | -------------------------------------------------------------------------------- /doc/imgs/domainList.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/domainList.png -------------------------------------------------------------------------------- /doc/imgs/dynamic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/dynamic.png -------------------------------------------------------------------------------- /doc/imgs/editSpiderInfo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/editSpiderInfo.png -------------------------------------------------------------------------------- /doc/imgs/home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/home.png -------------------------------------------------------------------------------- /doc/imgs/need.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/need.png -------------------------------------------------------------------------------- /doc/imgs/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/search.png -------------------------------------------------------------------------------- /doc/imgs/show.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/show.gif -------------------------------------------------------------------------------- /doc/imgs/showRelatedInfo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/showRelatedInfo.png -------------------------------------------------------------------------------- /doc/imgs/showWebpageById.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/showWebpageById.png 
-------------------------------------------------------------------------------- /doc/imgs/spiderList.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/spiderList.png -------------------------------------------------------------------------------- /doc/imgs/spiderinfo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/spiderinfo.png -------------------------------------------------------------------------------- /doc/imgs/testSpiderinfo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/doc/imgs/testSpiderinfo.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # 爬虫模板库 2 | 3 | ------ 4 | 5 | 为了方便大家使用,平台提供了一些示例爬虫模板供大家测试使用。 6 | 7 | - 腾讯新闻:[精准时间模板](https://github.com/gsh199449/spider/blob/master/examples/news.qq.com.json),[自动探测时间模板](https://github.com/gsh199449/spider/blob/master/examples/news.qq.com_time_autodetect.json) 8 | - 诗词网:[诗词名句网](https://github.com/gsh199449/spider/blob/master/examples/www.shicimingju.com.json) 9 | - 中新网: [新闻抽取](https://github.com/gsh199449/spider/blob/master/examples/www.chinanews.com.json) 10 | - 网易新闻: [新闻抽取](https://github.com/gsh199449/spider/blob/master/examples/news.163.com.json) 11 | - 搜狐新闻:[搜狐](https://github.com/gsh199449/spider/blob/master/examples/news.sohu.com.json) 12 | - 开源中国: [含有动态字段的抽取](https://github.com/gsh199449/spider/blob/master/examples/www.oschina.net.json) 13 | 14 | -------------------------------------------------------------------------------- /examples/news.163.com.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "siteName": "网易新闻", 3 | "domain": "news.163.com", 4 | "startURL": [ 5 | "http://news.163.com/" 6 | ], 7 | "id": "AVmyFYFbF9E-TzdHO95J", 8 | "thread": "1", 9 | "retry": "2", 10 | "sleep": "0", 11 | "maxPageGather": "500", 12 | "timeout": "5000", 13 | "charset": "", 14 | "callbackURL": [], 15 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 16 | "proxyHost": "", 17 | "proxyPort": "0", 18 | "proxyUsername": "", 19 | "proxyPassword": "", 20 | "contentReg": "", 21 | "contentXPath": "//div[@id=\"endText\"]/tidyText()", 22 | "titleReg": "", 23 | "titleXPath": "", 24 | "categoryReg": "", 25 | "categoryXPath": "//a[@id=\"ne_article_source\"]/text()", 26 | "defaultCategory": "", 27 | "urlReg": "http://news\\.163\\.com/.*html", 28 | "extractUrlReg": "", 29 | "publishTimeXPath": "", 30 | "publishTimeReg": "", 31 | "publishTimeFormat": "", 32 | "lang": "", 33 | "country": "", 34 | "ajaxWait": "1", 35 | "extractorId": "", 36 | "extractorScriptType": "python2", 37 | "extractorScript": "", 38 | "doNLP": true, 39 | "needContent": true, 40 | "needPublishTime": true, 41 | "saveCapture": true, 42 | "autoDetectPublishDate": true, 43 | "portSpiderInfo": "", 44 | "gatherFirstPage": false, 45 | "needTitle": false, 46 | "ajaxSite": false, 47 | "dynamicFields": [], 48 | "staticFields": [] 49 | } -------------------------------------------------------------------------------- /examples/news.qq.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "thread": "2", 3 | "retry": "2", 4 | "sleep": "800", 5 | "maxPageGather": "10", 6 | "timeout": "5000", 7 | "priority": "0", 8 | "siteName": "腾讯新闻", 9 | "domain": "news.qq.com", 10 | "startURL": [ 11 | "http://news.qq.com" 12 | ], 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/tidyText()", 15 | "titleReg": "", 16 | 
"titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://news.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "(.*?)", 24 | "publishTimeFormat": "yyyy-MM-dd hh:mm", 25 | "authorReg": "", 26 | "authorXPath": "//h1/tidyText()", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "doNLP": true, 32 | "gatherFirstPage": false, 33 | "needTitle": false, 34 | "needContent": false, 35 | "needPublishTime": false, 36 | "ajaxSite": false, 37 | "dynamicFields": [ 38 | { 39 | "regex": "", 40 | "xpath": "//h1/tidyText()", 41 | "name": "author", 42 | "need": false 43 | } 44 | ] 45 | } -------------------------------------------------------------------------------- /examples/news.qq.com_time_autodetect.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "", 3 | "thread": "2", 4 | "retry": "2", 5 | "sleep": "800", 6 | "maxPageGather": "10", 7 | "timeout": "5000", 8 | "priority": "0", 9 | "siteName": "腾讯新闻", 10 | "domain": "news.qq.com", 11 | "startURL": [ 12 | "http://news.qq.com" 13 | ], 14 | "contentReg": "", 15 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/tidyText()", 16 | "titleReg": "", 17 | "titleXPath": "", 18 | "categoryReg": "", 19 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 20 | "defaultCategory": "", 21 | "urlReg": "http://news.qq.com/a/\\d{8}/\\d+\\.htm", 22 | "charset": "", 23 | "publishTimeXPath": "", 24 | "publishTimeReg": "", 25 | "publishTimeFormat": "", 26 | "authorReg": "", 27 | "authorXPath": "//h1/tidyText()", 28 | "lang": "", 29 | "country": "", 30 | "callbackURL": [], 31 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 32 | "doNLP": true, 33 | 
"saveCapture": true, 34 | "autoDetectPublishDate": true, 35 | "gatherFirstPage": false, 36 | "needTitle": false, 37 | "needContent": false, 38 | "needPublishTime": false, 39 | "ajaxSite": false, 40 | "dynamicFields": [ 41 | { 42 | "regex": "", 43 | "xpath": "//h1/tidyText()", 44 | "name": "author", 45 | "need": false 46 | } 47 | ], 48 | "staticFields": [] 49 | } -------------------------------------------------------------------------------- /examples/news.sohu.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "搜狐新闻", 3 | "domain": "news.sohu.com", 4 | "startURL": [ 5 | "http://news.sohu.com/" 6 | ], 7 | "id": "", 8 | "thread": "1", 9 | "retry": "2", 10 | "sleep": "0", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "charset": "", 14 | "callbackURL": [], 15 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 16 | "proxyHost": "", 17 | "proxyPort": "0", 18 | "proxyUsername": "", 19 | "proxyPassword": "", 20 | "contentReg": "", 21 | "contentXPath": "//div[@itemprop=\"articleBody\"]/allText()", 22 | "titleReg": "", 23 | "titleXPath": "//h1/text()", 24 | "categoryReg": "", 25 | "categoryXPath": "//div[@id=\"mypos\"]/span/allText()", 26 | "defaultCategory": "", 27 | "urlReg": "http://news\\.sohu\\.com/\\d{8}/n\\d*\\.shtml", 28 | "extractUrlReg": "", 29 | "publishTimeXPath": "//div[@id=\"pubtime_baidu\"]/text()", 30 | "publishTimeReg": "", 31 | "publishTimeFormat": "yyyy-MM-dd hh:mm:ss", 32 | "lang": "", 33 | "country": "", 34 | "doNLP": true, 35 | "needPublishTime": true, 36 | "saveCapture": true, 37 | "gatherFirstPage": false, 38 | "needTitle": false, 39 | "needContent": false, 40 | "ajaxSite": false, 41 | "autoDetectPublishDate": false, 42 | "dynamicFields": [], 43 | "staticFields": [] 44 | } -------------------------------------------------------------------------------- /examples/qq-体育.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-体育-自动探测", 3 | "domain": "sports.qq.com", 4 | "startURL": [ 5 | "http://sports.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://sports.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-军事.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-军事-自动探测", 3 | "domain": "mil.qq.com", 4 | "startURL": [ 5 | "http://mil.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": 
"", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://mil.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-娱乐.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-娱乐-自动探测", 3 | "domain": "ent.qq.com", 4 | "startURL": [ 5 | "http://ent.qq.com/" 6 | ], 7 | "id": "AV6TJ8aOE7C-scy0W2j3", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://ent.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//img[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 
| "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//img[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-房产.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-房产-自动探测", 3 | "domain": "house.qq.com", 4 | "startURL": [ 5 | "http://house.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://house.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": 
false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-文化.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-文化-自动探测", 3 | "domain": "cul.qq.com", 4 | "startURL": [ 5 | "http://cul.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://cul.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-新闻.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-新闻-自动探测", 3 | "domain": "news.qq.com", 4 | 
"startURL": [ 5 | "http://news.qq.com/" 6 | ], 7 | "id": "AV6TICQWE7C-scy0W2jy", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://news.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//img[@style='display:block;']/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//img[@style='display:block;']/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-旅游.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-旅游-自动探测", 3 | "domain": "ly.qq.com", 4 | "startURL": [ 5 | "http://ly.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": 
"//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://ly.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-时尚.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-时尚-自动探测", 3 | "domain": "fashion.qq.com", 4 | "startURL": [ 5 | "http://fashion.qq.com/" 6 | ], 7 | "id": "AV6Tt0O3E7C-scy0W2kV", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://fashion.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) 
AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-汽车.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-汽车-自动探测", 3 | "domain": "auto.qq.com", 4 | "startURL": [ 5 | "http://auto.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://auto.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": 
"", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-社会.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-社会-自动探测", 3 | "domain": "society.qq.com", 4 | "startURL": [ 5 | "http://society.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://society.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//img[@style='display:block;']/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//img[@style='display:block;']/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-社会生活.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-社会生活-自动探测", 3 | "domain": "cq.qq.com", 4 | 
"startURL": [ 5 | "http://cq.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://cq.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-科技.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-科技-自动探测", 3 | "domain": "tech.qq.com", 4 | "startURL": [ 5 | "http://tech.qq.com/" 6 | ], 7 | "id": "AV6TDjg3E7C-scy0W2jq", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | 
"urlReg": "http://tech.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//img[@style='display:block;']/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//img[@style='display:block;']/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/qq-财经.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteName": "腾讯新闻-财经-自动探测", 3 | "domain": "finance.qq.com", 4 | "startURL": [ 5 | "http://finance.qq.com/" 6 | ], 7 | "id": "", 8 | "thread": "2", 9 | "retry": "2", 10 | "sleep": "800", 11 | "maxPageGather": "10", 12 | "timeout": "5000", 13 | "contentReg": "", 14 | "contentXPath": "//div[@id='Cnt-Main-Article-QQ']/outerHtml()", 15 | "titleReg": "", 16 | "titleXPath": "", 17 | "categoryReg": "", 18 | "categoryXPath": "//a[@accesskey=\"5\"]/text()", 19 | "defaultCategory": "", 20 | "urlReg": "http://finance.qq.com/a/\\d{8}/\\d+\\.htm", 21 | "charset": "", 22 | "publishTimeXPath": "", 23 | "publishTimeReg": "", 24 | "publishTimeFormat": "", 25 | "imgsReg": "", 26 | "imgsXPath": "//p//IMG[@alt]/@src", 27 | "lang": "", 28 | "country": "", 29 | "callbackURL": [], 30 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 
Safari/534.30", 31 | "proxyHost": "", 32 | "proxyPort": "0", 33 | "proxyUsername": "", 34 | "proxyPassword": "", 35 | "doNLP": true, 36 | "needTitle": true, 37 | "needContent": true, 38 | "needPublishTime": true, 39 | "autoDetectPublishDate": true, 40 | "gatherFirstPage": false, 41 | "saveCapture": false, 42 | "ajaxSite": false, 43 | "dynamicFields": [ 44 | { 45 | "regex": "", 46 | "xpath": "//p//IMG[@alt]/@src", 47 | "name": "imgs", 48 | "need": false 49 | } 50 | ], 51 | "staticFields": [] 52 | } -------------------------------------------------------------------------------- /examples/www.chinanews.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "", 3 | "thread": "2", 4 | "retry": "2", 5 | "sleep": "0", 6 | "maxPageGather": "10", 7 | "timeout": "5000", 8 | "priority": "0", 9 | "siteName": "中国新闻网", 10 | "domain": "www.chinanews.com", 11 | "startURL": [ 12 | "http://www.chinanews.com/index.shtml" 13 | ], 14 | "contentReg": "", 15 | "contentXPath": "//div[@class='left_zw']/allText()", 16 | "titleReg": "", 17 | "titleXPath": "//div[@id='cont_1_1_2']/h1/text()", 18 | "categoryReg": "", 19 | "categoryXPath": "", 20 | "defaultCategory": "", 21 | "urlReg": "http://www.chinanews.com/\\w+/\\d+\\/\\d{2}\\-\\d{2}/\\d+\\.shtml", 22 | "charset": "", 23 | "publishTimeXPath": "//div[@class='left-t']/text()", 24 | "publishTimeReg": "", 25 | "publishTimeFormat": " yyyy年MM月dd日 HH:mm 来源:", 26 | "lang": "", 27 | "country": "", 28 | "callbackURL": [], 29 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 30 | "doNLP": true, 31 | "needPublishTime": true, 32 | "saveCapture": true, 33 | "gatherFirstPage": false, 34 | "needTitle": false, 35 | "needContent": false, 36 | "ajaxSite": false, 37 | "autoDetectPublishDate": false, 38 | "dynamicFields": [], 39 | "staticFields": [] 40 | } 41 | 
-------------------------------------------------------------------------------- /examples/www.oschina.net.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "", 3 | "thread": "1", 4 | "retry": "2", 5 | "sleep": "0", 6 | "maxPageGather": "10", 7 | "timeout": "5000", 8 | "priority": "0", 9 | "siteName": "oschina", 10 | "domain": "www.oschina.net", 11 | "startURL": [ 12 | "http://www.oschina.net/p/gather-platform" 13 | ], 14 | "contentReg": "", 15 | "contentXPath": "//section[@id='v-details']/tidyText()", 16 | "titleReg": "", 17 | "titleXPath": "//h1/tidyText()", 18 | "categoryReg": "", 19 | "categoryXPath": "", 20 | "defaultCategory": "", 21 | "urlReg": "https://www\\.oschina\\.net/p/.*", 22 | "charset": "", 23 | "publishTimeXPath": "", 24 | "publishTimeReg": "收录时间:\\d{4}-\\d{2}-\\d{2}", 25 | "publishTimeFormat": "收录时间:yyyy-MM-dd", 26 | "homePageReg": "", 27 | "homePageXPath": "//a[@class='item index']/@href", 28 | "lang": "", 29 | "country": "", 30 | "callbackURL": [], 31 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 32 | "doNLP": true, 33 | "needTitle": true, 34 | "needContent": true, 35 | "gatherFirstPage": false, 36 | "needPublishTime": false, 37 | "ajaxSite": false, 38 | "saveCapture": false, 39 | "autoDetectPublishDate": false, 40 | "dynamicFields": [ 41 | { 42 | "regex": "", 43 | "xpath": "//a[@class='item index']/@href", 44 | "name": "homePage", 45 | "need": false 46 | } 47 | ], 48 | "staticFields": [] 49 | } -------------------------------------------------------------------------------- /examples/www.shicimingju.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "", 3 | "thread": "2", 4 | "retry": "2", 5 | "sleep": "0", 6 | "maxPageGather": "10000", 7 | "timeout": "5000", 8 | "priority": "0", 9 | "siteName": "诗词名句", 10 | "domain": "www.shicimingju.com", 11 | "startURL": [ 12 | 
"http://www.shicimingju.com/" 13 | ], 14 | "contentReg": "", 15 | "contentXPath": "//div[@id='shicineirong']/tidyText()", 16 | "titleReg": "", 17 | "titleXPath": "//div[@class='zhuti yuanjiao']/h2/tidyText()", 18 | "categoryReg": "", 19 | "categoryXPath": "//div[@class='jjzz']/tidyText()", 20 | "defaultCategory": "", 21 | "urlReg": "http://www\\.shicimingju\\.com/chaxun/list/\\d+\\.html", 22 | "charset": "", 23 | "publishTimeXPath": "", 24 | "publishTimeReg": "", 25 | "publishTimeFormat": "", 26 | "lang": "", 27 | "country": "", 28 | "callbackURL": [], 29 | "userAgent": "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30", 30 | "needTitle": true, 31 | "needContent": true, 32 | "gatherFirstPage": false, 33 | "doNLP": false, 34 | "needPublishTime": false, 35 | "ajaxSite": false, 36 | "saveCapture": false, 37 | "autoDetectPublishDate": false, 38 | "dynamicFields": [], 39 | "staticFields": [] 40 | } -------------------------------------------------------------------------------- /pay/alipay.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/pay/alipay.jpeg -------------------------------------------------------------------------------- /pay/wechat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/pay/wechat.jpeg -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/controller/AsyncGatherBaseController.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.controller; 2 | 3 | import com.gs.spider.model.async.State; 4 | import com.gs.spider.model.async.Task; 5 | import com.gs.spider.model.utils.ResultBundle; 6 | import 
com.gs.spider.model.utils.ResultListBundle; 7 | import com.gs.spider.service.AsyncGatherService; 8 | import org.apache.logging.log4j.LogManager; 9 | import org.apache.logging.log4j.Logger; 10 | import org.springframework.web.bind.annotation.RequestMapping; 11 | import org.springframework.web.bind.annotation.RequestMethod; 12 | import org.springframework.web.bind.annotation.RequestParam; 13 | import org.springframework.web.bind.annotation.ResponseBody; 14 | 15 | import java.io.IOException; 16 | 17 | /** 18 | * AsyncGatherBaseController 异步抓取器的Controller 19 | */ 20 | public class AsyncGatherBaseController extends BaseController { 21 | 22 | private AsyncGatherService asyncGatherService; 23 | 24 | private Logger logger = LogManager.getLogger(AsyncGatherBaseController.class); 25 | 26 | public AsyncGatherBaseController(AsyncGatherService asyncGatherService) { 27 | this.asyncGatherService = asyncGatherService; 28 | } 29 | 30 | /** 31 | * 列出所有任务 32 | * 33 | * @return 0表示正在进行 1表示已经完成 34 | * @throws IOException 35 | */ 36 | @RequestMapping(value = "listTasks", method = RequestMethod.GET, produces = "application/json") 37 | @ResponseBody 38 | public ResultListBundle listTasks( 39 | @RequestParam(value = "containsExtraInfo", required = false, defaultValue = "false") boolean containsExtraInfo) 40 | throws IOException { 41 | return asyncGatherService.getTaskList(containsExtraInfo); 42 | } 43 | 44 | /** 45 | * 根据id获取task 46 | * 47 | * @param taskId 48 | * @return 49 | * @throws IOException 50 | */ 51 | @RequestMapping(value = "getTaskById", method = RequestMethod.GET, produces = "application/json") 52 | @ResponseBody 53 | public ResultBundle getTaskById(String taskId, 54 | @RequestParam(value = "containsExtraInfo", required = false, defaultValue = "true") boolean containsExtraInfo) 55 | throws IOException { 56 | return asyncGatherService.getTaskById(taskId, containsExtraInfo); 57 | } 58 | 59 | /** 60 | * 获取异步抓取长连接服务器端口号 61 | * 62 | * @return 63 | */ 64 | @RequestMapping(value = 
"getLongConnectionPort", method = RequestMethod.GET, produces = "application/json") 65 | @ResponseBody 66 | public ResultBundle getLongConnectionPort() throws IOException { 67 | return asyncGatherService.getLongConnectionPort(); 68 | } 69 | 70 | /** 71 | * 获取当前task已经抓取的文章数 72 | * 73 | * @param taskId 74 | * @return 75 | * @throws IOException 76 | */ 77 | @RequestMapping(value = "getTaskCount", method = RequestMethod.GET, produces = "application/json") 78 | @ResponseBody 79 | public ResultBundle getTaskCount(String taskId) throws IOException { 80 | return asyncGatherService.getTaskCount(taskId); 81 | } 82 | 83 | /** 84 | * 根据taskId删除任务 85 | * 86 | * @param taskId 87 | * 任务ID 88 | * @return 成功返回OK! 89 | */ 90 | @RequestMapping(value = "deleteTaskById", method = RequestMethod.GET, produces = "application/json") 91 | @ResponseBody 92 | public ResultBundle deleteTaskById(String taskId) { 93 | return asyncGatherService.deleteTaskById(taskId); 94 | } 95 | 96 | /** 97 | * 获取任务列表,通过状态过滤 98 | * 99 | * @param state 100 | * 任务状态 101 | * @return 102 | */ 103 | @RequestMapping(value = "getTasksFilterByState", method = RequestMethod.GET, produces = "application/json") 104 | @ResponseBody 105 | public ResultListBundle getTasksFilterByState(State state, 106 | @RequestParam(value = "containsExtraInfo", required = false, defaultValue = "false") boolean containsExtraInfo) { 107 | return asyncGatherService.getTasksFilterByState(state, containsExtraInfo); 108 | } 109 | 110 | /** 111 | * 获取任务列表,通过时间状态过滤 112 | * 113 | * @param start 114 | * 开始时间 115 | * @param end 116 | * 结束时间 117 | * @return 118 | */ 119 | @RequestMapping(value = "getTasksFilterByTime", method = RequestMethod.GET, produces = "application/json") 120 | @ResponseBody 121 | public ResultListBundle getTasksFilterByTime(long start, long end, 122 | @RequestParam(value = "containsExtraInfo", required = false, defaultValue = "false") boolean containsExtraInfo) { 123 | return asyncGatherService.getTasksFilterByTime(start, end, 
containsExtraInfo); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/controller/BaseController.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.controller; 2 | 3 | public class BaseController { 4 | 5 | } 6 | 7 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/controller/commons/spider/CommonsSpiderController.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.controller.commons.spider; 2 | 3 | import com.gs.spider.controller.AsyncGatherBaseController; 4 | import com.gs.spider.model.commons.Webpage; 5 | import com.gs.spider.model.utils.ResultBundle; 6 | import com.gs.spider.model.utils.ResultListBundle; 7 | import com.gs.spider.service.AsyncGatherService; 8 | import com.gs.spider.service.commons.spider.CommonsSpiderService; 9 | import org.apache.logging.log4j.LogManager; 10 | import org.apache.logging.log4j.Logger; 11 | import org.assertj.core.util.Lists; 12 | import org.springframework.beans.factory.annotation.Autowired; 13 | import org.springframework.beans.factory.annotation.Qualifier; 14 | import org.springframework.stereotype.Controller; 15 | import org.springframework.web.bind.annotation.RequestMapping; 16 | import org.springframework.web.bind.annotation.RequestMethod; 17 | import org.springframework.web.bind.annotation.RequestParam; 18 | import org.springframework.web.bind.annotation.ResponseBody; 19 | 20 | import javax.servlet.http.HttpServletResponse; 21 | import java.io.IOException; 22 | import java.io.OutputStream; 23 | import java.util.Map; 24 | 25 | /** 26 | * CommonsWebpageDownloadController 27 | */ 28 | @Controller 29 | @RequestMapping("/commons/spider") 30 | public class CommonsSpiderController extends AsyncGatherBaseController { 31 | 32 | private Logger logger = 
LogManager.getLogger(CommonsSpiderController.class); 33 | 34 | private CommonsSpiderService spiderService; 35 | 36 | 37 | @Autowired 38 | public CommonsSpiderController(@Qualifier("commonsSpiderService") AsyncGatherService asyncGatherService) { 39 | super(asyncGatherService); 40 | this.spiderService = (CommonsSpiderService) asyncGatherService; 41 | } 42 | 43 | /** 44 | * 启动爬虫 45 | * 46 | * @param spiderInfoJson 使用json格式进行序列化的spiderinfo 47 | * @return 任务id 48 | */ 49 | @RequestMapping(value = "start", method = {RequestMethod.GET, RequestMethod.POST}, produces = "application/json") 50 | @ResponseBody 51 | public ResultBundle start(String spiderInfoJson) { 52 | return spiderService.start(spiderInfoJson); 53 | } 54 | 55 | /** 56 | * 停止爬虫 57 | * 58 | * @param uuid 任务id(爬虫uuid) 59 | * @return 60 | */ 61 | @RequestMapping(value = "stop", method = RequestMethod.GET, produces = "application/json") 62 | @ResponseBody 63 | public ResultBundle stop(String uuid) { 64 | return spiderService.stop(uuid); 65 | } 66 | 67 | /** 68 | * 获取爬虫运行时信息 69 | * 70 | * @param uuid 爬虫uuid 任务id 71 | * @return 72 | */ 73 | @RequestMapping(value = "runtimeInfo", method = RequestMethod.GET, produces = "application/json") 74 | @ResponseBody 75 | public ResultBundle> runtimeInfo(String uuid, @RequestParam(value = "containsExtraInfo", required = false, defaultValue = "false") boolean containsExtraInfo) { 76 | return spiderService.runtimeInfo(uuid, containsExtraInfo); 77 | } 78 | 79 | /** 80 | * 列出所有爬虫的运行时信息 81 | * 82 | * @return 83 | */ 84 | @RequestMapping(value = "list", method = RequestMethod.GET, produces = "application/json") 85 | @ResponseBody 86 | public ResultBundle>> list(@RequestParam(value = "containsExtraInfo", required = false, defaultValue = "false") boolean containsExtraInfo) { 87 | return spiderService.list(containsExtraInfo); 88 | } 89 | 90 | /** 91 | * 删除爬虫 92 | * 93 | * @param uuid 爬虫uuid 任务id 94 | * @return 95 | */ 96 | @RequestMapping(value = "delete", method = RequestMethod.GET, 
produces = "application/json") 97 | @ResponseBody 98 | public ResultBundle delete(String uuid) { 99 | return spiderService.delete(uuid); 100 | } 101 | 102 | /** 103 | * 删除所有爬虫 104 | * 105 | * @return 106 | */ 107 | @RequestMapping(value = "deleteAll", method = RequestMethod.GET, produces = "application/json") 108 | @ResponseBody 109 | public ResultBundle deleteAll() { 110 | return spiderService.deleteAll(); 111 | } 112 | 113 | /** 114 | * 测试爬虫模板 115 | * 116 | * @param spiderInfoJson 117 | * @return 118 | */ 119 | @RequestMapping(value = "testSpiderInfo", method = RequestMethod.GET, produces = "application/json") 120 | @ResponseBody 121 | public ResultListBundle testSpiderInfo(String spiderInfoJson) { 122 | return spiderService.testSpiderInfo(spiderInfoJson); 123 | } 124 | 125 | /** 126 | * 获取忽略url黑名单 127 | * 128 | * @return 129 | */ 130 | @RequestMapping(value = "getIgnoredUrls", method = RequestMethod.GET, produces = "application/json") 131 | @ResponseBody 132 | public ResultListBundle getIgnoredUrls() { 133 | return spiderService.getIgnoredUrls(); 134 | } 135 | 136 | /** 137 | * 添加忽略url黑名单 138 | * 139 | * @param postfix 140 | */ 141 | @RequestMapping(value = "addIgnoredUrl", method = RequestMethod.GET, produces = "application/json") 142 | @ResponseBody 143 | public ResultBundle addIgnoredUrl(String postfix) { 144 | return spiderService.addIgnoredUrl(postfix); 145 | } 146 | 147 | /** 148 | * 根据爬虫模板ID批量启动任务 149 | * 150 | * @param spiderInfoIdList 爬虫模板ID列表 151 | * @return 任务id列表 152 | */ 153 | @RequestMapping(value = "startAll", method = RequestMethod.GET, produces = "application/json") 154 | @ResponseBody 155 | public ResultListBundle startAll(String spiderInfoIdList) { 156 | return spiderService.startAll(Lists.newArrayList(spiderInfoIdList.split(","))); 157 | } 158 | 159 | @RequestMapping(value = "createQuartzJob", method = RequestMethod.GET, produces = "application/json") 160 | @ResponseBody 161 | public ResultBundle createQuartzJob(String spiderInfoId, int 
hoursInterval) { 162 | return spiderService.createQuartzJob(spiderInfoId, hoursInterval); 163 | } 164 | 165 | @RequestMapping(value = "removeQuartzJob", method = RequestMethod.GET, produces = "application/json") 166 | @ResponseBody 167 | public ResultBundle removeQuartzJob(String spiderInfoId) { 168 | return spiderService.removeQuartzJob(spiderInfoId); 169 | } 170 | 171 | @RequestMapping(value = "checkQuartzJob", method = RequestMethod.GET, produces = "application/json") 172 | @ResponseBody 173 | public String checkQuartzJob(String spiderInfoId) { 174 | return spiderService.checkQuartzJob(spiderInfoId).getResult(); 175 | } 176 | 177 | @RequestMapping(value = "exportQuartz", method = RequestMethod.GET, produces = "application/json") 178 | public void exportQuartz(HttpServletResponse response) throws IOException { 179 | response.setCharacterEncoding("utf-8"); 180 | response.setContentType("multipart/form-data"); 181 | response.setHeader("Content-Disposition", "attachment;fileName=commons-spider.quartz"); 182 | OutputStream outputStream = response.getOutputStream(); 183 | outputStream.write(spiderService.exportQuartz().getBytes("utf-8")); /* encode explicitly as UTF-8 to match the charset declared on the response; the no-arg getBytes() used the platform default charset */ 184 | outputStream.close(); 185 | } 186 | 187 | @RequestMapping(value = "importQuartz", method = RequestMethod.POST, produces = "application/json") 188 | @ResponseBody 189 | public void importQuartz(String json) { 190 | spiderService.importQuartz(json); 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/controller/commons/spiderinfo/SpiderInfoController.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.controller.commons.spiderinfo; 2 | 3 | import com.google.gson.Gson; 4 | import com.gs.spider.model.commons.SpiderInfo; 5 | import com.gs.spider.model.commons.Webpage; 6 | import com.gs.spider.model.utils.ResultBundle; 7 | import com.gs.spider.model.utils.ResultListBundle; 8 | import 
com.gs.spider.service.commons.spiderinfo.SpiderInfoService; 9 | import com.gs.spider.service.commons.webpage.CommonWebpageService; 10 | import com.gs.spider.utils.TablePage; 11 | 12 | import java.util.List; 13 | 14 | import org.apache.commons.lang3.StringUtils; 15 | import org.apache.commons.lang3.tuple.Pair; 16 | import org.apache.logging.log4j.LogManager; 17 | import org.apache.logging.log4j.Logger; 18 | import org.springframework.beans.factory.annotation.Autowired; 19 | import org.springframework.web.bind.annotation.*; 20 | import org.springframework.web.servlet.ModelAndView; 21 | 22 | /** 23 | * SpiderInfoController 24 | */ 25 | @RequestMapping("/commons/spiderinfo") 26 | @RestController 27 | public class SpiderInfoController { 28 | 29 | private final static Logger logger = LogManager.getLogger(SpiderInfoController.class); 30 | 31 | @Autowired 32 | private SpiderInfoService spiderInfoService; 33 | @Autowired 34 | private CommonWebpageService commonWebpageService; 35 | 36 | private Gson gson = new Gson(); 37 | 38 | /** 39 | * 列出库中所有爬虫模板 40 | * 41 | * @param size 页面容量 42 | * @param page 页码 43 | * @return 爬虫模板列表 44 | */ 45 | @RequestMapping(value = "listAll", method = RequestMethod.GET, produces = "application/json") 46 | @ResponseBody 47 | public ResultListBundle listAll(@RequestParam(value = "size", required = false, defaultValue = "10") int size, @RequestParam(value = "page", required = false, defaultValue = "1") int page) { 48 | return spiderInfoService.listAll(size, page); 49 | } 50 | 51 | /** 52 | * 根据domain获取结果 53 | * 54 | * @param domain 网站域名 55 | * @param size 每页数量 56 | * @param page 页码 57 | * @return 爬虫模板 58 | */ 59 | @RequestMapping(value = "getByDomain", method = RequestMethod.GET, produces = "application/json") 60 | @ResponseBody 61 | public ResultListBundle getByDomain(String domain, @RequestParam(value = "size", required = false, defaultValue = "10") int size, @RequestParam(value = "page", required = false, defaultValue = "1") int page) { 62 | 
return spiderInfoService.getByDomain(domain, size, page); 63 | } 64 | 65 | /** 66 | * 已抓取的网页列表 67 | * 68 | * @param query 查询词 69 | * @param domain 域名 70 | * @param page 页码 71 | * @return 72 | */ 73 | @RequestMapping(value = {"list", ""}, method = RequestMethod.GET) 74 | public ResultBundle, Long>> list(@RequestParam(required = false) String query, @RequestParam(required = false) String domain, @RequestParam(defaultValue = "1", required = false) int page) { 75 | StringBuilder sbf = new StringBuilder(); 76 | sbf.append("&query="); 77 | if (StringUtils.isNotBlank(query)) { 78 | query = query.trim(); 79 | sbf.append(query); 80 | } 81 | sbf.append("&domain="); 82 | if (StringUtils.isNotBlank(domain)) { 83 | domain = domain.trim(); 84 | sbf.append(domain); 85 | } 86 | page = page < 1 ? 1 : page; 87 | ResultBundle, Long>> resultBundle = commonWebpageService.getWebPageByKeywordAndDomain(query, domain, 10, page); 88 | return resultBundle; 89 | } 90 | 91 | /** 92 | * 根据网站domain删除数据 93 | * 94 | * @param domain 网站域名 95 | * @return 是否全部数据删除成功 96 | */ 97 | @RequestMapping(value = "deleteByDomain", method = RequestMethod.GET, produces = "application/json") 98 | @ResponseBody 99 | public ResultBundle deleteByDomain(String domain) { 100 | return spiderInfoService.deleteByDomain(domain); 101 | } 102 | 103 | /** 104 | * 根据id删除网页模板 105 | * 106 | * @param id 网页模板id 107 | * @return 是否删除 108 | */ 109 | @RequestMapping(value = "deleteById", method = RequestMethod.GET, produces = "application/json") 110 | @ResponseBody 111 | public ResultBundle deleteById(String id) { 112 | return spiderInfoService.deleteById(id); 113 | } 114 | 115 | /** 116 | * 存储模板 117 | * 118 | * @param spiderInfoJson 使用json格式进行序列化的spiderinfo 119 | * @return 模板id 120 | */ 121 | @RequestMapping(value = "save", method = {RequestMethod.GET, RequestMethod.POST}, produces = "application/json") 122 | @ResponseBody 123 | public ResultBundle save(String spiderInfoJson) { 124 | return 
spiderInfoService.index(gson.fromJson(spiderInfoJson, SpiderInfo.class)); 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/controller/home/HomeController.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.controller.home; 2 | 3 | import org.apache.logging.log4j.LogManager; 4 | import org.apache.logging.log4j.Logger; 5 | import org.springframework.stereotype.Controller; 6 | import org.springframework.web.bind.annotation.RequestMapping; 7 | import org.springframework.web.bind.annotation.RequestMethod; 8 | import org.springframework.web.servlet.ModelAndView; 9 | 10 | import com.gs.spider.controller.BaseController; 11 | import com.gs.spider.utils.AppInfo; 12 | 13 | @Controller 14 | @RequestMapping("/") 15 | public class HomeController extends BaseController { 16 | 17 | private final static Logger logger = LogManager.getLogger(HomeController.class); 18 | 19 | @RequestMapping(value = { "/", "" }, method = RequestMethod.GET) 20 | public ModelAndView home() { 21 | ModelAndView modelAndView = new ModelAndView("panel/welcome/welcome"); 22 | modelAndView.addObject("appName", AppInfo.APP_NAME).addObject("appVersion", AppInfo.APP_VERSION) 23 | .addObject("onlineDocumentation", AppInfo.ONLINE_DOCUMENTATION); 24 | return modelAndView; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/dao/CommonWebpagePipeline.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.dao; 2 | 3 | import com.google.common.collect.Maps; 4 | import com.google.common.collect.Sets; 5 | import com.google.common.hash.Hashing; 6 | import com.google.gson.*; 7 | import com.gs.spider.model.commons.SpiderInfo; 8 | import com.gs.spider.model.commons.Webpage; 9 | import org.apache.logging.log4j.LogManager; 10 | import 
org.apache.logging.log4j.Logger; 11 | import org.elasticsearch.action.get.GetResponse; 12 | import org.springframework.beans.factory.annotation.Autowired; 13 | import org.springframework.stereotype.Component; 14 | import us.codecraft.webmagic.Request; 15 | import us.codecraft.webmagic.ResultItems; 16 | import us.codecraft.webmagic.Task; 17 | import us.codecraft.webmagic.pipeline.Pipeline; 18 | import us.codecraft.webmagic.scheduler.component.DuplicateRemover; 19 | 20 | import java.nio.charset.Charset; 21 | import java.text.DateFormat; 22 | import java.util.Date; 23 | import java.util.Map; 24 | import java.util.Set; 25 | 26 | /** 27 | * CommonWebpagePipeline 28 | */ 29 | @Component 30 | public class CommonWebpagePipeline extends IDAO implements DuplicateRemover, Pipeline { 31 | 32 | private final static String INDEX_NAME = "commons", TYPE_NAME = "webpage"; 33 | 34 | private static final String DYNAMIC_FIELD = "dynamic_fields"; 35 | 36 | private static final Gson gson = new GsonBuilder() 37 | .registerTypeAdapter(Date.class, (JsonDeserializer) (json, typeOfT, context) -> new Date(json.getAsJsonPrimitive().getAsLong())) 38 | .registerTypeAdapter(Date.class, (JsonSerializer) (src, typeOfSrc, context) -> new JsonPrimitive(src.getTime())) 39 | .setDateFormat(DateFormat.LONG).create(); 40 | 41 | private static int COUNT = 0; 42 | 43 | private Logger logger = LogManager.getLogger(CommonWebpagePipeline.class); 44 | 45 | private Map> urls = Maps.newConcurrentMap(); 46 | 47 | @Autowired 48 | public CommonWebpagePipeline(ESClient esClient) { 49 | super(esClient, INDEX_NAME, TYPE_NAME); 50 | } 51 | 52 | /** 53 | * 将webmagic的resultItems转换成webpage对象 54 | * 55 | * @param resultItems 56 | * @return 57 | */ 58 | public static Webpage convertResultItems2Webpage(ResultItems resultItems) { 59 | Webpage webpage = new Webpage(); 60 | webpage.setContent(resultItems.get("content")); 61 | webpage.setTitle(resultItems.get("title")); 62 | webpage.setUrl(resultItems.get("url")); 63 | 
webpage.setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString()); 64 | webpage.setDomain(resultItems.get("domain")); 65 | webpage.setSpiderInfoId(resultItems.get("spiderInfoId")); 66 | webpage.setGathertime(resultItems.get("gatherTime")); 67 | webpage.setSpiderUUID(resultItems.get("spiderUUID")); 68 | webpage.setKeywords(resultItems.get("keywords")); 69 | webpage.setSummary(resultItems.get("summary")); 70 | webpage.setNamedEntity(resultItems.get("namedEntity")); 71 | webpage.setPublishTime(resultItems.get("publishTime")); 72 | webpage.setCategory(resultItems.get("category")); 73 | webpage.setRawHTML(resultItems.get("rawHTML")); 74 | webpage.setDynamicFields(resultItems.get(DYNAMIC_FIELD)); 75 | webpage.setStaticFields(resultItems.get("staticField")); 76 | webpage.setAttachmentList(resultItems.get("attachmentList")); 77 | webpage.setImageList(resultItems.get("imageList")); 78 | webpage.setProcessTime(resultItems.get("processTime")); 79 | return webpage; 80 | } 81 | 82 | @Override 83 | public String index(Webpage webpage) { 84 | return null; 85 | } 86 | 87 | @Override 88 | protected boolean check() { 89 | return esClient.checkCommonsIndex() && esClient.checkWebpageType(); 90 | } 91 | 92 | @Override 93 | public boolean isDuplicate(Request request, Task task) { 94 | Set tempLists = urls.computeIfAbsent(task.getUUID(), k -> Sets.newConcurrentHashSet()); 95 | //初始化已采集网站列表缓存 96 | if (tempLists.add(request.getUrl())) {//先检查当前生命周期是否抓取过,如果当前生命周期未抓取,则进一步检查ES 97 | GetResponse response = client.prepareGet(INDEX_NAME, TYPE_NAME, 98 | Hashing.md5().hashString(request.getUrl(), Charset.forName("utf-8")).toString() 99 | ).get(); 100 | return response.isExists(); 101 | } else { 102 | //如果当前生命周期已抓取,直接置为重复 103 | return true; 104 | } 105 | } 106 | 107 | @Override 108 | public void resetDuplicateCheck(Task task) { 109 | 110 | } 111 | 112 | @Override 113 | public int getTotalRequestsCount(Task task) { 114 | return COUNT++; 115 | } 116 | 117 | @Override 118 | 
public void process(ResultItems resultItems, Task task) { 119 | SpiderInfo spiderInfo = resultItems.get("spiderInfo"); 120 | Webpage webpage = convertResultItems2Webpage(resultItems); 121 | try { 122 | client.prepareIndex(INDEX_NAME, TYPE_NAME) 123 | .setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString()) 124 | .setSource(gson.toJson(webpage)) 125 | .get(); 126 | } catch (Exception e) { 127 | logger.error("索引 Webpage 出错," + e.getLocalizedMessage()); 128 | } 129 | } 130 | 131 | /** 132 | * 清除已停止任务的抓取url列表 133 | * 134 | * @param taskId 任务id 135 | */ 136 | public void deleteUrls(String taskId) { 137 | urls.remove(taskId); 138 | logger.info("任务{}已结束,抓取列表缓存已清除", taskId); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/dao/CommonWebpageRedisPipeline.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.dao; 2 | 3 | import com.google.gson.Gson; 4 | import com.gs.spider.utils.StaticValue; 5 | import org.apache.logging.log4j.LogManager; 6 | import org.apache.logging.log4j.Logger; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.stereotype.Component; 9 | import redis.clients.jedis.Jedis; 10 | import us.codecraft.webmagic.ResultItems; 11 | import us.codecraft.webmagic.Task; 12 | import us.codecraft.webmagic.pipeline.Pipeline; 13 | 14 | @Component 15 | public class CommonWebpageRedisPipeline implements Pipeline { 16 | private static Jedis jedis; 17 | private final boolean needRedis; 18 | private final String publishChannelName; 19 | private final Gson gson = new Gson(); 20 | private Logger LOG = LogManager.getLogger(CommonWebpageRedisPipeline.class); 21 | 22 | @Autowired 23 | public CommonWebpageRedisPipeline(StaticValue staticValue) { 24 | this.needRedis = staticValue.isNeedRedis(); 25 | this.publishChannelName = 
staticValue.getWebpageRedisPublishChannelName(); 26 | if (this.needRedis) { 27 | LOG.info("正在初始化Redis客户端,Host:{},Port:{}", staticValue.getRedisHost(), staticValue.getRedisPort()); 28 | jedis = new Jedis(staticValue.getRedisHost(), staticValue.getRedisPort()); 29 | LOG.info("Jedis初始化成功,Clients List:{}", jedis.clientList()); 30 | } else { 31 | LOG.warn("未初始化Redis客户端"); 32 | } 33 | } 34 | 35 | @Override 36 | public void process(ResultItems resultItems, Task task) { 37 | if (!needRedis) return; 38 | long receivedClientsCount = jedis.publish(publishChannelName, gson.toJson(resultItems.getAll())); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/dao/ESClient.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.dao; 2 | 3 | import com.gs.spider.utils.StaticValue; 4 | import org.apache.commons.io.FileUtils; 5 | import org.apache.logging.log4j.LogManager; 6 | import org.apache.logging.log4j.Logger; 7 | import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; 8 | import org.elasticsearch.action.admin.indices.create.CreateIndexResponse; 9 | import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest; 10 | import org.elasticsearch.action.admin.indices.exists.types.TypesExistsRequest; 11 | import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest; 12 | import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse; 13 | import org.elasticsearch.client.Client; 14 | import org.elasticsearch.client.Requests; 15 | import org.elasticsearch.common.settings.Settings; 16 | import org.elasticsearch.common.transport.InetSocketTransportAddress; 17 | import org.elasticsearch.common.unit.TimeValue; 18 | import org.elasticsearch.transport.client.PreBuiltTransportClient; 19 | import org.springframework.beans.factory.annotation.Autowired; 20 | import 
org.springframework.context.annotation.Scope; 21 | import org.springframework.stereotype.Component; 22 | 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.net.InetAddress; 26 | 27 | /** 28 | * ESClient 29 | */ 30 | @Component 31 | @Scope("prototype") 32 | public class ESClient { 33 | private final static String COMMON_INDEX_CONFIG = "commonIndex.json"; 34 | private static final String COMMONS_INDEX_NAME = "commons"; 35 | private static final String WEBPAGE_TYPE_NAME = "webpage"; 36 | private static final String SPIDER_INFO_TYPE_NAME = "spiderinfo"; 37 | private static final String SPIDER_INFO_INDEX_NAME = "spiderinfo"; 38 | 39 | private Logger logger = LogManager.getLogger(ESClient.class); 40 | private Client client; 41 | 42 | @Autowired 43 | private StaticValue staticValue; 44 | 45 | public boolean checkCommonsIndex() { 46 | return checkIndex(COMMONS_INDEX_NAME, COMMON_INDEX_CONFIG); 47 | } 48 | 49 | public boolean checkWebpageType() { 50 | return checkType(COMMONS_INDEX_NAME, WEBPAGE_TYPE_NAME, "webpage.json"); 51 | } 52 | 53 | public boolean checkSpiderInfoIndex() { 54 | return checkIndex(SPIDER_INFO_INDEX_NAME, COMMON_INDEX_CONFIG); 55 | } 56 | 57 | public boolean checkSpiderInfoType() { 58 | return checkType(SPIDER_INFO_INDEX_NAME, SPIDER_INFO_TYPE_NAME, "spiderinfo.json"); 59 | } 60 | 61 | public Client getClient() { 62 | if (!staticValue.isNeedEs()) { 63 | logger.info("已在配置文件中声明不需要ES,如需要ES,请在配置文件中进行配置"); 64 | return null; 65 | } 66 | if (client != null) return client; 67 | logger.info("正在初始化ElasticSearch客户端," + staticValue.getEsHost()); 68 | 69 | Settings settings = Settings.builder() 70 | .put("cluster.name", staticValue.getEsClusterName()).build(); 71 | try { 72 | client = new PreBuiltTransportClient(settings) 73 | .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(staticValue.getEsHost()), staticValue.getEsPort())); 74 | final ClusterHealthResponse healthResponse = client.admin().cluster().prepareHealth() 
75 | .setTimeout(TimeValue.timeValueMinutes(1)).execute().actionGet(); 76 | if (healthResponse.isTimedOut()) { 77 | logger.error("ES客户端初始化失败"); 78 | } else { 79 | logger.info("ES客户端初始化成功"); 80 | } 81 | } catch (IOException e) { 82 | logger.fatal("构建ElasticSearch客户端失败!"); 83 | } 84 | return client; 85 | } 86 | 87 | public boolean checkType(String index, String type, String mapping) { 88 | if (client == null) return false; 89 | if (!client.admin().indices().typesExists(new TypesExistsRequest(new String[]{index}, type)).actionGet().isExists()) { 90 | logger.info(type + " type不存在,正在准备创建type"); 91 | File mappingFile; 92 | try { 93 | mappingFile = new File(this.getClass().getClassLoader() 94 | .getResource(mapping).getFile()); 95 | } catch (Exception e) { 96 | logger.fatal("查找ES mapping配置文件出错, " + e.getLocalizedMessage()); 97 | return false; 98 | } 99 | logger.debug(type + " MappingFile:" + mappingFile.getPath()); 100 | PutMappingResponse mapPuttingResponse = null; 101 | 102 | PutMappingRequest putMappingRequest = null; 103 | try { 104 | putMappingRequest = Requests.putMappingRequest(index).type(type).source(FileUtils.readFileToString(mappingFile)); 105 | } catch (IOException e) { 106 | logger.error("创建 jvmSample mapping 失败," + e.getLocalizedMessage()); 107 | } 108 | mapPuttingResponse = client.admin().indices().putMapping(putMappingRequest).actionGet(); 109 | 110 | if (mapPuttingResponse.isAcknowledged()) logger.info("创建" + type + "type成功"); 111 | else { 112 | logger.error("创建" + type + "type索引失败"); 113 | return false; 114 | } 115 | } else logger.debug(type + " type 存在"); 116 | return true; 117 | } 118 | 119 | public boolean checkIndex(String index, String mapping) { 120 | if (client == null) return false; 121 | if (!client.admin().indices().exists(new IndicesExistsRequest(index)).actionGet().isExists()) { 122 | File indexMappingFile; 123 | try { 124 | indexMappingFile = new File(this.getClass().getClassLoader() 125 | .getResource(mapping).getFile()); 126 | } catch 
(Exception e) { 127 | logger.fatal("查找" + index + "index mapping配置文件出错, " + e.getLocalizedMessage()); 128 | return false; 129 | } 130 | logger.debug(index + "index MappingFile:" + indexMappingFile.getPath()); 131 | logger.info(index + " index 不存在,正在准备创建index"); 132 | CreateIndexResponse createIndexResponse = null; 133 | try { 134 | createIndexResponse = client.admin().indices() 135 | .prepareCreate(index) 136 | .setSettings(FileUtils.readFileToString(indexMappingFile)) 137 | .execute().actionGet(); 138 | } catch (IOException e) { 139 | logger.error("创建 " + index + " index 失败"); 140 | return false; 141 | } 142 | if (createIndexResponse.isAcknowledged()) logger.info(index + " index 成功"); 143 | else { 144 | logger.fatal(index + " index失败"); 145 | return false; 146 | } 147 | } else logger.debug(index + " index 存在"); 148 | return true; 149 | } 150 | 151 | public StaticValue getStaticValue() { 152 | return staticValue; 153 | } 154 | 155 | public ESClient setStaticValue(StaticValue staticValue) { 156 | this.staticValue = staticValue; 157 | return this; 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/dao/ESPipeline.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.dao; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | import org.apache.logging.log4j.LogManager; 5 | import org.apache.logging.log4j.Logger; 6 | import org.elasticsearch.action.index.IndexResponse; 7 | import org.elasticsearch.common.xcontent.XContentBuilder; 8 | import us.codecraft.webmagic.ResultItems; 9 | import us.codecraft.webmagic.Task; 10 | import us.codecraft.webmagic.pipeline.Pipeline; 11 | 12 | import java.io.IOException; 13 | import java.util.Iterator; 14 | import java.util.Map; 15 | 16 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 17 | 18 | /** 19 | * NewsPipeline 20 | */ 21 | public abstract class ESPipeline 
extends IDAO implements Pipeline { 22 | private final String INDEX_NAME, TYPE_NAME; 23 | private Logger LOG = LogManager.getLogger(ESPipeline.class); 24 | 25 | public ESPipeline(ESClient esClient, String indexName, String typeName) { 26 | super(esClient, indexName, typeName); 27 | this.INDEX_NAME = indexName; 28 | this.TYPE_NAME = typeName; 29 | } 30 | 31 | @Override 32 | public void process(ResultItems resultItems, Task task) { 33 | Iterator i$ = resultItems.getAll().entrySet().iterator(); 34 | try { 35 | XContentBuilder xContentBuilder = jsonBuilder().startObject(); 36 | while (i$.hasNext()) { 37 | Map.Entry entry = (Map.Entry) i$.next(); 38 | xContentBuilder.field((String) entry.getKey(), entry.getValue()); 39 | } 40 | String json = xContentBuilder.endObject().string(); 41 | IndexResponse response = null; 42 | if (StringUtils.isNotBlank(resultItems.get("id"))) { 43 | response = client 44 | .prepareIndex(INDEX_NAME, TYPE_NAME, resultItems.get("id")) 45 | .setSource(json).get(); 46 | } else { 47 | response = client 48 | .prepareIndex(INDEX_NAME, TYPE_NAME) 49 | .setSource(json).get(); 50 | } 51 | if (response.getResult() != IndexResponse.Result.CREATED) 52 | LOG.error("索引失败,可能重复创建,resultItem:" + resultItems); 53 | } catch (IOException e) { 54 | LOG.error("索引出错," + e.getLocalizedMessage()); 55 | e.printStackTrace(); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/dao/JsonFilePipeline.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.dao; 2 | 3 | import com.google.gson.Gson; 4 | import com.gs.spider.model.commons.Webpage; 5 | import org.apache.commons.io.FileUtils; 6 | import org.apache.logging.log4j.LogManager; 7 | import org.apache.logging.log4j.Logger; 8 | import org.springframework.stereotype.Component; 9 | import us.codecraft.webmagic.ResultItems; 10 | import us.codecraft.webmagic.Task; 11 | import 
us.codecraft.webmagic.pipeline.Pipeline; 12 | 13 | import java.io.File; 14 | import java.io.IOException; 15 | 16 | @Component 17 | public class JsonFilePipeline implements Pipeline { 18 | private final static Logger LOG = LogManager.getLogger(JsonFilePipeline.class); 19 | private final static Gson gson = new Gson(); 20 | 21 | @Override 22 | public void process(ResultItems resultItems, Task task) { 23 | Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems); 24 | try { 25 | FileUtils.writeStringToFile( 26 | new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json"), 27 | gson.toJson(webpage) + "\n", 28 | true); 29 | } catch (IOException e) { 30 | LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage()); 31 | } 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/dao/SpiderInfoDAO.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.dao; 2 | 3 | import com.google.common.base.Preconditions; 4 | import com.google.common.collect.Lists; 5 | import com.google.gson.Gson; 6 | import com.google.gson.GsonBuilder; 7 | import com.gs.spider.model.commons.SpiderInfo; 8 | import org.apache.commons.lang3.StringUtils; 9 | import org.apache.logging.log4j.LogManager; 10 | import org.apache.logging.log4j.Logger; 11 | import org.elasticsearch.action.delete.DeleteResponse; 12 | import org.elasticsearch.action.get.GetResponse; 13 | import org.elasticsearch.action.index.IndexResponse; 14 | import org.elasticsearch.action.search.SearchRequestBuilder; 15 | import org.elasticsearch.action.search.SearchResponse; 16 | import org.elasticsearch.action.update.UpdateRequest; 17 | import org.elasticsearch.action.update.UpdateResponse; 18 | import org.elasticsearch.index.query.Operator; 19 | import org.elasticsearch.index.query.QueryBuilders; 20 | import org.elasticsearch.search.SearchHit; 21 | import 
org.elasticsearch.search.SearchHits; 22 | import org.springframework.beans.factory.annotation.Autowired; 23 | import org.springframework.stereotype.Component; 24 | 25 | import java.util.List; 26 | import java.util.concurrent.ExecutionException; 27 | 28 | /** 29 | * SpiderInfoDAO 30 | */ 31 | @Component 32 | public class SpiderInfoDAO extends IDAO { 33 | private final static Logger logger = LogManager.getLogger(SpiderInfoDAO.class); 34 | private final static String INDEX_NAME = "spiderinfo", TYPE_NAME = "spiderinfo"; 35 | private static final Gson gson = new GsonBuilder().create(); 36 | 37 | @Autowired 38 | public SpiderInfoDAO(ESClient esClient) { 39 | super(esClient, INDEX_NAME, TYPE_NAME); 40 | } 41 | 42 | public SpiderInfoDAO() { 43 | } 44 | 45 | @Override 46 | public String index(SpiderInfo spiderInfo) { 47 | IndexResponse indexResponse; 48 | if (getByDomain(spiderInfo.getDomain(), 10, 1).size() > 0) { 49 | List mayDuplicate = Lists.newLinkedList(); 50 | List temp; 51 | int i = 1; 52 | do { 53 | temp = getByDomain(spiderInfo.getDomain(), 100, i++); 54 | mayDuplicate.addAll(temp); 55 | } while (temp.size() > 0); 56 | if (mayDuplicate.indexOf(spiderInfo) != -1 && (spiderInfo = mayDuplicate.get(mayDuplicate.indexOf(spiderInfo))) != null) { 57 | logger.warn("已经含有此模板,不再存储"); 58 | return spiderInfo.getId(); 59 | } 60 | } 61 | try { 62 | indexResponse = client.prepareIndex(INDEX_NAME, TYPE_NAME) 63 | .setSource(gson.toJson(spiderInfo)) 64 | .get(); 65 | logger.debug("索引爬虫模板成功"); 66 | return indexResponse.getId(); 67 | } catch (Exception e) { 68 | logger.error("索引 Webpage 出错," + e.getLocalizedMessage()); 69 | } 70 | return null; 71 | } 72 | 73 | @Override 74 | protected boolean check() { 75 | return esClient.checkSpiderInfoIndex() && esClient.checkSpiderInfoType(); 76 | } 77 | 78 | private SpiderInfo warpHits2Info(SearchHit hit) { 79 | SpiderInfo spiderInfo = gson.fromJson(hit.getSourceAsString(), SpiderInfo.class); 80 | spiderInfo.setId(hit.getId()); 81 | return 
spiderInfo; 82 | } 83 | 84 | private SpiderInfo warpHits2Info(String jsonSource, String id) { 85 | SpiderInfo spiderInfo = gson.fromJson(jsonSource, SpiderInfo.class); 86 | spiderInfo.setId(id); 87 | return spiderInfo; 88 | } 89 | 90 | private List warpHits2List(SearchHits hits) { 91 | List spiderInfoList = Lists.newLinkedList(); 92 | hits.forEach(searchHitFields -> { 93 | spiderInfoList.add(warpHits2Info(searchHitFields)); 94 | }); 95 | return spiderInfoList; 96 | } 97 | 98 | /** 99 | * 列出库中所有爬虫模板 100 | * 101 | * @param size 页面容量 102 | * @param page 页码 103 | * @return 104 | */ 105 | public List listAll(int size, int page) { 106 | SearchRequestBuilder searchRequestBuilder = client.prepareSearch(INDEX_NAME) 107 | .setTypes(TYPE_NAME) 108 | .setQuery(QueryBuilders.matchAllQuery()) 109 | .setSize(size).setFrom(size * (page - 1)); 110 | SearchResponse response = searchRequestBuilder.execute().actionGet(); 111 | return warpHits2List(response.getHits()); 112 | } 113 | 114 | /** 115 | * 根据domain获取结果 116 | * 117 | * @param domain 网站域名 118 | * @param size 每页数量 119 | * @param page 页码 120 | * @return 121 | */ 122 | public List getByDomain(String domain, int size, int page) { 123 | SearchRequestBuilder searchRequestBuilder = client.prepareSearch(INDEX_NAME) 124 | .setTypes(TYPE_NAME) 125 | .setQuery(QueryBuilders.matchQuery("domain", domain).operator(Operator.AND)) 126 | .setSize(size).setFrom(size * (page - 1)); 127 | SearchResponse response = searchRequestBuilder.execute().actionGet(); 128 | return warpHits2List(response.getHits()); 129 | } 130 | 131 | /** 132 | * 根据爬虫模板id获取指定爬虫模板 133 | * 134 | * @param id 爬虫模板id 135 | * @return 136 | */ 137 | public SpiderInfo getById(String id) { 138 | GetResponse response = client.prepareGet(INDEX_NAME, TYPE_NAME, id).get(); 139 | Preconditions.checkArgument(response.isExists(), "无法找到ID为%s的模板,请检查参数", id); 140 | return warpHits2Info(response.getSourceAsString(), id); 141 | } 142 | 143 | /** 144 | * 根据网站domain删除数据 145 | * 146 | * @param 
domain 网站域名 147 | * @return 是否全部数据删除成功 148 | */ 149 | public boolean deleteByDomain(String domain) { 150 | return deleteByQuery(QueryBuilders.matchQuery("domain", domain), null); 151 | } 152 | 153 | /** 154 | * 根据id删除网页模板 155 | * 156 | * @param id 网页模板id 157 | * @return 是否删除 158 | */ 159 | public boolean deleteById(String id) { 160 | DeleteResponse response = client.prepareDelete(INDEX_NAME, TYPE_NAME, id).get(); 161 | return response.getResult() == DeleteResponse.Result.DELETED; 162 | } 163 | 164 | /** 165 | * 更新爬虫模板 166 | * 167 | * @param spiderInfo 爬虫模板实体 168 | * @return 爬虫模板id 169 | * @throws ExecutionException 170 | * @throws InterruptedException 171 | */ 172 | public String update(SpiderInfo spiderInfo) throws Exception { 173 | Preconditions.checkArgument(StringUtils.isNotBlank(spiderInfo.getId()), "待更新爬虫模板id不可为空"); 174 | UpdateRequest updateRequest = new UpdateRequest(INDEX_NAME, TYPE_NAME, spiderInfo.getId()); 175 | updateRequest.doc(gson.toJson(spiderInfo)); 176 | UpdateResponse updateResponse = null; 177 | try { 178 | updateResponse = client.update(updateRequest).get(); 179 | return updateResponse.getId(); 180 | } catch (ExecutionException e) { 181 | e.printStackTrace(); 182 | throw new Exception("没有此ID的模板,请删除ID字段的值或者使用正确的id值"); 183 | } 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/gather/async/AsyncGather.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.gather.async; 2 | 3 | import com.gs.spider.model.async.State; 4 | import com.gs.spider.model.async.Task; 5 | import org.apache.logging.log4j.LogManager; 6 | import org.apache.logging.log4j.Logger; 7 | 8 | import java.util.Collection; 9 | import java.util.stream.Collectors; 10 | 11 | /** 12 | * AsyncGather 13 | * 异步抓取器基类,提供任务管理功能 14 | */ 15 | public class AsyncGather { 16 | protected TaskManager taskManager; 17 | protected int longConnectionPort; 18 | private 
Logger logger = LogManager.getLogger(AsyncGather.class); 19 | 20 | public AsyncGather() { 21 | } 22 | 23 | /** 24 | * 获取所有Task,包括已经完成的和未完成的 25 | * 26 | * @param containsExtraInfo 是否显示额外信息 27 | * @return 28 | */ 29 | public Collection getTasks(boolean containsExtraInfo) { 30 | return taskManager.getTasks(containsExtraInfo); 31 | } 32 | 33 | /** 34 | * 获取任务列表,通过状态过滤 35 | * 36 | * @param state 任务状态 37 | * @return 38 | */ 39 | public Collection getTasksFilterByState(State state, boolean containsExtraInfo) { 40 | return taskManager.getTasks(containsExtraInfo).stream().filter(task -> task.getState() == state).collect(Collectors.toList()); 41 | } 42 | 43 | /** 44 | * 获取任务列表,通过时间状态过滤 45 | * 46 | * @param start 开始时间 47 | * @param end 结束时间 48 | * @return 49 | */ 50 | public Collection getTasksFilterByTime(long start, long end, boolean containsExtraInfo) { 51 | return taskManager.getTasks(containsExtraInfo).stream().filter(task -> task.getTime() > start && task.getTime() < end).collect(Collectors.toList()); 52 | } 53 | 54 | /** 55 | * 根据任务ID获取单个任务信息 56 | * 57 | * @param taskId 任务ID 58 | * @return 59 | */ 60 | public Task getTaskById(String taskId, boolean containsExtraInfo) { 61 | return taskManager.getTaskById(taskId, containsExtraInfo); 62 | } 63 | 64 | /** 65 | * 根据任务ID获取当前任务已经获取的数据条数 66 | * 67 | * @param taskId 任务ID 68 | * @return 69 | */ 70 | public int getTaskCount(String taskId) { 71 | return taskManager.getTaskCount(taskId); 72 | } 73 | 74 | /** 75 | * 根据taskId删除任务 76 | * 77 | * @param taskId 任务ID 78 | */ 79 | public void deleteTaskById(String taskId) { 80 | taskManager.deleteTask(taskId); 81 | } 82 | 83 | /** 84 | * 获取长连接服务器端口 85 | * 86 | * @return 87 | */ 88 | public int getLongConnectionPort() { 89 | return this.longConnectionPort; 90 | } 91 | 92 | /** 93 | * 统计指定状态的任务数 94 | * 95 | * @param state 指定的任务状态 96 | * @return 指定任务状态的任务数量 97 | */ 98 | public long countByState(State state) { 99 | return taskManager.countByState(state); 100 | } 101 | } 102 | 
--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/gather/async/quartz/QuartzManager.java:
--------------------------------------------------------------------------------
package com.gs.spider.gather.async.quartz;

import com.google.common.collect.Sets;
import org.apache.commons.lang3.tuple.Pair;
import org.quartz.*;
import org.quartz.impl.matchers.GroupMatcher;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Convenience wrapper around the injected Quartz {@link Scheduler}:
 * adds, looks up, lists and removes jobs, and starts/stops the scheduler.
 */
@Component
public class QuartzManager {
    @Autowired
    private Scheduler scheduler;

    /**
     * Adds a job that is fired repeatedly, forever, every {@code minutes} minutes.
     *
     * @param jobName          job name
     * @param jobGroupName     job group name
     * @param triggerName      trigger name
     * @param triggerGroupName trigger group name
     * @param jobClass         class of the job to execute
     * @param data             entries copied into the job's {@link JobDataMap}
     * @param minutes          repeat interval in minutes
     * @return pair of (trigger key, job key) of the newly scheduled job
     * @throws RuntimeException wrapping any failure raised while building or scheduling the job
     */
    public Pair addJob(String jobName, String jobGroupName, String triggerName, String triggerGroupName, Class jobClass, Map data, int minutes) {
        try {
            // Job name, job group, job implementation class.
            JobDetail jobDetail = JobBuilder.newJob()
                    .ofType(jobClass)
                    .usingJobData(new JobDataMap(data))
                    .withIdentity(jobName, jobGroupName).build();
            // Trigger name, trigger group, fixed-interval schedule.
            Trigger trigger = TriggerBuilder.newTrigger()
                    .forJob(jobName, jobGroupName)
                    .withIdentity(triggerName, triggerGroupName)
                    .withSchedule(SimpleScheduleBuilder.repeatMinutelyForever(minutes))
                    .build();
            // Make sure the scheduler is running before the job is registered.
            if (!scheduler.isShutdown()) {
                scheduler.start();
            }
            scheduler.scheduleJob(jobDetail, trigger);
            return Pair.of(trigger.getKey(), jobDetail.getKey());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Looks up the job detail and the first trigger registered for a job.
     *
     * @param jobKey key of the job
     * @return pair of (job detail, first trigger), or {@code null} when the job has no
     * trigger or the lookup fails
     */
    public Pair findInfo(JobKey jobKey) {
        try {
            JobDetail jobDetail = scheduler.getJobDetail(jobKey);
            List<? extends Trigger> triggers = scheduler.getTriggersOfJob(jobKey);
            if (triggers.isEmpty()) {
                // Same outcome as before (null), without relying on an IndexOutOfBoundsException.
                return null;
            }
            return Pair.of(jobDetail, triggers.get(0));
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Lists the keys of all jobs in the given group.
     *
     * @param jobGroup job group name
     * @return the job keys of that group; an empty set when the scheduler lookup fails
     */
    public Set listAll(String jobGroup) {
        try {
            return scheduler.getJobKeys(GroupMatcher.jobGroupEquals(jobGroup));
        } catch (SchedulerException e) {
            e.printStackTrace();
        }
        return Sets.newConcurrentHashSet();
    }

    /**
     * Removes a job together with every trigger attached to it.
     * <p>
     * The previous implementation only looked at the first trigger and failed with an
     * IndexOutOfBoundsException (leaving the job in place) when the job had no trigger.
     *
     * @param jobKey key of the job to remove
     * @throws RuntimeException wrapping any scheduler failure
     */
    public void removeJob(JobKey jobKey) {
        try {
            for (Trigger trigger : scheduler.getTriggersOfJob(jobKey)) {
                TriggerKey triggerKey = trigger.getKey();
                scheduler.pauseTrigger(triggerKey);   // stop the trigger
                scheduler.unscheduleJob(triggerKey);  // remove the trigger
            }
            scheduler.deleteJob(jobKey);              // delete the job itself
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Starts the scheduler, i.e. all scheduled jobs.
     *
     * @throws RuntimeException wrapping any scheduler failure
     */
    public void startJobs() {
        try {
            scheduler.start();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Shuts the scheduler down unless it is already shut down.
     *
     * @throws RuntimeException wrapping any scheduler failure
     */
    public void shutdownJobs() {
        try {
            if (!scheduler.isShutdown()) {
                scheduler.shutdown();
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/gather/async/quartz/WebpageSpiderJob.java:
--------------------------------------------------------------------------------
package com.gs.spider.gather.async.quartz;

import com.gs.spider.model.commons.SpiderInfo;
import com.gs.spider.service.commons.spider.CommonsSpiderService;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.scheduling.quartz.QuartzJobBean;

/**
 * Quartz job that starts a crawl for the configured {@link SpiderInfo} through
 * {@link CommonsSpiderService}. Concurrent executions of the same job are disallowed.
 */
@DisallowConcurrentExecution
public class WebpageSpiderJob extends QuartzJobBean {
    private static final Logger LOG = LogManager.getLogger(WebpageSpiderJob.class);
    private SpiderInfo spiderInfo;
    private CommonsSpiderService commonsSpiderService;

    /**
     * @param commonsSpiderService service used to start the crawl
     * @return this job, for chaining
     */
    public WebpageSpiderJob setCommonsSpiderService(CommonsSpiderService commonsSpiderService) {
        this.commonsSpiderService = commonsSpiderService;
        return this;
    }

    /**
     * @param spiderInfo crawl template this job runs
     * @return this job, for chaining
     */
    public WebpageSpiderJob setSpiderInfo(SpiderInfo spiderInfo) {
        this.spiderInfo = spiderInfo;
        return this;
    }

    /**
     * Starts the crawl and logs the site name, template id and the id returned by the service.
     */
    @Override
    protected void executeInternal(JobExecutionContext jobExecutionContext) throws JobExecutionException {
        LOG.info("开始定时网页采集任务,网站:{},模板ID:{}", spiderInfo.getSiteName(), spiderInfo.getId());
        String uuid = commonsSpiderService.start(spiderInfo).getResult();
        LOG.info("定时网页采集任务完成,网站:{},模板ID:{},任务ID:{}", spiderInfo.getSiteName(), spiderInfo.getId(), uuid);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/gather/commons/Casperjs.java:
--------------------------------------------------------------------------------
package com.gs.spider.gather.commons;

import com.google.common.base.Preconditions;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import com.gs.spider.model.commons.Request;
import com.gs.spider.utils.HttpClientUtil;
import com.gs.spider.utils.StaticValue;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Map;

/**
 * Client for the external rendering service: posts a {@link Fetch} description as JSON
 * and returns the {@code content} field of the JSON reply.
 */
@Component
public class Casperjs {
    private static final Logger LOG = LogManager.getLogger(Casperjs.class);
    private static final Gson gson = new Gson();

    @Autowired
    private HttpClientUtil httpUtils;

    @Autowired
    private StaticValue staticValue;

    /**
     * Fetches the HTML of a page through the rendering service.
     *
     * @param request request entity; its URL must start with "http"
     * @param url     endpoint of the rendering service
     * @return the {@code content} field of the service reply
     * @throws IOException              if the call fails, or the reply is empty or has no {@code content} field
     * @throws IllegalArgumentException if the request URL is null or does not start with "http"
     */
    private String gatherHtml(Request request, String url) throws IOException {
        String targetUrl = request.getUrl();
        // A null URL is rejected with the same IllegalArgumentException as a malformed one.
        Preconditions.checkArgument(targetUrl != null && targetUrl.startsWith("http"), "url必须以http开头,当前url:%s", targetUrl);
        Fetch fetch = new Fetch().setUrl(targetUrl);
        String json = httpUtils.post(url, gson.toJson(fetch));
        if (json == null) {
            throw new IOException("Empty response from downloader for url: " + targetUrl);
        }
        // Re-interpret the ISO-8859-1 decoded reply as UTF-8 (same conversion as before).
        json = new String(json.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
        JsonElement content = new JsonParser().parse(json).getAsJsonObject().get("content");
        if (content == null || content.isJsonNull()) {
            throw new IOException("Downloader response has no 'content' field for url: " + targetUrl);
        }
        return content.getAsString();
    }

    /**
     * Fetches the HTML of a page using the endpoint configured by
     * {@code staticValue.getAjaxDownloader()} with "html" appended.
     *
     * @param request request entity
     * @return the page HTML
     * @throws IOException if the call fails or the reply carries no content
     */
    public String gatherHtml(Request request) throws IOException {
        return gatherHtml(request, staticValue.getAjaxDownloader() + "html");
    }


    /**
     * Request payload serialized to JSON and sent to the rendering service.
     * All setters return {@code this} so calls can be chained.
     */
    public class Fetch {
        private String proxy = "";
        private int jsViewportWidth = 1024;
        private int jsViewportHeight = 1024;
        private boolean loadImages = false;
        private int timeout = 5;
        private String url;
        private String method = "get";
        private String data = "";
        private Map headers;
        private String jsRunAt;
        private String jsScript;

        public String getProxy() {
            return proxy;
        }

        public Fetch setProxy(String proxy) {
            this.proxy = proxy;
            return this;
        }

        public int getJsViewportWidth() {
            return jsViewportWidth;
        }

        public Fetch setJsViewportWidth(int jsViewportWidth) {
            this.jsViewportWidth = jsViewportWidth;
            return this;
        }

        public int getJsViewportHeight() {
            return jsViewportHeight;
        }

        public Fetch setJsViewportHeight(int jsViewportHeight) {
            this.jsViewportHeight = jsViewportHeight;
            return this;
        }

        public boolean isLoadImages() {
            return loadImages;
        }

        public Fetch setLoadImages(boolean loadImages) {
            this.loadImages = loadImages;
            return this;
        }

        public int getTimeout() {
            return timeout;
        }

        public Fetch setTimeout(int timeout) {
            this.timeout = timeout;
            return this;
        }

        public String getUrl() {
            return url;
        }

        public Fetch setUrl(String url) {
            this.url = url;
            return this;
        }

        public String getMethod() {
            return method;
        }

        public Fetch setMethod(String method) {
            this.method = method;
            return this;
        }

        public String getData() {
            return data;
        }

        public Fetch setData(String data) {
            this.data = data;
            return this;
        }

        public Map getHeaders() {
            return headers;
        }

        public Fetch setHeaders(Map headers) {
            this.headers = headers;
            return this;
        }

        public String getJsRunAt() {
            return jsRunAt;
        }

        public Fetch setJsRunAt(String jsRunAt) {
            this.jsRunAt = jsRunAt;
            return this;
        }

        public String getJsScript() {
            return jsScript;
        }

        public Fetch setJsScript(String jsScript) {
            this.jsScript = jsScript;
            return this;
        }
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/gather/commons/CasperjsDownloader.java:
--------------------------------------------------------------------------------
package com.gs.spider.gather.commons;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.PlainText;

/**
 * WebMagic downloader that obtains the page HTML through {@link Casperjs}.
 */
@Component
public class CasperjsDownloader extends AbstractDownloader {
    private final static Logger LOG = LogManager.getLogger(CasperjsDownloader.class);
    @Autowired
    private Casperjs casperjs;

    /**
     * Downloads the page for {@code request}.
     *
     * @param request request to download
     * @param task    owning task; may be {@code null}, in which case no cycle retry is attempted
     * @return the downloaded page; the result of {@code addToCycleRetry} when the site allows
     * cycle retries and the download failed; otherwise {@code null} on failure
     */
    @Override
    public Page download(Request request, Task task) {
        Site site = task != null ? task.getSite() : null;
        String html;
        try {
            html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
        } catch (Exception e) {
            LOG.warn("Casperjs download failed, url: {}", request.getUrl(), e);
            // site is null when no task was supplied; it was previously dereferenced
            // unconditionally here, turning every failed download into a NullPointerException.
            if (site != null && site.getCycleRetryTimes() > 0) {
                return addToCycleRetry(request, site);
            }
            request.putExtra("EXCEPTION", e);
            onError(request);
            return null;
        }
        Page page = new Page();
        page.setRawText(html);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        onSuccess(request);
        return page;
    }

    /**
     * No-op: this downloader keeps no thread pool of its own.
     */
    @Override
    public void setThread(int threadNum) {
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/gather/commons/PageConsumer.java:
--------------------------------------------------------------------------------
package com.gs.spider.gather.commons;

import com.gs.spider.model.async.Task;
import com.gs.spider.model.commons.SpiderInfo;
import us.codecraft.webmagic.Page;

/**
 * Callback that consumes a downloaded {@link Page} together with the
 * {@link SpiderInfo} template and the {@link Task} it belongs to.
 */
@FunctionalInterface
public interface PageConsumer {
    /**
     * @param page downloaded page
     * @param info crawl template
     * @param task task the page belongs to
     */
    void accept(Page page, SpiderInfo info, Task task);
}

--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/model/async/BaseMsg.java:
--------------------------------------------------------------------------------
package com.gs.spider.model.async;

import java.io.Serializable;

/**
 * Base class of all messages; carries the message type and the id of the client
 * the message belongs to.
 */
public abstract class BaseMsg implements Serializable {
    protected static final long serialVersionUID = 1L;
    private MsgType type;
    // Must be unique, otherwise channel calls get mixed up.
    private String clientId;

    /**
     * Creates a message bound to the given client id.
     *
     * @param clientId id of the client this message belongs to
     */
    public BaseMsg(String clientId) {
        this.clientId = clientId;
    }

    public String getClientId() {
        return clientId;
    }

    public void setClientId(String clientId) {
        this.clientId = clientId;
    }

    public MsgType getType() {
        return type;
    }

    public void setType(MsgType type) {
        this.type = type;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/model/async/CallbackMsg.java:
--------------------------------------------------------------------------------
package com.gs.spider.model.async;

/**
 * Message whose type is fixed to {@link MsgType#CALLBACK}.
 */
public class CallbackMsg extends BaseMsg {

    public CallbackMsg(String clientId) {
        super(clientId);
        this.setType(MsgType.CALLBACK);
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/model/async/CallbackReplyMsg.java:
--------------------------------------------------------------------------------
package com.gs.spider.model.async;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * {@link InfoMsg} whose type is overridden to {@link MsgType#CALLBACK_REPLY}.
 */
public class CallbackReplyMsg extends InfoMsg {

    // Static on purpose: this class is Serializable through BaseMsg, and a per-instance,
    // non-transient Logger field would become part of the serialized form of every message.
    private static final Logger LOG = LogManager.getLogger(CallbackReplyMsg.class);

    public CallbackReplyMsg(String clientId) {
        super(clientId);
        // InfoMsg's constructor sets INFO first; replace it with the reply type.
        this.setType(MsgType.CALLBACK_REPLY);
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/model/async/InfoMsg.java:
--------------------------------------------------------------------------------
package com.gs.spider.model.async;

/**
 * Message of type {@link MsgType#INFO} that carries a free-form text payload.
 */
public class InfoMsg extends BaseMsg {
    private String info;

    public InfoMsg(String clientId) {
        super(clientId);
        this.setType(MsgType.INFO);
    }

    public String getInfo() {
        return info;
    }

    public void setInfo(String info) {
        this.info = info;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/model/async/LoginMsg.java:
--------------------------------------------------------------------------------
package com.gs.spider.model.async;

/**
 * Message whose type is fixed to {@link MsgType#LOGIN}.
 */
public class LoginMsg extends BaseMsg {

    public LoginMsg(String clientId) {
        super(clientId);
        this.setType(MsgType.LOGIN);
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/model/async/MsgType.java:
--------------------------------------------------------------------------------
package com.gs.spider.model.async;

/**
 * Discriminator stored in {@link BaseMsg#getType()}.
 */
public enum MsgType {
    PING, ASK, REPLY, LOGIN, INFO, CALLBACK_REPLY, CALLBACK
}
--------------------------------------------------------------------------------
/src/main/java/com/gs/spider/model/async/PingMsg.java:
--------------------------------------------------------------------------------
package com.gs.spider.model.async;

/**
 * {@link InfoMsg} whose type is overridden to {@link MsgType#PING}.
 */
public class PingMsg extends InfoMsg {
    public PingMsg(String clientId) {
        super(clientId);
        // InfoMsg's constructor sets INFO first; replace it with the ping type.
        this.setType(MsgType.PING);
    }
}
-------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/async/State.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.async; 2 | 3 | public enum State {INIT, RUNNING, STOP, FAIL} -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/async/Task.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.async; 2 | 3 | import com.google.common.collect.Lists; 4 | import com.google.common.collect.Maps; 5 | import io.netty.channel.Channel; 6 | import org.apache.commons.lang3.builder.EqualsBuilder; 7 | import org.apache.commons.lang3.builder.HashCodeBuilder; 8 | 9 | import java.util.Date; 10 | import java.util.LinkedHashMap; 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.concurrent.ScheduledFuture; 14 | import java.util.concurrent.TimeUnit; 15 | 16 | public class Task implements Cloneable { 17 | private String taskId; 18 | private String name; 19 | // private Map descriptions = new TreeMap<>((o1, o2) -> 20 | // o1.after(o2) ? 
1 : -1); 21 | private Map descriptions = new LinkedHashMap<>(); 22 | private State state; 23 | private long time; 24 | private int count; 25 | private List callbackURL = Lists.newArrayList(); 26 | private String callbackPara; 27 | private long period; 28 | private TimeUnit timeUnit; 29 | private Map extraInfo = Maps.newHashMap(); 30 | 31 | public Task(String taskId, String name, long time) { 32 | this.taskId = taskId; 33 | this.name = name; 34 | this.state = State.INIT; 35 | this.time = time; 36 | } 37 | 38 | public String getTaskId() { 39 | return taskId; 40 | } 41 | 42 | public void setTaskId(String taskId) { 43 | this.taskId = taskId; 44 | } 45 | 46 | public State getState() { 47 | return state; 48 | } 49 | 50 | public void setState(State state) { 51 | this.state = state; 52 | } 53 | 54 | public long getTime() { 55 | return time; 56 | } 57 | 58 | public void setTime(long time) { 59 | this.time = time; 60 | } 61 | 62 | public String getName() { 63 | return name; 64 | } 65 | 66 | public void setName(String name) { 67 | this.name = name; 68 | } 69 | 70 | public void setDescription(String description) { 71 | setDescription(description, (Object) null); 72 | } 73 | 74 | /** 75 | * 使用{@link String#format(String, Object...)}格式化字符串 使用%s占位符 76 | * 77 | * @param description 78 | * @param para 79 | */ 80 | public void setDescription(String description, Object... para) { 81 | final String decs = para != null ? String.format(description, para) : description; 82 | Date date = new Date(); 83 | for (Date k : descriptions.keySet()) { 84 | if (Math.abs(k.getTime() - date.getTime()) < 2000) { 85 | descriptions.put(k, descriptions.get(k) + "
" + decs); 86 | return; 87 | } 88 | } 89 | descriptions.put(date, decs); 90 | } 91 | 92 | public int getCount() { 93 | return count; 94 | } 95 | 96 | public void setCount(int count) { 97 | this.count = count; 98 | } 99 | 100 | public List getCallbackURL() { 101 | return callbackURL; 102 | } 103 | 104 | public Task setCallbackURL(List callbackURL) { 105 | this.callbackURL = callbackURL; 106 | return this; 107 | } 108 | 109 | public void addCallbackURL(String callbackURL) { 110 | this.callbackURL.add(callbackURL); 111 | } 112 | 113 | public String getCallbackPara() { 114 | return callbackPara; 115 | } 116 | 117 | public void setCallbackPara(String callbackPara) { 118 | this.callbackPara = callbackPara; 119 | } 120 | 121 | public Map getDescriptions() { 122 | return descriptions; 123 | } 124 | 125 | public Task setDescriptions(Map descriptions) { 126 | this.descriptions = descriptions; 127 | return this; 128 | } 129 | 130 | public long getPeriod() { 131 | return period; 132 | } 133 | 134 | public void setPeriod(long period) { 135 | this.period = period; 136 | } 137 | 138 | public TimeUnit getTimeUnit() { 139 | return timeUnit; 140 | } 141 | 142 | public void setTimeUnit(TimeUnit timeUnit) { 143 | this.timeUnit = timeUnit; 144 | } 145 | 146 | public void increaseCount() { 147 | this.count++; 148 | } 149 | 150 | public Map getExtraInfo() { 151 | return extraInfo; 152 | } 153 | 154 | public void setExtraInfo(Map extraInfo) { 155 | this.extraInfo = extraInfo; 156 | } 157 | 158 | public void addExtraInfo(Object key, Object value) { 159 | extraInfo.put(key, value); 160 | } 161 | 162 | public Object getExtraInfoByKey(Object key) { 163 | return extraInfo.get(key); 164 | } 165 | 166 | @Override 167 | public String toString() { 168 | return "Task{" + "taskId='" + taskId + '\'' + ", name='" + name + '\'' + ", descriptions=" + descriptions 169 | + ", state=" + state + ", time=" + time + ", count=" + count + ", callbackURL='" + callbackURL + '\'' 170 | + ", callbackPara='" + 
callbackPara + '\'' + ", period=" + period + ", timeUnit=" + timeUnit 171 | + ", extraInfo=" + extraInfo + '}'; 172 | } 173 | 174 | @Override 175 | public Object clone() throws CloneNotSupportedException { 176 | return super.clone(); 177 | } 178 | 179 | @Override 180 | public boolean equals(Object o) { 181 | if (this == o) 182 | return true; 183 | 184 | if (o == null || getClass() != o.getClass()) 185 | return false; 186 | 187 | Task task = (Task) o; 188 | 189 | return new EqualsBuilder().append(getTaskId(), task.getTaskId()).append(getName(), task.getName()).isEquals(); 190 | } 191 | 192 | @Override 193 | public int hashCode() { 194 | return new HashCodeBuilder(17, 37).append(getTaskId()).append(getName()).toHashCode(); 195 | } 196 | } -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/commons/LoginInfo.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.commons; 2 | 3 | import java.util.Map; 4 | 5 | /** 6 | * LoginInfo 7 | */ 8 | public class LoginInfo { 9 | 10 | private Map initHeaders; 11 | private String usernameXPath; 12 | private String passwordXPath; 13 | private String clickXPath; 14 | private String username; 15 | private String password; 16 | private String loginUrl; 17 | private String user_agent; 18 | private int timeout = 5000; 19 | 20 | public Map getInitHeaders() { 21 | return initHeaders; 22 | } 23 | 24 | public void setInitHeaders(Map initHeaders) { 25 | this.initHeaders = initHeaders; 26 | } 27 | 28 | public String getUsernameXPath() { 29 | return usernameXPath; 30 | } 31 | 32 | public void setUsernameXPath(String usernameXPath) { 33 | this.usernameXPath = usernameXPath; 34 | } 35 | 36 | public String getPasswordXPath() { 37 | return passwordXPath; 38 | } 39 | 40 | public void setPasswordXPath(String passwordXPath) { 41 | this.passwordXPath = passwordXPath; 42 | } 43 | 44 | public String getUsername() { 45 | return 
username; 46 | } 47 | 48 | public void setUsername(String username) { 49 | this.username = username; 50 | } 51 | 52 | public String getPassword() { 53 | return password; 54 | } 55 | 56 | public void setPassword(String password) { 57 | this.password = password; 58 | } 59 | 60 | public String getLoginUrl() { 61 | return loginUrl; 62 | } 63 | 64 | public void setLoginUrl(String loginUrl) { 65 | this.loginUrl = loginUrl; 66 | } 67 | 68 | public String getUser_agent() { 69 | return user_agent; 70 | } 71 | 72 | public void setUser_agent(String user_agent) { 73 | this.user_agent = user_agent; 74 | } 75 | 76 | public int getTimeout() { 77 | return timeout; 78 | } 79 | 80 | public void setTimeout(int timeout) { 81 | this.timeout = timeout; 82 | } 83 | 84 | public String getClickXPath() { 85 | return clickXPath; 86 | } 87 | 88 | public void setClickXPath(String clickXPath) { 89 | this.clickXPath = clickXPath; 90 | } 91 | 92 | @Override 93 | public String toString() { 94 | return "LoginInfo{" + 95 | "initHeaders=" + initHeaders + 96 | ", usernameXPath='" + usernameXPath + '\'' + 97 | ", passwordXPath='" + passwordXPath + '\'' + 98 | ", clickXPath='" + clickXPath + '\'' + 99 | ", username='" + username + '\'' + 100 | ", password='" + password + '\'' + 101 | ", loginUrl='" + loginUrl + '\'' + 102 | ", user_agent='" + user_agent + '\'' + 103 | ", timeout=" + timeout + 104 | '}'; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/commons/Page.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.commons; 2 | 3 | import org.jsoup.nodes.Document; 4 | 5 | import java.util.Map; 6 | 7 | /** 8 | * Page 9 | */ 10 | public class Page { 11 | public transient Document document; 12 | public transient byte[] responseEntity; 13 | private String rawHtml; 14 | private Map responseHeaders; 15 | private int statusCode; 16 | private String cookies; 
17 | private transient byte[] capture; 18 | public Page() { 19 | } 20 | 21 | public Page(String html) { 22 | this.rawHtml = html; 23 | } 24 | 25 | public String getRawHtml() { 26 | return rawHtml; 27 | } 28 | 29 | public void setRawHtml(String rawHtml) { 30 | this.rawHtml = rawHtml; 31 | } 32 | 33 | public Document getDocument() { 34 | return document; 35 | } 36 | 37 | public void setDocument(Document document) { 38 | this.document = document; 39 | } 40 | 41 | public Map getResponseHeaders() { 42 | return responseHeaders; 43 | } 44 | 45 | public void setResponseHeaders(Map responseHeaders) { 46 | this.responseHeaders = responseHeaders; 47 | } 48 | 49 | public int getStatusCode() { 50 | return statusCode; 51 | } 52 | 53 | public void setStatusCode(int statusCode) { 54 | this.statusCode = statusCode; 55 | } 56 | 57 | public byte[] getResponseEntity() { 58 | return responseEntity; 59 | } 60 | 61 | public void setResponseEntity(byte[] responseEntity) { 62 | this.responseEntity = responseEntity; 63 | } 64 | 65 | public String getCookies() { 66 | return cookies; 67 | } 68 | 69 | public void setCookies(String cookies) { 70 | this.cookies = cookies; 71 | } 72 | 73 | public byte[] getCapture() { 74 | return capture; 75 | } 76 | 77 | public void setCapture(byte[] capture) { 78 | this.capture = capture; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/commons/Request.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.commons; 2 | 3 | 4 | import org.springframework.http.HttpMethod; 5 | 6 | import java.util.Map; 7 | import java.util.function.Function; 8 | 9 | /** 10 | * Request 11 | */ 12 | public class Request { 13 | private String url; 14 | private String user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"; 15 | private HttpMethod httpMethod 
= HttpMethod.GET; 16 | private boolean followRedirect = true; 17 | private boolean ajax = false; 18 | private boolean needLogin = false; 19 | private LoginInfo loginInfo; 20 | private Function> loginFunc; 21 | private Map initHeaders; 22 | private Map para; 23 | private int timeout = 5000; 24 | 25 | public Request(String url) { 26 | this.url = url; 27 | } 28 | 29 | public Request(String url, boolean ajax) { 30 | this.url = url; 31 | this.ajax = ajax; 32 | } 33 | 34 | public Request() { 35 | } 36 | 37 | public String getUrl() { 38 | return url; 39 | } 40 | 41 | public void setUrl(String url) { 42 | this.url = url; 43 | } 44 | 45 | public HttpMethod getHttpMethod() { 46 | return httpMethod; 47 | } 48 | 49 | public void setHttpMethod(HttpMethod httpMethod) { 50 | this.httpMethod = httpMethod; 51 | } 52 | 53 | public boolean isFollowRedirect() { 54 | return followRedirect; 55 | } 56 | 57 | public void setFollowRedirect(boolean followRedirect) { 58 | this.followRedirect = followRedirect; 59 | } 60 | 61 | public boolean isAjax() { 62 | return ajax; 63 | } 64 | 65 | public void setAjax(boolean ajax) { 66 | this.ajax = ajax; 67 | } 68 | 69 | public Map getInitHeaders() { 70 | return initHeaders; 71 | } 72 | 73 | public void setInitHeaders(Map initHeaders) { 74 | this.initHeaders = initHeaders; 75 | } 76 | 77 | public int getTimeout() { 78 | return timeout; 79 | } 80 | 81 | public void setTimeout(int timeout) { 82 | this.timeout = timeout; 83 | } 84 | 85 | public String getUser_agent() { 86 | return user_agent; 87 | } 88 | 89 | public void setUser_agent(String user_agent) { 90 | this.user_agent = user_agent; 91 | } 92 | 93 | public Map getPara() { 94 | return para; 95 | } 96 | 97 | public void setPara(Map para) { 98 | this.para = para; 99 | } 100 | 101 | public boolean isNeedLogin() { 102 | return needLogin; 103 | } 104 | 105 | public void setNeedLogin(boolean needLogin) { 106 | this.needLogin = needLogin; 107 | } 108 | 109 | public LoginInfo getLoginInfo() { 110 | return 
loginInfo; 111 | } 112 | 113 | public void setLoginInfo(LoginInfo loginInfo) { 114 | this.loginInfo = loginInfo; 115 | } 116 | 117 | public Function> getLoginFunc() { 118 | return loginFunc; 119 | } 120 | 121 | public void setLoginFunc(Function> loginFunc) { 122 | this.loginFunc = loginFunc; 123 | } 124 | 125 | @Override 126 | public String toString() { 127 | return "Request{" + 128 | "url='" + url + '\'' + 129 | ", user_agent='" + user_agent + '\'' + 130 | ", httpMethod=" + httpMethod + 131 | ", followRedirect=" + followRedirect + 132 | ", ajax=" + ajax + 133 | ", needLogin=" + needLogin + 134 | ", loginInfo=" + loginInfo + 135 | ", loginFunc=" + loginFunc + 136 | ", initHeaders=" + initHeaders + 137 | ", para=" + para + 138 | ", timeout=" + timeout + 139 | '}'; 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/utils/MySupplier.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.utils; 2 | 3 | /** 4 | * MySupplier 5 | */ 6 | @FunctionalInterface 7 | public interface MySupplier { 8 | 9 | /** 10 | * Gets a result. 
11 | * 12 | * @return a result 13 | */ 14 | T get() throws Exception; 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/utils/ResultBundle.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.utils; 2 | 3 | /** 4 | * 结果集 5 | */ 6 | public class ResultBundle { 7 | /** 8 | * 请求的参数 9 | */ 10 | protected String keyword; 11 | /** 12 | * 返回结果的数量 13 | */ 14 | protected int count; 15 | /** 16 | * 本次调用耗时 17 | */ 18 | protected long time; 19 | /** 20 | * 本次调用是否成功 21 | */ 22 | protected boolean success; 23 | /** 24 | * 如调用出现错误,错误信息 25 | */ 26 | protected String errorMsg; 27 | /** 28 | * 本次调用的追踪ID 29 | */ 30 | protected String traceId; 31 | /** 32 | * 结果 33 | */ 34 | private T result; 35 | 36 | public ResultBundle() { 37 | } 38 | 39 | public ResultBundle(T result, String keyword, long time) { 40 | this.result = result; 41 | this.keyword = keyword; 42 | this.time = time; 43 | this.count = 1; 44 | this.success = true; 45 | } 46 | 47 | public ResultBundle(String keyword, long time, boolean success, String errorMsg) { 48 | result = null; 49 | this.success = success; 50 | this.errorMsg = errorMsg; 51 | this.keyword = keyword; 52 | this.time = time; 53 | this.count = 0; 54 | } 55 | 56 | public T getResult() { 57 | return result; 58 | } 59 | 60 | public void setResult(T result) { 61 | this.result = result; 62 | } 63 | 64 | public String getKeyword() { 65 | return keyword; 66 | } 67 | 68 | public void setKeyword(String keyword) { 69 | this.keyword = keyword; 70 | } 71 | 72 | public int getCount() { 73 | return count; 74 | } 75 | 76 | public void setCount(int count) { 77 | this.count = count; 78 | } 79 | 80 | public long getTime() { 81 | return time; 82 | } 83 | 84 | public void setTime(long time) { 85 | this.time = time; 86 | } 87 | 88 | public boolean isSuccess() { 89 | return success; 90 | } 91 | 92 | public void setSuccess(boolean success) 
{ 93 | this.success = success; 94 | } 95 | 96 | public String getErrorMsg() { 97 | return errorMsg; 98 | } 99 | 100 | public void setErrorMsg(String errorMsg) { 101 | this.errorMsg = errorMsg; 102 | } 103 | 104 | public String getTraceId() { 105 | return traceId; 106 | } 107 | 108 | public void setTraceId(String traceId) { 109 | this.traceId = traceId; 110 | } 111 | 112 | @Override 113 | public String toString() { 114 | return "ResultBundle{" + 115 | "result=" + result + 116 | ", keyword='" + keyword + '\'' + 117 | ", count=" + count + 118 | ", time=" + time + 119 | ", success=" + success + 120 | ", errorMsg='" + errorMsg + '\'' + 121 | ", traceId='" + traceId + '\'' + 122 | '}'; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/utils/ResultBundleBuilder.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.utils; 2 | 3 | import org.apache.logging.log4j.LogManager; 4 | import org.apache.logging.log4j.Logger; 5 | import org.springframework.context.annotation.Scope; 6 | import org.springframework.stereotype.Component; 7 | 8 | import java.util.Collection; 9 | 10 | /** 11 | * ResultBundleBuilder 12 | */ 13 | @Component 14 | @Scope("singleton") 15 | public class ResultBundleBuilder { 16 | private Logger LOG = LogManager.getLogger(ResultBundleBuilder.class); 17 | 18 | public ResultBundle bundle(String keyword, MySupplier supplier) { 19 | ResultBundle resultBundle; 20 | long start = System.currentTimeMillis(); 21 | try { 22 | T t = supplier.get(); 23 | resultBundle = new ResultBundle<>(t, keyword, System.currentTimeMillis() - start); 24 | } catch (Exception e) { 25 | resultBundle = new ResultBundle<>(keyword, System.currentTimeMillis() - start, false, e.getClass().getName() + ":" + e.getLocalizedMessage()); 26 | e.printStackTrace(); 27 | } 28 | return resultBundle; 29 | } 30 | 31 | public ResultListBundle 
listBundle(String keyword, MySupplier> supplier) { 32 | ResultListBundle resultBundle; 33 | long start = System.currentTimeMillis(); 34 | try { 35 | Collection t = supplier.get(); 36 | resultBundle = new ResultListBundle<>(t, keyword, System.currentTimeMillis() - start); 37 | } catch (Exception e) { 38 | resultBundle = new ResultListBundle<>(keyword, System.currentTimeMillis() - start, false, e.getClass().getName() + ":" + e.getLocalizedMessage()); 39 | e.printStackTrace(); 40 | } 41 | return resultBundle; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/utils/ResultBundleResolver.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.utils; 2 | 3 | import com.google.gson.Gson; 4 | import com.google.gson.GsonBuilder; 5 | import com.google.gson.JsonDeserializer; 6 | import com.google.gson.JsonSyntaxException; 7 | import com.google.gson.reflect.TypeToken; 8 | import org.apache.logging.log4j.LogManager; 9 | import org.apache.logging.log4j.Logger; 10 | 11 | import java.lang.reflect.ParameterizedType; 12 | import java.lang.reflect.Type; 13 | import java.text.ParseException; 14 | import java.text.SimpleDateFormat; 15 | import java.util.Date; 16 | import java.util.Locale; 17 | 18 | /** 19 | * ResultBundleResolver 20 | */ 21 | public class ResultBundleResolver { 22 | private static final SimpleDateFormat sdf = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", Locale.US); 23 | private static final Gson gson = new GsonBuilder() 24 | .registerTypeAdapter(Date.class, (JsonDeserializer) (json, typeOfT, context) -> { 25 | try { 26 | return sdf.parse(json.getAsJsonPrimitive().getAsString().replaceAll("\"", "")); 27 | } catch (ParseException e) { 28 | e.printStackTrace(); 29 | } 30 | return null; 31 | }).create(); 32 | private static Logger LOG = LogManager.getLogger(ResultBundleResolver.class); 33 | 34 | /** 35 | * 解析ResultBundle 
36 | * 37 | * @param json 服务器返回的json数据 38 | * @param 39 | * @return 40 | */ 41 | public ResultBundle bundle(String json) { 42 | ResultBundle resultBundle = null; 43 | try { 44 | Type objectType = new TypeToken>() { 45 | }.getType(); 46 | resultBundle = gson.fromJson(json, objectType); 47 | } catch (JsonSyntaxException e) { 48 | LOG.error("无法解析的返回值信息:" + json); 49 | e.printStackTrace(); 50 | } 51 | validate(resultBundle); 52 | return resultBundle; 53 | } 54 | 55 | /** 56 | * 解析ResultBundle 57 | * 58 | * @param json 服务器返回的json数据 59 | * @param 60 | * @return 61 | */ 62 | public ResultBundle bundle(String json, Type classOfT) { 63 | ResultBundle resultBundle = null; 64 | try { 65 | Type objectType = new ParameterizedType() { 66 | public Type getRawType() { 67 | return ResultBundle.class; 68 | } 69 | 70 | public Type[] getActualTypeArguments() { 71 | return new Type[]{classOfT}; 72 | } 73 | 74 | public Type getOwnerType() { 75 | return null; 76 | } 77 | }; 78 | resultBundle = gson.fromJson(json, objectType); 79 | } catch (JsonSyntaxException e) { 80 | LOG.error("无法解析的返回值信息:" + json); 81 | e.printStackTrace(); 82 | } 83 | validate(resultBundle); 84 | return resultBundle; 85 | } 86 | 87 | /** 88 | * 解析ResultListBundle 89 | * 90 | * @param json 服务器返回的json数据 91 | * @param 92 | * @return 93 | */ 94 | public ResultListBundle listBundle(String json, Class classOfT) { 95 | ResultListBundle resultBundle = null; 96 | try { 97 | Type objectType = new ParameterizedType() { 98 | public Type getRawType() { 99 | return ResultListBundle.class; 100 | } 101 | 102 | public Type[] getActualTypeArguments() { 103 | return new Type[]{classOfT}; 104 | } 105 | 106 | public Type getOwnerType() { 107 | return null; 108 | } 109 | }; 110 | resultBundle = gson.fromJson(json, objectType); 111 | } catch (JsonSyntaxException e) { 112 | LOG.error("无法解析的返回值信息:" + json); 113 | e.printStackTrace(); 114 | } 115 | validate(resultBundle); 116 | return resultBundle; 117 | } 118 | 119 | /** 120 | * 
解析ResultListBundle 121 | * 122 | * @param json 服务器返回的json数据 123 | * @param 124 | * @return 125 | */ 126 | public ResultListBundle listBundle(String json) { 127 | ResultListBundle resultBundle = null; 128 | try { 129 | Type objectType = new TypeToken>() { 130 | }.getType(); 131 | resultBundle = gson.fromJson(json, objectType); 132 | } catch (JsonSyntaxException e) { 133 | LOG.error("无法解析的返回值信息:" + json); 134 | e.printStackTrace(); 135 | } 136 | validate(resultBundle); 137 | return resultBundle; 138 | } 139 | 140 | private void validate(ResultBundle resultBundle) { 141 | if (resultBundle == null) { 142 | LOG.error("返回值为空,请检查参数"); 143 | } else if (!resultBundle.isSuccess()) { 144 | LOG.error("调用出错,错误信息为:{},追踪编号:{}", resultBundle.getErrorMsg(), resultBundle.getTraceId()); 145 | } 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/model/utils/ResultListBundle.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.model.utils; 2 | 3 | import java.util.Collection; 4 | import java.util.LinkedList; 5 | 6 | /** 7 | */ 8 | public class ResultListBundle extends ResultBundle { 9 | /** 10 | * 结果 11 | */ 12 | private Collection resultList; 13 | 14 | public ResultListBundle(Collection resultList, String keyword, long time) { 15 | this.resultList = resultList; 16 | this.keyword = keyword; 17 | this.time = time; 18 | this.count = resultList.size(); 19 | this.success = true; 20 | } 21 | 22 | public ResultListBundle(String keyword, long time, boolean success, String errorMsg) { 23 | resultList = new LinkedList<>(); 24 | this.success = success; 25 | this.errorMsg = errorMsg; 26 | this.keyword = keyword; 27 | this.time = time; 28 | this.count = 0; 29 | } 30 | 31 | public Collection getResultList() { 32 | return resultList; 33 | } 34 | 35 | public void setResultList(Collection resultList) { 36 | this.resultList = resultList; 37 | } 38 | 39 | @Override 
40 | public String toString() { 41 | return "ResultListBundle{" + 42 | "resultList=" + resultList + 43 | '}'; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/service/AsyncGatherService.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.service; 2 | 3 | import com.gs.spider.gather.async.AsyncGather; 4 | import com.gs.spider.model.async.State; 5 | import com.gs.spider.model.async.Task; 6 | import com.gs.spider.model.utils.MySupplier; 7 | import com.gs.spider.model.utils.ResultBundle; 8 | import com.gs.spider.model.utils.ResultBundleBuilder; 9 | import com.gs.spider.model.utils.ResultListBundle; 10 | import org.apache.logging.log4j.LogManager; 11 | import org.apache.logging.log4j.Logger; 12 | import org.springframework.beans.factory.annotation.Autowired; 13 | import org.springframework.stereotype.Component; 14 | 15 | import java.util.LinkedList; 16 | import java.util.List; 17 | 18 | /** 19 | * AsyncGatherService 20 | * 异步数据抓取服务,提供任务管理基础方法 21 | */ 22 | @Component 23 | public class AsyncGatherService { 24 | protected AsyncGather asyncGather; 25 | @Autowired 26 | protected ResultBundleBuilder bundleBuilder; 27 | private Logger LOG = LogManager.getLogger(AsyncGatherService.class); 28 | 29 | public AsyncGatherService(AsyncGather asyncGather) { 30 | this.asyncGather = asyncGather; 31 | } 32 | 33 | public AsyncGatherService() { 34 | } 35 | 36 | /** 37 | * 获取task列表,包括正在运行和已经完成的task 38 | * 39 | * @return 40 | */ 41 | public ResultListBundle getTaskList(boolean containsExtraInfo) { 42 | MySupplier> supplier = () -> new LinkedList<>(asyncGather.getTasks(containsExtraInfo)); 43 | return bundleBuilder.listBundle(null, supplier); 44 | } 45 | 46 | /** 47 | * 根据taskid获取task 48 | * 49 | * @param taskId 50 | * @return task 51 | */ 52 | public ResultBundle getTaskById(String taskId, boolean containsExtraInfo) { 53 | MySupplier supplier = () -> 
asyncGather.getTaskById(taskId, containsExtraInfo); 54 | return bundleBuilder.bundle(null, supplier); 55 | } 56 | 57 | /** 58 | * 获取指定task当前已经抓取的文章数 59 | * 60 | * @param taskId 任务ID 61 | * @return task当前已经抓取的文章数 62 | */ 63 | public ResultBundle getTaskCount(String taskId) { 64 | MySupplier supplier = () -> asyncGather.getTaskCount(taskId); 65 | return bundleBuilder.bundle(null, supplier); 66 | } 67 | 68 | /** 69 | * 获取异步抓取长连接服务器端口号 70 | * 71 | * @return 端口号 72 | */ 73 | public ResultBundle getLongConnectionPort() { 74 | MySupplier supplier = () -> asyncGather.getLongConnectionPort(); 75 | return bundleBuilder.bundle(null, supplier); 76 | } 77 | 78 | /** 79 | * 根据taskId删除任务 80 | * 81 | * @param taskId 任务ID 82 | * @return 成功返回OK! 83 | */ 84 | public ResultBundle deleteTaskById(String taskId) { 85 | MySupplier supplier = () -> { 86 | asyncGather.deleteTaskById(taskId); 87 | return "OK!"; 88 | }; 89 | return bundleBuilder.bundle(null, supplier); 90 | } 91 | 92 | /** 93 | * 统计指定任务状态的任务数量 94 | * 95 | * @param state 任务状态 96 | * @return 本状态的任务数 97 | */ 98 | public ResultBundle countByState(State state) { 99 | return bundleBuilder.bundle(state.name(), () -> asyncGather.countByState(state)); 100 | } 101 | 102 | /** 103 | * 获取任务列表,通过状态过滤 104 | * 105 | * @param state 任务状态 106 | * @return 107 | */ 108 | public ResultListBundle getTasksFilterByState(State state, boolean containsExtraInfo) { 109 | return bundleBuilder.listBundle(state.name(), () -> asyncGather.getTasksFilterByState(state, containsExtraInfo)); 110 | } 111 | 112 | /** 113 | * 获取任务列表,通过时间状态过滤 114 | * 115 | * @param start 开始时间 116 | * @param end 结束时间 117 | * @return 118 | */ 119 | public ResultListBundle getTasksFilterByTime(long start, long end, boolean containsExtraInfo) { 120 | return bundleBuilder.listBundle("start:" + start + ",end:" + end, () -> asyncGather.getTasksFilterByTime(start, end, containsExtraInfo)); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- 
/src/main/java/com/gs/spider/service/commons/spiderinfo/SpiderInfoService.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.service.commons.spiderinfo; 2 | 3 | import com.gs.spider.dao.SpiderInfoDAO; 4 | import com.gs.spider.model.commons.SpiderInfo; 5 | import com.gs.spider.model.utils.ResultBundle; 6 | import com.gs.spider.model.utils.ResultBundleBuilder; 7 | import com.gs.spider.model.utils.ResultListBundle; 8 | import org.apache.commons.lang3.StringUtils; 9 | import org.apache.logging.log4j.LogManager; 10 | import org.apache.logging.log4j.Logger; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.stereotype.Component; 13 | 14 | /** 15 | * SpiderInfoService 16 | */ 17 | @Component 18 | public class SpiderInfoService { 19 | private final static Logger LOG = LogManager.getLogger(SpiderInfoService.class); 20 | @Autowired 21 | private SpiderInfoDAO spiderInfoDAO; 22 | @Autowired 23 | private ResultBundleBuilder bundleBuilder; 24 | 25 | /** 26 | * 列出库中所有爬虫模板 27 | * 28 | * @param size 页面容量 29 | * @param page 页码 30 | * @return 31 | */ 32 | public ResultListBundle listAll(int size, int page) { 33 | return bundleBuilder.listBundle(null, () -> spiderInfoDAO.listAll(size, page)); 34 | } 35 | 36 | /** 37 | * 根据domain获取结果 38 | * 39 | * @param domain 网站域名 40 | * @param size 每页数量 41 | * @param page 页码 42 | * @return 43 | */ 44 | public ResultListBundle getByDomain(String domain, int size, int page) { 45 | return bundleBuilder.listBundle(domain, () -> spiderInfoDAO.getByDomain(domain, size, page)); 46 | } 47 | 48 | /** 49 | * 索引爬虫模板 50 | * 51 | * @param spiderInfo 爬虫模板 52 | * @return 如果爬虫模板索引成功则返回模板id, 否则返回null 53 | */ 54 | public ResultBundle index(SpiderInfo spiderInfo) { 55 | return bundleBuilder.bundle(spiderInfo.getDomain(), () -> StringUtils.isBlank(spiderInfo.getId()) ? 
spiderInfoDAO.index(spiderInfo) : spiderInfoDAO.update(spiderInfo)); 56 | } 57 | 58 | /** 59 | * 根据网站domain删除数据 60 | * 61 | * @param domain 网站域名 62 | * @return 是否全部数据删除成功 63 | */ 64 | public ResultBundle deleteByDomain(String domain) { 65 | return bundleBuilder.bundle(domain, () -> spiderInfoDAO.deleteByDomain(domain)); 66 | } 67 | 68 | /** 69 | * 根据id删除网页模板 70 | * 71 | * @param id 网页模板id 72 | * @return 是否删除 73 | */ 74 | public ResultBundle deleteById(String id) { 75 | return bundleBuilder.bundle(id, () -> spiderInfoDAO.deleteById(id)); 76 | } 77 | 78 | /** 79 | * 根据爬虫模板id获取指定爬虫模板 80 | * 81 | * @param id 爬虫模板id 82 | * @return 83 | */ 84 | public ResultBundle getById(String id) { 85 | return bundleBuilder.bundle(id, () -> spiderInfoDAO.getById(id)); 86 | } 87 | 88 | /** 89 | * 更新爬虫模板 90 | * 91 | * @param spiderInfo 爬虫模板实体 92 | * @return 爬虫模板id 93 | */ 94 | public ResultBundle update(SpiderInfo spiderInfo) { 95 | return bundleBuilder.bundle(spiderInfo.getId(), () -> spiderInfoDAO.update(spiderInfo)); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/utils/AppInfo.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.utils; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStreamReader; 5 | import java.util.Properties; 6 | 7 | /** 8 | * 关于程序本身的一些信息 9 | */ 10 | public final class AppInfo { 11 | 12 | /* 名称 */ 13 | public static String APP_NAME; 14 | 15 | /* 版本 */ 16 | public static String APP_VERSION; 17 | 18 | /* 在线文档 */ 19 | public static String ONLINE_DOCUMENTATION; 20 | 21 | static { 22 | Properties appinfo = new Properties(); 23 | try (InputStreamReader isr = new InputStreamReader(AppInfo.class.getResourceAsStream("/appinfo"), "UTF-8");) { 24 | appinfo.load(isr); 25 | APP_NAME = appinfo.getProperty("appName"); 26 | APP_VERSION = appinfo.getProperty("appVersion"); 27 | ONLINE_DOCUMENTATION = 
appinfo.getProperty("onlineDocumentation"); 28 | } catch (IOException e) { 29 | } 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/utils/HANLPExtractor.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.utils; 2 | 3 | import com.google.common.collect.Maps; 4 | import com.hankcs.hanlp.HanLP; 5 | import com.hankcs.hanlp.seg.Segment; 6 | import com.hankcs.hanlp.seg.common.Term; 7 | import org.apache.logging.log4j.LogManager; 8 | import org.apache.logging.log4j.Logger; 9 | import org.springframework.stereotype.Component; 10 | 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.Set; 14 | import java.util.stream.Collectors; 15 | 16 | /** 17 | * NamedEntityExtractor 18 | */ 19 | @Component 20 | public class HANLPExtractor implements NLPExtractor { 21 | private final static Logger LOG = LogManager.getLogger(HANLPExtractor.class); 22 | private static final Segment segment = HanLP.newSegment().enableOrganizationRecognize(true).enablePlaceRecognize(true); 23 | 24 | /** 25 | * 抽取命名实体 26 | * 27 | * @param content 文章正文 28 | * @return map的key是一下三种nr, ns, nt 其value就是对应的词表 29 | */ 30 | public Map> extractNamedEntity(String content) { 31 | List termList = segment.seg(content); 32 | Set nrList = termList.stream().filter(term -> term.nature.startsWith("nr")) 33 | .map(term -> term.word).collect(Collectors.toSet()); 34 | Set nsList = termList.stream().filter(term -> term.nature.startsWith("ns")) 35 | .map(term -> term.word).collect(Collectors.toSet()); 36 | Set ntList = termList.stream().filter(term -> term.nature.startsWith("nt")) 37 | .map(term -> term.word).collect(Collectors.toSet()); 38 | Map> namedEntity = Maps.newHashMap(); 39 | namedEntity.put("nr", nrList); 40 | namedEntity.put("ns", nsList); 41 | namedEntity.put("nt", ntList); 42 | return namedEntity; 43 | } 44 | 45 | /** 46 | * 抽取摘要 47 | * 48 | * @param 
content 文章正文 49 | * @return 摘要句子列表 50 | */ 51 | public List extractSummary(String content) { 52 | return HanLP.extractSummary(content, 5); 53 | } 54 | 55 | /** 56 | * 抽取关键词 57 | * 58 | * @param content 文章正文 59 | * @return 关键词列表 60 | */ 61 | public List extractKeywords(String content) { 62 | return HanLP.extractKeyword(content, 10); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/utils/HttpClientUtil.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.utils; 2 | 3 | import org.apache.commons.httpclient.Cookie; 4 | import org.apache.commons.httpclient.HttpClient; 5 | import org.apache.commons.httpclient.cookie.CookiePolicy; 6 | import org.apache.commons.httpclient.methods.GetMethod; 7 | import org.apache.commons.httpclient.methods.PostMethod; 8 | import org.apache.commons.httpclient.methods.StringRequestEntity; 9 | import org.apache.commons.httpclient.params.HttpMethodParams; 10 | import org.apache.commons.lang.StringUtils; 11 | import org.springframework.context.annotation.Scope; 12 | import org.springframework.stereotype.Component; 13 | 14 | import java.io.IOException; 15 | import java.net.URLEncoder; 16 | import java.util.Map; 17 | 18 | @Component 19 | @Scope("prototype") 20 | public class HttpClientUtil { 21 | static HttpClient hc = new HttpClient(); 22 | 23 | static { 24 | hc.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); 25 | hc.getParams().setParameter(HttpMethodParams.USER_AGENT, "Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2"); 26 | hc.getHttpConnectionManager().getParams().setConnectionTimeout(15000); 27 | hc.getHttpConnectionManager().getParams().setSoTimeout(15000); 28 | } 29 | 30 | public String get(String url) throws IOException { 31 | // clearCookies(); 32 | GetMethod g = new GetMethod(url); 33 | hc.executeMethod(g); 34 | return 
g.getResponseBodyAsString(); 35 | } 36 | 37 | public byte[] getAsByte(String url) throws IOException { 38 | // clearCookies(); 39 | GetMethod g = new GetMethod(url); 40 | hc.executeMethod(g); 41 | return g.getResponseBody(); 42 | } 43 | 44 | public String getWithRealHeader(String url) throws IOException { 45 | // clearCookies(); 46 | GetMethod g = new GetMethod(url); 47 | //////////////////////// 48 | g.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;"); 49 | g.addRequestHeader("Accept-Language", "zh-cn"); 50 | g.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3"); 51 | g.addRequestHeader("Keep-Alive", "300"); 52 | g.addRequestHeader("Connection", "Keep-Alive"); 53 | g.addRequestHeader("Cache-Control", "no-cache"); 54 | /////////////////////// 55 | hc.executeMethod(g); 56 | return g.getResponseBodyAsString(); 57 | } 58 | 59 | 60 | public String get(String url, String cookies) throws 61 | IOException { 62 | // clearCookies(); 63 | GetMethod g = new GetMethod(url); 64 | g.setFollowRedirects(false); 65 | if (StringUtils.isNotEmpty(cookies)) { 66 | g.addRequestHeader("cookie", cookies); 67 | } 68 | hc.executeMethod(g); 69 | return g.getResponseBodyAsString(); 70 | } 71 | 72 | public String get(String url, Cookie[] cookies) throws 73 | IOException { 74 | // clearCookies(); 75 | GetMethod g = new GetMethod(url); 76 | g.setFollowRedirects(false); 77 | hc.getState().addCookies(cookies); 78 | hc.executeMethod(g); 79 | return g.getResponseBodyAsString(); 80 | } 81 | 82 | public String get(String url, String cookies, boolean followRedirects) throws 83 | IOException { 84 | // clearCookies(); 85 | GetMethod g = new GetMethod(url); 86 | g.setFollowRedirects(followRedirects); 87 | if (StringUtils.isNotEmpty(cookies)) { 88 | g.addRequestHeader("cookie", cookies); 89 | } 90 | hc.executeMethod(g); 91 | return g.getResponseBodyAsString(); 92 | } 93 | 94 | public String get(String url, 
boolean followRedirects) throws 95 | IOException { 96 | // clearCookies(); 97 | GetMethod g = new GetMethod(url); 98 | g.setFollowRedirects(followRedirects); 99 | hc.executeMethod(g); 100 | return g.getResponseBodyAsString(); 101 | } 102 | 103 | public String getHeader(String url, String cookies, String headername) throws IOException { 104 | // clearCookies(); 105 | GetMethod g = new GetMethod(url); 106 | g.setFollowRedirects(false); 107 | if (StringUtils.isNotEmpty(cookies)) { 108 | g.addRequestHeader("cookie", cookies); 109 | } 110 | hc.executeMethod(g); 111 | return g.getResponseHeader(headername) == null ? null : g.getResponseHeader(headername).getValue(); 112 | } 113 | 114 | public String post(String postURL, Map partam, String cookies) 115 | throws IOException { 116 | // clearCookies(); 117 | PostMethod p = new PostMethod(postURL); 118 | for (String key : partam.keySet()) { 119 | if (partam.get(key) != null) { 120 | p.setParameter(key, partam.get(key)); 121 | } 122 | } 123 | if (StringUtils.isNotEmpty(cookies)) { 124 | p.addRequestHeader("cookie", cookies); 125 | } 126 | hc.executeMethod(p); 127 | return p.getResponseBodyAsString(); 128 | } 129 | 130 | public String post(String url, String data) throws IOException { 131 | PostMethod post = new PostMethod(url); 132 | if (data != null && !data.isEmpty()) { 133 | post.addRequestHeader("Content-Type", "application/json"); 134 | post.setRequestEntity(new StringRequestEntity(data, "application/json", "utf8")); 135 | } 136 | hc.executeMethod(post); 137 | return post.getResponseBodyAsString(); 138 | } 139 | 140 | public String post(String postURL, Map partam, String cookies, Map header) 141 | throws IOException { 142 | // clearCookies(); 143 | PostMethod p = new PostMethod(postURL); 144 | String reqEntity = ""; 145 | for (Map.Entry entry : partam.entrySet()) { 146 | reqEntity += entry.getKey() + "=" + URLEncoder.encode(entry.getValue(), "utf8") + "&"; 147 | } 148 | // p.setRequestBody(nameValuePair); 149 | 
p.setRequestEntity(new StringRequestEntity(reqEntity)); 150 | if (StringUtils.isNotEmpty(cookies)) { 151 | p.addRequestHeader("cookie", cookies); 152 | } 153 | for (Map.Entry entry : header.entrySet()) { 154 | p.addRequestHeader(entry.getKey(), entry.getValue()); 155 | } 156 | hc.executeMethod(p); 157 | return p.getResponseBodyAsString(); 158 | } 159 | 160 | public String getCookie() { 161 | Cookie[] cookies = hc.getState().getCookies(); 162 | String tmpcookies = ""; 163 | for (Cookie c : cookies) { 164 | tmpcookies += c.toString() + ";"; 165 | } 166 | return tmpcookies; 167 | } 168 | 169 | public void clearCookies() { 170 | hc.getState().clearCookies(); 171 | } 172 | 173 | public void addCookie(String cookie, String domain) { 174 | String[] data = cookie.split(";"); 175 | for (String s : data) { 176 | String[] kvPair = s.split("="); 177 | if (kvPair.length == 2) { 178 | String name = kvPair[0]; 179 | String value = kvPair[1]; 180 | if (!name.equals("path") && !name.equals("domain")) { 181 | hc.getState().addCookie(new Cookie(domain, name, value)); 182 | } 183 | } 184 | } 185 | 186 | } 187 | 188 | } 189 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/utils/NLPExtractor.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.utils; 2 | 3 | import org.springframework.stereotype.Component; 4 | 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.Set; 8 | 9 | /** 10 | * NLPExtractor 11 | */ 12 | @Component 13 | public interface NLPExtractor { 14 | /** 15 | * 抽取命名实体 16 | * 17 | * @param content 文章正文 18 | * @return map的key是一下三种nr, ns, nt 其value就是对应的词表 19 | */ 20 | Map> extractNamedEntity(String content); 21 | 22 | /** 23 | * 抽取摘要 24 | * 25 | * @param content 文章正文 26 | * @return 摘要句子列表 27 | */ 28 | List extractSummary(String content); 29 | 30 | /** 31 | * 抽取关键词 32 | * 33 | * @param content 文章正文 34 | * @return 关键词列表 35 | */ 36 | List 
extractKeywords(String content); 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/gs/spider/utils/TablePage.java: -------------------------------------------------------------------------------- 1 | package com.gs.spider.utils; 2 | 3 | /** 4 | * 表格分页工具 5 | */ 6 | public final class TablePage { 7 | //总记录数 8 | private long totalRow; 9 | //当前页 10 | private int currentPage; 11 | //每页显示条数 12 | private int pageSize; 13 | //总页数 14 | private int pageCount; 15 | //底部显示页码长度 16 | private int showSize = 5; 17 | //底部页面范围 18 | private int[] pageRange; 19 | //其他参数 20 | private String otherParam; 21 | 22 | public TablePage(long totalRow, int currentPage, int pageSize) { 23 | this.totalRow = totalRow; 24 | this.currentPage = currentPage; 25 | this.pageSize = pageSize; 26 | } 27 | 28 | //再次检查参数,计算pageCount 29 | public void checkAgain() { 30 | //检查当前页 31 | if (currentPage < 1) { 32 | currentPage = 1; 33 | } 34 | //判断总记录数 35 | if (totalRow > 0) { 36 | //设置总页数 37 | pageCount = (int) (totalRow / pageSize); 38 | if (totalRow % pageSize != 0 ) { 39 | pageCount ++; 40 | } 41 | 42 | if (currentPage > pageCount) { 43 | currentPage = pageCount; 44 | } 45 | }else { 46 | pageCount = 0; 47 | } 48 | } 49 | 50 | /** 51 | * 根据总页数和当前页,显示最多5项,尽量以当前页为中心 52 | * @return 返回起始和结束位置 53 | */ 54 | public int[] getPageRange(){ 55 | int begin = 1,end = pageCount; 56 | if (pageCount > showSize) { 57 | if (currentPage - 1 <= 2) { 58 | end = showSize; 59 | }else if (currentPage - 1 > 2 && pageCount - currentPage > 2) { 60 | begin = currentPage - 2; 61 | end = currentPage + 2; 62 | }else{ 63 | begin = pageCount - showSize + 1 ; 64 | end = pageCount; 65 | } 66 | } 67 | pageRange = new int[]{begin,end}; 68 | return pageRange; 69 | } 70 | 71 | public String getOtherParam() { 72 | return otherParam; 73 | } 74 | 75 | public void setOtherParam(String otherParam) { 76 | this.otherParam = otherParam; 77 | } 78 | 79 | public long getTotalRow() { 80 | return 
totalRow; 81 | } 82 | 83 | public int getCurrentPage() { 84 | return currentPage; 85 | } 86 | 87 | public int getPageSize() { 88 | return pageSize; 89 | } 90 | 91 | public int getPageCount() { 92 | return pageCount; 93 | } 94 | 95 | public int getShowSize() { 96 | return showSize; 97 | } 98 | 99 | 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/resources/appinfo: -------------------------------------------------------------------------------- 1 | ## some information on this platform 2 | appName=数据采集平台 3 | appVersion=1.0 4 | onlineDocumentation=https://gsh199449.github.io/gather_platform_pages/ 5 | -------------------------------------------------------------------------------- /src/main/resources/commonIndex.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "number_of_shards": "5", 4 | "number_of_replicas": "0", 5 | "max_result_window": 999999 6 | } 7 | } -------------------------------------------------------------------------------- /src/main/resources/datePattern.txt: -------------------------------------------------------------------------------- 1 | \d{4}-\d{2}-\d{2}##yyyy-MM-dd 2 | \d{4}年\d{2}月\d{2}日##yyyy年MM月dd日 3 | \d{4}/\d{2}/\d{2}##yyyy/MM/dd 4 | \d{2}月\d{2}日##MM月dd日 5 | \d{2}-\d{2}##MM-dd 6 | ===== 7 | \d{2}:\d{2}:\d{2}##HH:mm:ss 8 | \d{2}:\d{2}##HH:mm -------------------------------------------------------------------------------- /src/main/resources/ignoredUrls.txt: -------------------------------------------------------------------------------- 1 | zip 2 | exe 3 | doc 4 | docx 5 | png 6 | jpeg 7 | mp3 8 | mp4 9 | mkv 10 | rmvb 11 | ppt 12 | pptx 13 | iso 14 | dmg 15 | apk 16 | jar 17 | pdf 18 | xls 19 | xlsx 20 | js 21 | cpp 22 | gif 23 | rar 24 | xml -------------------------------------------------------------------------------- /src/main/resources/log4j2.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 10 | 11 | %d [%t] [%c] [%p] (%file:%line\)- %m%n 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/main/resources/mvc-dispatcher-servlet.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 47 | 48 | 49 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /src/main/resources/spiderinfo.json: -------------------------------------------------------------------------------- 1 | { 2 | "spiderinfo": { 3 | "properties": { 4 | "charset": { 5 | "type": "keyword" 6 | }, 7 | "defaultCategory": { 8 | "type": "keyword" 9 | }, 10 | "publishTimeXPath": { 11 | "type": "keyword" 12 | }, 13 | "siteName": { 14 | "type": "text", 15 | "analyzer": "index_ansj", 16 | "search_analyzer": "query_ansj", 17 | "include_in_all": true, 18 | "fields": { 19 | "raw": { 20 | "type": "keyword" 21 | } 22 | } 23 | }, 24 | "gatherFirstPage": { 25 | "type": "boolean" 26 | }, 27 | "timeout": { 28 | "type": "long" 29 | }, 30 | "sleep": { 31 | "type": "long" 32 | }, 33 | "startURL": { 34 | "type": "keyword" 35 | }, 36 | "doNLP": { 37 | "type": "boolean" 38 | }, 39 | "titleReg": { 40 | "type": "keyword" 41 | }, 42 | "callbackURL": { 43 | "type": "keyword" 44 | }, 45 | "titleXPath": { 46 | "type": "keyword" 47 | }, 48 | "publishTimeReg": { 49 | "type": "keyword" 50 | }, 51 | "categoryReg": { 52 | "type": "keyword" 53 | }, 54 | "retry": { 55 | "type": "long" 
56 | }, 57 | "contentReg": { 58 | "type": "keyword" 59 | }, 60 | "categoryXPath": { 61 | "type": "keyword" 62 | }, 63 | "needContent": { 64 | "type": "boolean" 65 | }, 66 | "needPublishTime": { 67 | "type": "boolean" 68 | }, 69 | "contentXPath": { 70 | "type": "keyword" 71 | }, 72 | "ajaxSite": { 73 | "type": "boolean" 74 | }, 75 | "needTitle": { 76 | "type": "boolean" 77 | }, 78 | "thread": { 79 | "type": "long" 80 | }, 81 | "priority": { 82 | "type": "long" 83 | }, 84 | "publishTimeFormat": { 85 | "type": "keyword" 86 | }, 87 | "domain": { 88 | "type": "keyword" 89 | }, 90 | "urlReg": { 91 | "type": "keyword" 92 | }, 93 | "maxPageGather": { 94 | "type": "long" 95 | }, 96 | "lang": { 97 | "type": "keyword" 98 | }, 99 | "country": { 100 | "type": "keyword" 101 | }, 102 | "dynamicFields": { 103 | "type": "nested", 104 | "properties": { 105 | "regex": { 106 | "type": "keyword" 107 | }, 108 | "xpath": { 109 | "type": "keyword" 110 | }, 111 | "name": { 112 | "type": "keyword" 113 | }, 114 | "need": { 115 | "type": "boolean" 116 | } 117 | } 118 | } 119 | } 120 | } 121 | } -------------------------------------------------------------------------------- /src/main/resources/staticvalue.json: -------------------------------------------------------------------------------- 1 | { 2 | "esHost": "192.68.0.1", 3 | "esPort": 9300, 4 | "esClusterName": "elasticsearch", 5 | "commonsIndex": "commons", 6 | "maxHttpDownloadLength": 1048576, 7 | "commonsSpiderDebug": true, 8 | "taskDeleteDelay": 1, 9 | "taskDeletePeriod": 2, 10 | "limitOfCommonWebpageDownloadQueue": 100000, 11 | "needRedis": false, 12 | "needEs": true, 13 | "redisPort": 6379, 14 | "redisHost": "127.0.0.1", 15 | "webpageRedisPublishChannelName": "webpage", 16 | "commonsWebpageCrawlRatio": 2, 17 | "ajaxDownloader": "http://localhost:7788/" 18 | } 19 | -------------------------------------------------------------------------------- /src/main/resources/webpage.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "webpage": { 3 | "dynamic_templates": [ 4 | { 5 | "strings": { 6 | "match_mapping_type": "text", 7 | "mapping": { 8 | "type": "text", 9 | "analyzer": "index_ansj", 10 | "search_analyzer": "query_ansj", 11 | "include_in_all": true, 12 | "fields": { 13 | "raw": { 14 | "type": "keyword" 15 | } 16 | } 17 | } 18 | } 19 | } 20 | ], 21 | "properties": { 22 | "content": { 23 | "type": "text", 24 | "analyzer": "index_ansj", 25 | "search_analyzer": "query_ansj", 26 | "include_in_all": true, 27 | "store": true 28 | }, 29 | "title": { 30 | "type": "text", 31 | "analyzer": "index_ansj", 32 | "search_analyzer": "query_ansj", 33 | "include_in_all": true, 34 | "store": true 35 | }, 36 | "dateStr": { 37 | "type": "keyword" 38 | }, 39 | "url": { 40 | "type": "keyword" 41 | }, 42 | "source": { 43 | "type": "text", 44 | "analyzer": "index_ansj", 45 | "search_analyzer": "query_ansj", 46 | "include_in_all": true, 47 | "fields": { 48 | "raw": { 49 | "type": "keyword" 50 | } 51 | } 52 | }, 53 | "id": { 54 | "type": "keyword" 55 | }, 56 | "spiderInfo": { 57 | "type": "keyword" 58 | }, 59 | "gatherTime": { 60 | "type": "date" 61 | }, 62 | "domain": { 63 | "type": "keyword" 64 | }, 65 | "spiderUUID": { 66 | "type": "keyword" 67 | }, 68 | "keywords": { 69 | "type": "keyword" 70 | }, 71 | "summary": { 72 | "type": "text", 73 | "analyzer": "index_ansj", 74 | "search_analyzer": "query_ansj", 75 | "include_in_all": true, 76 | "store": true 77 | }, 78 | "namedEntity": { 79 | "properties": { 80 | "nr": { 81 | "type": "keyword" 82 | }, 83 | "ns": { 84 | "type": "keyword" 85 | }, 86 | "nt": { 87 | "type": "keyword" 88 | } 89 | } 90 | }, 91 | "publishTime": { 92 | "type": "date" 93 | }, 94 | "category": { 95 | "type": "text", 96 | "analyzer": "index_ansj", 97 | "search_analyzer": "query_ansj", 98 | "include_in_all": true, 99 | "fields": { 100 | "raw": { 101 | "type": "keyword" 102 | } 103 | } 104 | }, 105 | 
"rawHTML": { 106 | "type": "binary", 107 | "include_in_all": false 108 | }, 109 | "dynamic_fields": { 110 | "dynamic": true, 111 | "properties": {} 112 | }, 113 | "processTime": { 114 | "type": "long" 115 | } 116 | } 117 | } 118 | } -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | Spring MVC Application 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | CharacterEncodingFilter 20 | 21 | org.springframework.web.filter.CharacterEncodingFilter 22 | 23 | 24 | encoding 25 | utf-8 26 | 27 | 28 | forceEncoding 29 | true 30 | 31 | 32 | 33 | 34 | CharacterEncodingFilter 35 | /* 36 | 37 | 38 | 39 | etagFilter 40 | org.springframework.web.filter.ShallowEtagHeaderFilter 41 | 42 | 43 | etagFilter 44 | /panel/* 45 | 46 | 47 | etagFilter 48 | /js/* 49 | 50 | 51 | etagFilter 52 | /css/* 53 | 54 | 55 | log4jServletFilter 56 | org.apache.logging.log4j.web.Log4jServletFilter 57 | 58 | 59 | log4jServletFilter 60 | /* 61 | REQUEST 62 | FORWARD 63 | INCLUDE 64 | ERROR 65 | 66 | 67 | mvc-dispatcher 68 | org.springframework.web.servlet.DispatcherServlet 69 | 70 | contextConfigLocation 71 | classpath*:/mvc-dispatcher-servlet.xml 72 | 73 | 1 74 | 75 | 76 | 77 | mvc-dispatcher 78 | / 79 | 80 | -------------------------------------------------------------------------------- /src/main/webapp/imgs/logos/logo_without_char_48X48.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemin1003/java-spider/db3e38c7e5ffc8f1fd91c02541d508b91612b39a/src/main/webapp/imgs/logos/logo_without_char_48X48.ico -------------------------------------------------------------------------------- /src/main/webapp/js/messages_zh.min.js: -------------------------------------------------------------------------------- 1 | /*! 
jQuery Validation Plugin - v1.15.0 - 2/24/2016 2 | * http://jqueryvalidation.org/ 3 | * Copyright (c) 2016 Jörn Zaefferer; Licensed MIT */ 4 | !function(a){"function"==typeof define&&define.amd?define(["jquery","../jquery.validate.min"],a):"object"==typeof module&&module.exports?module.exports=a(require("jquery")):a(jQuery)}(function(a){a.extend(a.validator.messages,{required:"这是必填字段",remote:"请修正此字段",email:"请输入有效的电子邮件地址",url:"请输入有效的网址",date:"请输入有效的日期",dateISO:"请输入有效的日期 (YYYY-MM-DD)",number:"请输入有效的数字",digits:"只能输入数字",creditcard:"请输入有效的信用卡号码",equalTo:"你的输入不相同",extension:"请输入有效的后缀",maxlength:a.validator.format("最多可以输入 {0} 个字符"),minlength:a.validator.format("最少要输入 {0} 个字符"),rangelength:a.validator.format("请输入长度在 {0} 到 {1} 之间的字符串"),range:a.validator.format("请输入范围在 {0} 到 {1} 之间的数值"),max:a.validator.format("请输入不大于 {0} 的数值"),min:a.validator.format("请输入不小于 {0} 的数值")})}); -------------------------------------------------------------------------------- /src/main/webapp/js/my.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by gaoshen on 16/5/17. 
3 | */ 4 | var needShowResultModel = false; 5 | function showModal(title, content, cancelAction, confirmAction) { 6 | $("#confirmModalTitle").text(title); 7 | $("#confirmModalBody").html(content); 8 | if (cancelAction != undefined) { 9 | $("#cancelButton").one("click", cancelAction); 10 | } 11 | if (confirmAction != undefined) { 12 | $("#confirmButton").one("click", confirmAction); 13 | } 14 | $('#confirmModal').modal('show'); 15 | } 16 | function inputModal(dataName, callback) { 17 | $('#data').val(''); 18 | $("#inputModalTitle").text("请输入" + dataName); 19 | $("label[for='data']").html(dataName); 20 | $("#confirmInputButton").one("click", function () { 21 | $('#inputModal').modal('hide'); 22 | callback($('#data').val()); 23 | }); 24 | $('#inputModal').modal('show'); 25 | } 26 | function tableModal(data, title) { 27 | $("#tableModalTitle").text(title); 28 | var html = ""; 29 | $.each(data, function (k, v) { 30 | html += '\ 31 | ' + k + '\ 32 | ' + v + '\ 33 | '; 34 | }); 35 | $("#tableModalBody").html(html); 36 | $('#tableModal').modal('show'); 37 | } 38 | function rpc(url, pram, callback) { 39 | $("#confirmModalTitle").text("确定?"); 40 | $("#confirmModalBody").html("确定要执行" + url + "吗?"); 41 | $("#confirmButton").one("click", function () { 42 | $('#confirmModal').modal('hide'); 43 | needShowResultModel = true; 44 | }); 45 | $("#confirmModal").one('hidden.bs.modal', function () { 46 | if (needShowResultModel) { 47 | $.getJSON(url, pram, callback); 48 | } 49 | }); 50 | needShowResultModel = false; 51 | $('#confirmModal').modal('show'); 52 | } 53 | function rpcAndShowData(url, pram) { 54 | rpc(url, pram, function (data) { 55 | needShowResultModel = false; 56 | if (data.success) { 57 | showModal("成功", data.result != undefined ? 
data.result : data.resultList, function () { 58 | $('#confirmModal').modal('hide'); 59 | }, function () { 60 | $('#confirmModal').modal('hide'); 61 | }); 62 | } else { 63 | showModal("失败", "请重试" + data.errorMsg, function () { 64 | $('#confirmModal').modal('hide'); 65 | }, function () { 66 | $('#confirmModal').modal('hide'); 67 | }); 68 | } 69 | }) 70 | } 71 | -------------------------------------------------------------------------------- /src/main/webapp/js/npm.js: -------------------------------------------------------------------------------- 1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment. 2 | require('./umd/util.js') 3 | require('./umd/alert.js') 4 | require('./umd/button.js') 5 | require('./umd/carousel.js') 6 | require('./umd/collapse.js') 7 | require('./umd/dropdown.js') 8 | require('./umd/modal.js') 9 | require('./umd/scrollspy.js') 10 | require('./umd/tab.js') 11 | require('./umd/tooltip.js') 12 | require('./umd/popover.js') -------------------------------------------------------------------------------- /src/main/webapp/pages/commons/allScript.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/main/webapp/pages/commons/head.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | 21 | 41 | 64 | 65 | 93 | 94 |
95 | 117 |
-------------------------------------------------------------------------------- /src/main/webapp/pages/commons/header.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | 3 | 4 | 5 | 6 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/main/webapp/pages/commons/minScript.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /src/main/webapp/pages/commons/tablePage.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 3 | 4 | 5 |
6 |
7 |
8 |
9 |   共 ${tablePage.totalRow } 10 | 条记录/共 11 | ${tablePage.pageCount } 12 | 页 13 |
14 |
15 |
16 |
17 | 66 |
67 |
68 |
69 |
70 |
71 | 72 |
-------------------------------------------------------------------------------- /src/main/webapp/pages/panel/commons/createQuartz.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | <%@taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 3 | <%@taglib prefix="form" uri="http://www.springframework.org/tags/form" %> 4 | 5 | 6 | 7 | 定时网页抓取任务创建 8 | <%@include file="../../commons/header.jsp" %> 9 | 10 | <%@include file="../../commons/head.jsp" %> 11 | 12 |
13 |
14 |
16 |
17 | 18 | 20 |
21 |
22 | 23 | 24 |
25 |
26 | 27 |
28 |
29 |
30 |
31 | <%@include file="../../commons/allScript.jsp" %> 32 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /src/main/webapp/pages/panel/commons/domainList.jsp: -------------------------------------------------------------------------------- 1 | <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 2 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 3 | 4 | 5 | 网站列表 6 | <%@include file="../../commons/header.jsp" %> 7 | 8 | 9 | <%@include file="../../commons/head.jsp" %> 10 |
11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 31 | 34 | 37 | 38 | 39 | 40 |
#网站域名资讯数查看列表删除导出数据
${index.count}${domain.key}${domain.value}查看资讯列表 30 | 删除网站数据 33 | 导出该网站数据JSON 36 |
41 |
42 |
43 | <%@include file="../../commons/allScript.jsp" %> 44 | 45 | -------------------------------------------------------------------------------- /src/main/webapp/pages/panel/commons/list.jsp: -------------------------------------------------------------------------------- 1 | <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 2 | <%@ taglib uri="http://java.sun.com/jsp/jstl/fmt" prefix="fmt" %> 3 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 4 | 5 | 6 | 资讯列表 7 | <%@include file="../../commons/header.jsp" %> 8 | <%@include file="../../commons/allScript.jsp" %> 9 | 65 | 71 | 72 | 73 | 74 | <%@include file="../../commons/head.jsp" %> 75 | 76 |
77 |
78 |
79 |
80 |
81 |
82 | 83 | 84 |
85 |
86 |
87 |
88 | (*支持模糊) 89 | 90 |
91 |
92 |
93 | 94 |   95 | 重置 96 |
97 |
98 |
99 |
100 |
101 |
102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 125 | 129 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 |
#标题网站时间查看转到删除
${wpIndex.count}${webpage.title}${webpage.domain} 123 | 124 | 126 | Go 128 | 130 | 132 |

无数据!

143 |
144 | 145 | <%@include file="../../commons/tablePage.jsp" %> 146 |
147 |
148 | 149 | -------------------------------------------------------------------------------- /src/main/webapp/pages/panel/commons/listQuartz.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | <%@taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 3 | <%@taglib prefix="form" uri="http://www.springframework.org/tags/form" %> 4 | <%@ taglib prefix="fmt" uri="http://java.sun.com/jsp/jstl/fmt" %> 5 | 6 | 7 | 8 | 定时网页抓取任务列表 9 | <%@include file="../../commons/header.jsp" %> 10 | 11 | 12 | 13 | <%@include file="../../commons/head.jsp" %> 14 |
15 |
16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 32 | 34 | 36 | 40 | 41 | 42 | 43 |
网站名称上次执行时间下次执行时间创建时间删除任务
${entry.value.left.siteName} 37 | 39 |
44 |
45 |
46 | 47 | <%@include file="../../commons/minScript.jsp" %> 48 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /src/main/webapp/pages/panel/commons/listSpiderInfo.jsp: -------------------------------------------------------------------------------- 1 | <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 2 | <%@ taglib uri="http://java.sun.com/jsp/jstl/fmt" prefix="fmt" %> 3 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 4 | 5 | 6 | 爬虫模板列表 7 | <%@include file="../../commons/header.jsp" %> 8 | <%@include file="../../commons/allScript.jsp" %> 9 | 44 | 45 | 46 | <%@include file="../../commons/head.jsp" %> 47 |
48 |
50 |
51 | 52 | 53 |
54 |
55 | 56 | 57 |
58 | 59 |
60 |
61 |
62 |
63 | 64 | 65 |
66 | 67 | 68 | 69 | 70 | 71 | 72 | <%----%> 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 84 | 85 | 86 | <%----%> 87 | 91 | 97 | 101 | 102 | 103 | 104 |
#网站域名网站名称查看数据编辑删除定时任务
${info.domain}${info.siteName} 88 | 编辑 90 | 92 | 96 | 98 | 创建定时任务 100 |
105 |
106 | 107 | 108 | -------------------------------------------------------------------------------- /src/main/webapp/pages/panel/commons/showRelatedInfo.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 3 | 4 | 5 | 相关资讯 6 | <%@include file="/pages/commons/header.jsp" %> 7 | 8 | 9 | <%@include file="/pages/commons/head.jsp" %> 10 |
11 |
12 |

13 | ${title} 14 |

15 |
16 |
17 |
18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 |
人物名称提及次数
${bucket.key} ${bucket.docCount}
34 |
35 |
36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 |
地点名称提及次数
${bucket.key} ${bucket.docCount}
54 |
55 |
56 |
57 |
58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 |
机构名称提及次数
${bucket.key} ${bucket.docCount}
76 |
77 |
78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 |
关键词提及次数
${bucket.key} ${bucket.docCount}
96 |
97 |
98 |
99 | <%@include file="../../commons/minScript.jsp" %> 100 | 101 | -------------------------------------------------------------------------------- /src/main/webapp/pages/panel/commons/showWebpageById.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 3 | <%@ taglib prefix="fmt" uri="http://java.sun.com/jsp/jstl/fmt" %> 4 | 5 | 6 | ${webpage.title} 7 | <%@include file="/pages/commons/header.jsp" %> 8 | 9 | 10 | <%@include file="/pages/commons/head.jsp" %> 11 |
12 |
13 |
14 |

${webpage.title}

15 |

16 | ${webpage.content} 17 |

18 |

网页元信息

19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 46 | 47 | 48 | 49 | 55 | 56 | 57 | 58 | 64 | 65 | 66 | 67 | 73 | 74 | 75 | 76 | 79 | 80 | 81 | 82 | 85 | 86 | 87 | 88 | 91 | 92 | 93 | 94 | 98 | 99 | 100 | 101 | 105 | 106 | 107 |
属性名称属性值
关键词 30 | 31 | ${word} 32 | 33 |
类别${webpage.category}
摘要 42 | 43 | ${sentence}, 44 | 45 |
人名 50 | 51 | ${word} 53 | 54 |
地名 59 | 60 | ${word} 62 | 63 |
机构名 68 | 69 | ${word} 71 | 72 |
发布时间 77 | 78 |
采集时间 83 | 84 |
网页处理耗时 89 | ${webpage.processTime/(1000.0)}秒 90 |
采集模板 95 | 查看爬虫模板 97 |
原网站 102 | 查看${webpage.domain}的新闻 104 |
108 |

动态字段

109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 |
${entry.key}
${entry.value}
125 |

126 | 阅读原文 127 |

128 |
129 |
130 |

相关资讯

131 | 138 |
139 |
140 |
141 | <%@include file="../../commons/minScript.jsp" %> 142 | 143 | -------------------------------------------------------------------------------- /src/main/webapp/pages/panel/commons/updateBySpiderInfoID.jsp: -------------------------------------------------------------------------------- 1 | <%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> 2 | <%@ taglib uri="http://java.sun.com/jsp/jstl/fmt" prefix="fmt" %> 3 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 4 | 5 | 6 | 数据更新 7 | <%@include file="../../commons/header.jsp" %> 8 | <%@include file="../../commons/allScript.jsp" %> 9 | 45 | 46 | 47 | <%@include file="../../commons/head.jsp" %> 48 |
49 |
50 |
51 | 52 | 55 |
56 |
57 | 58 | 60 |
61 |
62 | 63 | 64 |
65 | 66 |
67 |
68 | 69 | -------------------------------------------------------------------------------- /src/main/webapp/pages/panel/welcome/welcome.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=UTF-8" language="java" %> 2 | 3 | 4 | ${appName } 5 | <%@include file="../../commons/header.jsp" %> 6 | 7 | 8 | <%@include file="../../commons/head.jsp" %> 9 |
10 |
11 |
12 |

欢迎使用${appName }   Version:${appVersion }

13 | 14 |
15 | 16 | 21 |
22 | 23 | <%-- 24 |
25 | 26 |
27 |
28 |

当前版本:${appVersion }

29 |
    30 |
  • 优化..
  • 31 |
  • 修复..
  • 32 |
33 |
34 |
35 | 36 | 37 | 38 |
39 |
40 |

历史版本  显示

41 | 59 |
60 |
61 | 62 |
63 | --%> 64 |
65 | 66 | <%@include file="../../commons/minScript.jsp" %> 67 | <%-- --%> 78 | 79 | 80 | --------------------------------------------------------------------------------