├── docs
│   └── tutorial.md
├── crawler-server
│   ├── .gitignore
│   ├── src
│   │   └── main
│   │       ├── webapp
│   │       │   ├── META-INF
│   │       │   │   └── context.xml
│   │       │   ├── index.jsp
│   │       │   ├── fonts
│   │       │   │   ├── glyphicons-halflings-regular.eot
│   │       │   │   ├── glyphicons-halflings-regular.ttf
│   │       │   │   ├── glyphicons-halflings-regular.woff
│   │       │   │   └── glyphicons-halflings-regular.woff2
│   │       │   ├── WEB-INF
│   │       │   │   └── web.xml
│   │       │   ├── js
│   │       │   │   ├── spider-list.js
│   │       │   │   └── bootstrap.min.js
│   │       │   └── jsp
│   │       │       ├── new-employee.jsp
│   │       │       └── spider-list.jsp
│   │       └── java
│   │           └── com
│   │               └── github
│   │                   └── xbynet
│   │                       └── crawler
│   │                           └── server
│   │                               ├── Main.java
│   │                               ├── HelloServlet.java
│   │                               ├── monitor
│   │                               │   ├── SpiderManager.java
│   │                               │   └── MonitorServlet.java
│   │                               └── demo
│   │                                   └── GithubCrawler.java
│   └── pom.xml
├── crawler-core
│   ├── src
│   │   ├── main
│   │   │   └── java
│   │   │       └── com
│   │   │           └── github
│   │   │               └── xbynet
│   │   │                   └── crawler
│   │   │                       ├── parser
│   │   │                       │   ├── Parser.java
│   │   │                       │   ├── JsonPathParser.java
│   │   │                       │   ├── XpathParser.java
│   │   │                       │   └── JsoupParser.java
│   │   │                       ├── ISpider.java
│   │   │                       ├── SpiderListener.java
│   │   │                       ├── Const.java
│   │   │                       ├── http
│   │   │                       │   ├── Downloader.java
│   │   │                       │   ├── FileDownloader.java
│   │   │                       │   ├── CustomRedirectStrategy.java
│   │   │                       │   ├── DefaultDownloader.java
│   │   │                       │   ├── HttpClientFactory.java
│   │   │                       │   └── AbsDownloader.java
│   │   │                       ├── annotation
│   │   │                       │   └── Nullable.java
│   │   │                       ├── scheduler
│   │   │                       │   ├── DuplicateRemover.java
│   │   │                       │   ├── Scheduler.java
│   │   │                       │   ├── DefaultScheduler.java
│   │   │                       │   └── RedisScheduler.java
│   │   │                       ├── IpProxyProvider.java
│   │   │                       ├── RequestAction.java
│   │   │                       ├── utils
│   │   │                       │   ├── BeanUtil.java
│   │   │                       │   ├── CrawlerUtils.java
│   │   │                       │   └── CountableThreadPool.java
│   │   │                       ├── Processor.java
│   │   │                       ├── Site.java
│   │   │                       ├── Response.java
│   │   │                       ├── Request.java
│   │   │                       └── Spider.java
│   │   └── test
│   │       ├── java
│   │       │   └── net
│   │       │       └── xby1993
│   │       │           └── crawler
│   │       │               ├── StartAllJoke.java
│   │       │               ├── AppTest.java
│   │       │               ├── ZhihuRecommendCrawler.java
│   │       │               ├── OSChinaTweetsCrawler.java
│   │       │               ├── QiushibaikeCrawler.java
│   │       │               ├── NeihanshequCrawler.java
│   │       │               └── GithubCrawler.java
│   │       └── resources
│   │           └── logback.xml
│   └── pom.xml
├── crawler-selenium
│   ├── src
│   │   └── main
│   │       └── java
│   │           └── com
│   │               └── github
│   │                   └── xbynet
│   │                       └── crawler
│   │                           └── selenium
│   │                               ├── SeleniumAction.java
│   │                               ├── WebDriverPool.java
│   │                               ├── getCssAttr.js
│   │                               ├── ImageRegion.java
│   │                               ├── ImageUtil.java
│   │                               ├── WebDriverManager.java
│   │                               ├── SeleniumDownloader.java
│   │                               ├── PhantomjsWebDriverPool.java
│   │                               └── WindowUtil.java
│   └── pom.xml
├── .gitignore
├── LICENSE
├── README.md
└── pom.xml
/docs/tutorial.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/crawler-server/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
--------------------------------------------------------------------------------
/crawler-server/src/main/webapp/META-INF/context.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/crawler-server/src/main/webapp/index.jsp:
--------------------------------------------------------------------------------
1 | <html>
2 | <body>
3 | <h2>Hello World!</h2>
4 | </body>
5 | </html>
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/parser/Parser.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.parser;
2 |
3 | public interface Parser {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/ISpider.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler;
2 |
3 | public interface ISpider {
4 | String getName();
5 |
6 | }
7 |
--------------------------------------------------------------------------------
/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.eot
--------------------------------------------------------------------------------
/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.ttf
--------------------------------------------------------------------------------
/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff
--------------------------------------------------------------------------------
/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff2
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/SpiderListener.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler;
2 |
3 | public interface SpiderListener {
4 | void success(Spider spider,Request request);
5 | void fail(Spider spider,Request request,Exception e);
6 | }
7 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/Const.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler;
2 |
3 | public class Const {
4 | public enum HttpMethod{
5 | GET,POST,HEAD
6 | }
7 | public enum CssAttr{
8 | innerHtml,text,allText
9 | }
10 | public enum ResponseType{
11 | TEXT,BIN
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/SeleniumAction.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.selenium;
2 |
3 | import org.openqa.selenium.WebDriver;
4 |
5 | /**
6 | * @author taojw
7 | *
8 | */
9 | public interface SeleniumAction {
10 | void execute(WebDriver driver);
11 | }
12 |
--------------------------------------------------------------------------------
/crawler-core/src/test/java/net/xby1993/crawler/StartAllJoke.java:
--------------------------------------------------------------------------------
1 | package net.xby1993.crawler;
2 |
3 | public class StartAllJoke {
4 | public static void main(String[] args) {
5 | new OSChinaTweetsCrawler().start();
6 | new QiushibaikeCrawler().start();
7 | new NeihanshequCrawler().start();
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/WebDriverPool.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.selenium;
2 |
3 | import org.openqa.selenium.WebDriver;
4 |
5 | public interface WebDriverPool {
6 | WebDriver get() throws InterruptedException;
7 | void returnToPool(WebDriver webDriver);
8 | void close(WebDriver webDriver);
9 | void shutdown();
10 | }
11 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/http/Downloader.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.http;
2 |
3 | import java.io.Closeable;
4 |
5 | import com.github.xbynet.crawler.Request;
6 | import com.github.xbynet.crawler.Spider;
7 |
8 | public interface Downloader extends Closeable{
9 | void init();
10 | void download(Request request);
11 | void setSpider(Spider spider);
12 | }
13 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/annotation/Nullable.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.annotation;
2 |
3 | import java.lang.annotation.ElementType;
4 | import java.lang.annotation.Retention;
5 | import java.lang.annotation.RetentionPolicy;
6 | import java.lang.annotation.Target;
7 |
8 | @Target(ElementType.PARAMETER)
9 | @Retention(RetentionPolicy.SOURCE)
10 | public @interface Nullable {
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/crawler-server/src/main/webapp/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 | <!DOCTYPE web-app PUBLIC
2 |  "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
3 |  "http://java.sun.com/dtd/web-app_2_3.dtd" >
4 | 
5 | <web-app>
6 | 
7 |   <display-name>Archetype Created Web Application</display-name>
8 | </web-app>
9 | 
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/DuplicateRemover.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.scheduler;
2 |
3 | import com.github.xbynet.crawler.ISpider;
4 | import com.github.xbynet.crawler.Request;
5 |
6 | public interface DuplicateRemover {
7 | public boolean isDuplicate(Request request, ISpider spider);
8 | public void resetDuplicateCheck(ISpider spider);
9 | public int getTotalRequestsCount(ISpider spider);
10 |
11 | }
--------------------------------------------------------------------------------
/crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/getCssAttr.js:
--------------------------------------------------------------------------------
1 | function getStyle(obj, attr) {
2 | if (obj.currentStyle) {
3 | return obj.currentStyle[attr];
4 | } else {
5 | return document.defaultView.getComputedStyle(obj, null)[attr];
6 | }
7 | }
8 | function getCssAttr(sel,attr){
9 | var tmp=document.querySelector(sel);
10 | var res=getStyle(tmp,attr);
11 | return res;
12 | }
13 | return getCssAttr(arguments[0],arguments[1]);
--------------------------------------------------------------------------------
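
Usage sketch (editor's illustration, not a file in the repository): the trailing "return getCssAttr(arguments[0], arguments[1]);" means this script is meant to be fed to Selenium's JavascriptExecutor, which supplies the two arguments. The classpath resource location below is an assumption; adjust it to wherever the build packages getCssAttr.js.

    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;
    import org.apache.commons.io.IOUtils;
    import org.openqa.selenium.JavascriptExecutor;
    import org.openqa.selenium.WebDriver;

    public class GetCssAttrExample {
        public static String cssAttr(WebDriver driver, String selector, String attr) throws Exception {
            // assumed resource location for the script shipped with crawler-selenium
            InputStream in = GetCssAttrExample.class
                    .getResourceAsStream("/com/github/xbynet/crawler/selenium/getCssAttr.js");
            String script = IOUtils.toString(in, StandardCharsets.UTF_8);
            // the script's top-level return becomes the return value of executeScript
            return (String) ((JavascriptExecutor) driver).executeScript(script, selector, attr);
        }
    }
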
/crawler-server/src/main/webapp/js/spider-list.js:
--------------------------------------------------------------------------------
1 | // Toggle a spider between started and stopped via the /monitor endpoint.
2 | // Button labels: '停止' = "Stop", '启动' = "Start".
3 | function changeState(name){
4 | var t=$("#stateBtn").text().trim();
5 | var method='start';
6 | if(t=='停止'){
7 | method='stop';
8 | }
9 | $.get(baseUrl+"monitor?name="+name+"&method="+method,function(data){
10 | if(data=='true'){
11 | $("#stateBtn").text(method=='start'?'停止':'启动');
12 | $("#status").text(method=='start'?"running":"stopping...");
13 | }else{
14 | alert("Request failed: "+data);
15 | }
16 | },"text");
17 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.project
2 | /.settings
3 | /target
4 | /.classpath
5 | /crawler-core/target
6 | /crawler-selenium/target
7 | /crawler-core/.project
8 | /crawler-core/.settings
9 | /crawler-core/.classpath
10 | /crawler-selenium/.project
11 | /crawler-selenium/.settings
12 | /crawler-selenium/.classpath
13 | /crawler-server/.tern-project
14 | /crawler-server/.settings
15 | /crawler-server/target
16 | /crawler-server/tomcat.8666
17 | /crawler-server/.classpath
18 | /crawler-server/.project
19 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/Scheduler.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.scheduler;
2 |
3 | import com.github.xbynet.crawler.ISpider;
4 | import com.github.xbynet.crawler.Request;
5 |
6 | public interface Scheduler {
7 | public void push(Request request,ISpider spider);
8 | public Request poll(ISpider spider);
9 | public int getLeftRequestsCount(ISpider spider);
10 | public int getTotalRequestsCount(ISpider spider);
11 | public DuplicateRemover getDuplicateRemover();
12 | }
13 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/IpProxyProvider.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler;
2 |
3 | import java.io.Closeable;
4 | import java.io.IOException;
5 |
6 | import org.apache.http.HttpHost;
7 |
8 | public class IpProxyProvider implements Closeable{
9 |
10 | public HttpHost getIp(){
11 | return null;
12 | }
13 | public void invalid(HttpHost host){
14 |
15 | }
16 | public void valid(HttpHost host){
17 |
18 | }
19 | @Override
20 | public void close() throws IOException {
21 |
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
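
IpProxyProvider as shipped is a no-op: getIp() returns null, meaning no proxy is used. A hedged sketch of a subclass that round-robins over a fixed list (editor's illustration; the proxy hosts are placeholders):

    import java.util.Arrays;
    import java.util.List;
    import java.util.concurrent.atomic.AtomicInteger;
    import org.apache.http.HttpHost;
    import com.github.xbynet.crawler.IpProxyProvider;

    public class FixedListProxyProvider extends IpProxyProvider {
        // placeholder proxies; replace with real ones
        private final List<HttpHost> hosts = Arrays.asList(
                new HttpHost("127.0.0.1", 8888),
                new HttpHost("127.0.0.1", 8889));
        private final AtomicInteger next = new AtomicInteger();

        @Override
        public HttpHost getIp() {
            // rotate through the configured proxies
            return hosts.get(Math.abs(next.getAndIncrement() % hosts.size()));
        }
    }
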
/crawler-core/src/main/java/com/github/xbynet/crawler/RequestAction.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler;
2 |
3 | import java.io.Serializable;
4 |
5 | import org.apache.http.client.methods.CloseableHttpResponse;
6 | import org.apache.http.client.methods.HttpUriRequest;
7 | import org.apache.http.impl.client.CloseableHttpClient;
8 |
9 | public interface RequestAction extends Serializable {
10 | void before(CloseableHttpClient client,HttpUriRequest req);
11 | void after(CloseableHttpClient client,CloseableHttpResponse resp);
12 | }
13 |
--------------------------------------------------------------------------------
/crawler-core/src/test/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <!-- reconstructed: appender name and root level are assumed; the original markup was stripped from this dump -->
2 | <configuration>
3 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
4 |     <encoder>
5 |       <pattern>%-5level %msg [%logger{16} %d{HH:mm:ss}]%n</pattern>
6 |     </encoder>
7 |   </appender>
8 |   <root level="INFO">
9 |     <appender-ref ref="STDOUT" />
10 |   </root>
11 | </configuration>
12 | 
--------------------------------------------------------------------------------
/crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/ImageRegion.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.selenium;
2 |
3 | /**
4 | * @author taojw
5 | *
6 | */
7 | public class ImageRegion {
8 | public int x;
9 | public int y;
10 | public int width;
11 | public int height;
12 | public ImageRegion(int x,int y,int width,int height){
13 | this.x=x;
14 | this.y=y;
15 | this.width=width;
16 | this.height=height;
17 | }
18 | @Override
19 | public String toString() {
20 | return "ImageRegion [x=" + x + ", y=" + y + ", width=" + width
21 | + ", height=" + height + "]";
22 | }
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/crawler-core/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <parent>
5 |     <groupId>com.github.xbynet</groupId>
6 |     <artifactId>crawler-parent</artifactId>
7 |     <version>0.3.0</version>
8 |   </parent>
9 |   <artifactId>crawler-core</artifactId>
10 |   <packaging>jar</packaging>
11 |   <dependencies>
12 |     <dependency>
13 |       <groupId>redis.clients</groupId>
14 |       <artifactId>jedis</artifactId>
15 |     </dependency>
16 |   </dependencies>
17 | </project>
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/utils/BeanUtil.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.utils;
2 |
3 | import java.util.concurrent.ConcurrentHashMap;
4 |
5 | import net.sf.cglib.beans.BeanCopier;
6 |
7 | public class BeanUtil {
8 | public static ConcurrentHashMap<String, BeanCopier> beanCopierMap = new ConcurrentHashMap<String, BeanCopier>();
9 | 
10 | public static void copyProperties(Object source, Object target) {
11 | String beanKey = generateKey(source.getClass(), target.getClass());
12 | // create the copier only on a cache miss instead of on every call
13 | BeanCopier copier = beanCopierMap.get(beanKey);
14 | if (copier == null) {
15 | beanCopierMap.putIfAbsent(beanKey, BeanCopier.create(source.getClass(), target.getClass(), false));
16 | copier = beanCopierMap.get(beanKey);
17 | }
18 | copier.copy(source, target, null);
19 | }
20 | 
21 | private static String generateKey(Class<?> class1, Class<?> class2) {
22 | return class1.toString() + class2.toString();
23 | }
24 | }
25 | 
--------------------------------------------------------------------------------
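
Usage sketch (editor's illustration): cglib's BeanCopier copies properties whose getter/setter names and types match, so the two beans below share a "name" property.

    import com.github.xbynet.crawler.utils.BeanUtil;

    public class BeanUtilExample {
        public static class Src {
            private String name;
            public String getName() { return name; }
            public void setName(String name) { this.name = name; }
        }
        public static class Dst {
            private String name;
            public String getName() { return name; }
            public void setName(String name) { this.name = name; }
        }
        public static void main(String[] args) {
            Src src = new Src();
            src.setName("crawler");
            Dst dst = new Dst();
            BeanUtil.copyProperties(src, dst); // copies the matching "name" property
            System.out.println(dst.getName()); // -> crawler
        }
    }
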
/crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/ImageUtil.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.selenium;
2 |
3 | import java.io.IOException;
4 |
5 | import net.coobird.thumbnailator.Thumbnails;
6 |
7 | /**
8 | * @author taojw
9 | *
10 | */
11 | public class ImageUtil {
12 | public static void crop(String srcfile,String destfile,ImageRegion region){
13 | //指定坐标
14 | try {
15 | Thumbnails.of(srcfile)
16 | .sourceRegion(region.x, region.y, region.width, region.height)
17 | .size(region.width, region.height).outputQuality(1.0)
18 | //.keepAspectRatio(false) //不保持比例
19 | .toFile(destfile);
20 | } catch (IOException e) {
21 | // TODO Auto-generated catch block
22 | e.printStackTrace();
23 | }
24 | }
25 | public static void main(String[] args) {
26 | crop("D:\\data\\111.png","D:\\data\\1112.png",new ImageRegion(66, 264, 422, 426));
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/crawler-server/src/main/java/com/github/xbynet/crawler/server/Main.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.server;
2 |
3 | import org.apache.catalina.core.StandardContext;
4 | import org.apache.catalina.startup.Tomcat;
5 |
6 |
7 | /**
8 | *Embedded Tomcat
9 | *http://www.oracle.com/webfolder/technetwork/tutorials/obe/java/basic_app_embedded_tomcat/basic_app-tomcat-embedded.html
10 | *https://github.com/heroku/devcenter-embedded-tomcat
11 | */
12 | public class Main {
13 |
14 | public static void main(String[] args) throws Exception {
15 | String contextPath = "/";
16 | String appBase = ".";
17 | Tomcat tomcat = new Tomcat();
18 | tomcat.setPort(8666);
19 | tomcat.getHost().setAppBase(appBase);
20 | StandardContext ctx=(StandardContext)tomcat.addWebapp(contextPath, appBase);//Context ctx = tomcat.addContext("/", new File(".").getAbsolutePath());
21 |
22 | tomcat.start();
23 | tomcat.getServer().await();
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/crawler-core/src/test/java/net/xby1993/crawler/AppTest.java:
--------------------------------------------------------------------------------
1 | package net.xby1993.crawler;
2 | 
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 | 
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 | 
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 | 
31 | /**
32 | * Rigorous test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | 
39 | }
40 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 xbynet
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/crawler-selenium/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <parent>
5 |     <groupId>com.github.xbynet</groupId>
6 |     <artifactId>crawler-parent</artifactId>
7 |     <version>0.3.0</version>
8 |   </parent>
9 |   <artifactId>crawler-selenium</artifactId>
10 |   <packaging>jar</packaging>
11 |   <dependencies>
12 |     <dependency>
13 |       <groupId>com.github.xbynet</groupId>
14 |       <artifactId>crawler-core</artifactId>
15 |       <version>${project.version}</version>
16 |     </dependency>
17 |     <dependency>
18 |       <groupId>org.seleniumhq.selenium</groupId>
19 |       <artifactId>selenium-java</artifactId>
20 |     </dependency>
21 |     <dependency>
22 |       <groupId>com.codeborne</groupId>
23 |       <artifactId>phantomjsdriver</artifactId>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>net.coobird</groupId>
27 |       <artifactId>thumbnailator</artifactId>
28 |       <version>0.4.8</version>
29 |     </dependency>
30 |   </dependencies>
31 | </project>
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/Processor.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler;
2 |
3 | import java.io.Closeable;
4 | import java.io.IOException;
5 |
6 | import com.github.xbynet.crawler.http.FileDownloader;
7 |
8 | /**
9 | *Crawler page processor; extend this class when writing a crawler.
10 | */
11 | public abstract class Processor implements Closeable{
12 | private FileDownloader fileDownloader=null;
13 | private Spider spider=null;
14 |
15 | public abstract void process(Response resp);
16 |
17 | public boolean download(Request req,String savePath){
18 | return fileDownloader.download(req, savePath);
19 | }
20 | public boolean download(String url,String savePath){
21 | Request req=new Request(url);
22 | return fileDownloader.download(req, savePath);
23 | }
24 | public FileDownloader getFileDownloader() {
25 | return fileDownloader;
26 | }
27 |
28 | public void setFileDownloader(FileDownloader fileDownloader) {
29 | this.fileDownloader = fileDownloader;
30 | }
31 | @Override
32 | public void close()throws IOException{
33 |
34 | }
35 |
36 | public Spider getSpider() {
37 | return spider;
38 | }
39 |
40 | public void setSpider(Spider spider) {
41 | this.spider = spider;
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
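
A minimal Processor sketch (editor's illustration, assuming Response exposes the page text via getRaw(), the counterpart of the setRaw(...) call visible in DefaultDownloader; the rest of the Response API is not shown in this dump):

    import com.github.xbynet.crawler.Processor;
    import com.github.xbynet.crawler.Response;

    public class PrintLengthProcessor extends Processor {
        @Override
        public void process(Response resp) {
            // getRaw() is assumed from Response.setRaw(...) in DefaultDownloader
            String html = resp.getRaw();
            System.out.println("fetched " + (html == null ? 0 : html.length()) + " chars");
        }
    }
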
/crawler-server/src/main/java/com/github/xbynet/crawler/server/HelloServlet.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.server;
2 |
3 | import java.io.IOException;
4 |
5 | import javax.servlet.ServletException;
6 | import javax.servlet.ServletOutputStream;
7 | import javax.servlet.annotation.WebServlet;
8 | import javax.servlet.http.HttpServlet;
9 | import javax.servlet.http.HttpServletRequest;
10 | import javax.servlet.http.HttpServletResponse;
11 |
12 | import com.github.xbynet.crawler.Spider;
13 | import com.github.xbynet.crawler.server.demo.GithubCrawler;
14 | import com.github.xbynet.crawler.server.monitor.SpiderManager;
15 |
16 | @WebServlet(
17 | name = "MyServlet",
18 | urlPatterns = {"/hello"}
19 | )
20 | public class HelloServlet extends HttpServlet {
21 |
22 | @Override
23 | protected void doGet(HttpServletRequest req, HttpServletResponse resp)
24 | throws ServletException, IOException {
25 | ServletOutputStream out = resp.getOutputStream();
26 | Spider s=new GithubCrawler().createSpider();
27 | SpiderManager.get().add(s);
28 | out.write(("add spider of "+s.getName()).getBytes());
29 | out.flush();
30 | out.close();
31 | }
32 |
33 | }
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/utils/CrawlerUtils.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.utils;
2 |
3 | import javax.script.Invocable;
4 | import javax.script.ScriptEngine;
5 | import javax.script.ScriptEngineManager;
6 |
7 | import org.apache.commons.lang3.StringUtils;
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 |
11 | import com.github.xbynet.crawler.annotation.Nullable;
12 |
13 | public class CrawlerUtils {
14 | private static final Logger log=LoggerFactory.getLogger(CrawlerUtils.class);
15 |
16 | public static void sleep(int millis){
17 | try {
18 | Thread.sleep(millis);
19 | } catch (InterruptedException e) {
20 | log.warn("",e);
21 | }
22 | }
23 |
24 | public Object executeJs(String js,@Nullable String funcName,Object... args){
25 | ScriptEngineManager manager = new ScriptEngineManager();
26 | ScriptEngine engine = manager.getEngineByName("javascript");
27 | try {
28 | Object res=engine.eval(js);
29 | if(StringUtils.isNotBlank(funcName)){
30 | if (engine instanceof Invocable) {
31 | Invocable invoke = (Invocable) engine;
32 | res = invoke.invokeFunction(funcName, args);
33 | }
34 | }
35 | return res;
36 | } catch (Exception e) {
37 | log.error("",e);
38 | }
39 | return null;
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
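
Usage sketch for executeJs (editor's illustration): it evaluates the script with the JDK's built-in JavaScript engine (Nashorn on Java 8) and, when funcName is non-blank, invokes that function with the given arguments. Note that it is an instance method, so an instance is needed.

    import com.github.xbynet.crawler.utils.CrawlerUtils;

    public class ExecuteJsExample {
        public static void main(String[] args) {
            Object sum = new CrawlerUtils().executeJs(
                    "function add(a, b) { return a + b; }", // evaluated first
                    "add",                                  // then invoked by name
                    1, 2);
            System.out.println(sum); // -> 3 (the numeric type depends on the engine)
        }
    }
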
/crawler-core/src/main/java/com/github/xbynet/crawler/parser/JsonPathParser.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.parser;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import com.jayway.jsonpath.JsonPath;
7 | import com.jayway.jsonpath.ReadContext;
8 |
9 | public class JsonPathParser implements Parser {
10 | private ReadContext ctx;
11 |
12 | public JsonPathParser(String raw) {
13 | this.ctx = JsonPath.parse(raw);
14 | }
15 |
16 | public String single(String jsonpath) {
17 | Object object = ctx.read(jsonpath);
18 | if (object == null) {
19 | return null;
20 | }
21 | if (object instanceof List) {
22 | List<?> list = (List<?>) object;
23 | if (list != null && list.size() > 0) {
24 | return list.get(0).toString();
25 | }
26 | }
27 | return object.toString();
28 | }
29 |
30 | public List<String> list(String jsonpath) {
31 | List<String> reslist = new ArrayList<String>();
32 | Object object = ctx.read(jsonpath);
33 | if (object == null) {
34 | return reslist;
35 | }
36 | if (object instanceof List) {
37 | List<?> list = (List<?>) object;
38 | for (Object item : list) {
39 | reslist.add(item.toString());
40 | }
41 | } else {
42 | reslist.add(object.toString());
43 | }
44 | return reslist;
45 | }
46 |
47 | public ReadContext getCtx() {
48 | return ctx;
49 | }
50 |
51 | }
52 |
--------------------------------------------------------------------------------
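
Usage sketch (editor's illustration): single(...) returns the first match as a string, list(...) flattens all matches into strings.

    import java.util.List;
    import com.github.xbynet.crawler.parser.JsonPathParser;

    public class JsonPathExample {
        public static void main(String[] args) {
            String json = "{\"store\":{\"book\":[{\"title\":\"A\"},{\"title\":\"B\"}]}}";
            JsonPathParser p = new JsonPathParser(json);
            System.out.println(p.single("$.store.book[0].title")); // -> A
            List<String> titles = p.list("$.store.book[*].title");
            System.out.println(titles); // -> [A, B]
        }
    }
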
/crawler-core/src/main/java/com/github/xbynet/crawler/parser/XpathParser.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.parser;
2 |
3 | import java.util.List;
4 |
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.jsoup.nodes.Element;
8 |
9 | import us.codecraft.xsoup.XPathEvaluator;
10 | import us.codecraft.xsoup.Xsoup;
11 |
12 | public class XpathParser implements Parser{
13 |
14 | private Document doc;
15 |
16 | public XpathParser(String raw) {
17 | this.doc=Jsoup.parse(raw);
18 | }
19 |
20 | public String single(String xpathStr) {
21 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr);
22 | return xPathEvaluator.evaluate(doc).get();
23 | }
24 |
25 | public List<String> list(String xpathStr) {
26 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr);
27 | return xPathEvaluator.evaluate(doc).list();
28 | }
29 |
30 | public Element element(String xpathStr) {
31 | List<Element> elements = elements(xpathStr);
32 | if (elements!=null && elements.size()>0){
33 | return elements.get(0);
34 | }
35 | return null;
36 | }
37 |
38 | public List<Element> elements(String xpathStr) {
39 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr);
40 | return xPathEvaluator.evaluate(doc).getElements();
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
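
Usage sketch (editor's illustration), using Xsoup's XPath dialect over a Jsoup-parsed document:

    import java.util.List;
    import com.github.xbynet.crawler.parser.XpathParser;

    public class XpathExample {
        public static void main(String[] args) {
            String html = "<div><a href='/a'>first</a><a href='/b'>second</a></div>";
            XpathParser p = new XpathParser(html);
            System.out.println(p.single("//a/@href")); // -> /a
            List<String> texts = p.list("//a/text()");
            System.out.println(texts); // -> [first, second]
        }
    }
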
/crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/DefaultScheduler.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.scheduler;
2 |
3 | import java.util.Collections;
4 | import java.util.Set;
5 | import java.util.concurrent.BlockingQueue;
6 | import java.util.concurrent.ConcurrentHashMap;
7 | import java.util.concurrent.LinkedBlockingQueue;
8 |
9 | import org.slf4j.Logger;
10 | import org.slf4j.LoggerFactory;
11 |
12 | import com.github.xbynet.crawler.Const;
13 | import com.github.xbynet.crawler.ISpider;
14 | import com.github.xbynet.crawler.Request;
15 |
16 | public class DefaultScheduler implements Scheduler, DuplicateRemover {
17 | private final Logger log = LoggerFactory.getLogger(DefaultScheduler.class);
18 | private Set<String> urls = Collections
19 | .newSetFromMap(new ConcurrentHashMap<String, Boolean>());
20 | private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>();
21 |
22 | public void push(Request request, ISpider spider) {
23 | if (Const.HttpMethod.POST == request.getMethod()
24 | || !isDuplicate(request, spider)) {
25 | log.debug("push to queue {}", request.getUrl());
26 | queue.add(request);
27 | }
28 | }
29 |
30 | public Request poll(ISpider spider) {
31 | return queue.poll();
32 | }
33 |
34 | public DuplicateRemover getDuplicateRemover(){
35 | return this;
36 | }
37 | public boolean isDuplicate(Request request, ISpider spider) {
38 | return !urls.add(request.getUrl());
39 | }
40 |
41 | public void resetDuplicateCheck(ISpider spider) {
42 | urls.clear();
43 | }
44 |
45 | public int getTotalRequestsCount(ISpider spider) {
46 | return urls.size();
47 | }
48 |
49 | public int getLeftRequestsCount(ISpider spider) {
50 | return queue.size();
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/http/FileDownloader.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.http;
2 |
3 | import java.io.File;
4 | import java.io.FileOutputStream;
5 | import java.io.IOException;
6 |
7 | import org.apache.commons.io.IOUtils;
8 | import org.apache.http.client.methods.CloseableHttpResponse;
9 | import org.apache.http.client.methods.HttpUriRequest;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import com.github.xbynet.crawler.Request;
14 | import com.github.xbynet.crawler.Response;
15 | import com.github.xbynet.crawler.Site;
16 |
17 |
18 | public class FileDownloader extends AbsDownloader{
19 | private final Logger log=LoggerFactory.getLogger(FileDownloader.class);
20 |
21 |
22 | public boolean download(Request request,String savePath){
23 | log.debug("start downloading file "+request.getUrl()+" to "+savePath);
24 | super.doDownload(request,savePath);
25 | File file=new File(savePath);
26 | return file.exists();
27 | }
28 | @Override
29 | protected void process(HttpUriRequest httpUriRequest,
30 | CloseableHttpResponse resp, Request request, Site site,Response response,Object... extras) {
31 | if(resp==null){
32 | log.error("failed to download file "+httpUriRequest.getURI().toString());
33 | return;
34 | }
35 | String savePath=extras[0].toString();
36 | File saveFile=new File(savePath);
37 | if(saveFile.exists()){
38 | saveFile.delete();
39 | }
40 | FileOutputStream fous=null;
41 | try {
42 | fous=new FileOutputStream(saveFile);
43 | IOUtils.copy(resp.getEntity().getContent(), fous);
44 | log.debug("file "+httpUriRequest.getURI().toString()+" downloaded successfully");
45 | } catch (UnsupportedOperationException e) {
46 | log.error("",e);
47 | } catch (IOException e) {
48 | log.error("",e);
49 | }finally{
50 | IOUtils.closeQuietly(fous);
51 | }
52 | }
53 |
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/Site.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 |
6 | public class Site {
7 | private String encoding="UTF-8";
8 | private String ua="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
9 | private int sleep=20;
10 | private int retry=3;
11 | private int retrySleep=500;
12 | private int timeout=30000;
13 | private Map<String, String> headers = new HashMap<String, String>();
14 |
15 | public Site(){
16 | getHeaders().put("User-Agent", ua);
17 | }
18 | public String getEncoding() {
19 | return encoding;
20 | }
21 |
22 | public Site setEncoding(String encoding) {
23 | this.encoding = encoding;
24 | return this;
25 | }
26 |
27 | public String getUa() {
28 | return ua;
29 | }
30 |
31 | public Site setUa(String ua) {
32 | this.ua = ua; getHeaders().put("User-Agent", ua);
33 | return this;
34 | }
35 |
36 | public int getSleep() {
37 | return sleep;
38 | }
39 |
40 | public Site setSleep(int sleep) {
41 | this.sleep = sleep;
42 | return this;
43 | }
44 |
45 | public int getRetry() {
46 | return retry;
47 | }
48 |
49 | public Site setRetry(int retry) {
50 | this.retry = retry;
51 | return this;
52 | }
53 |
54 | public int getRetrySleep() {
55 | return retrySleep;
56 | }
57 |
58 | public Site setRetrySleep(int retrySleep) {
59 | this.retrySleep = retrySleep;
60 | return this;
61 | }
62 |
63 | public int getTimeout() {
64 | return timeout;
65 | }
66 |
67 | public Site setTimeout(int timeout) {
68 | this.timeout = timeout;
69 | return this;
70 | }
71 |
72 | public Site setHeader(String name,String value){
73 | getHeaders().put(name, value);
74 | return this;
75 | }
76 | public Map getHeaders() {
77 | return headers;
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
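
Every setter on Site returns this, so a site is configured fluently. Usage sketch (editor's illustration; the millisecond units are inferred from the defaults, e.g. timeout=30000):

    import com.github.xbynet.crawler.Site;

    public class SiteExample {
        public static void main(String[] args) {
            Site site = new Site()
                    .setEncoding("UTF-8")
                    .setSleep(100)       // ms to pause between requests
                    .setRetry(3)
                    .setRetrySleep(500)  // ms to pause before a retry
                    .setTimeout(30000)   // ms for connect/socket timeout
                    .setHeader("Referer", "https://example.com/");
            System.out.println(site.getHeaders()); // includes the default User-Agent
        }
    }
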
/crawler-core/src/main/java/com/github/xbynet/crawler/http/CustomRedirectStrategy.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.http;
2 |
3 | import java.net.URI;
4 |
5 | import org.apache.http.HttpRequest;
6 | import org.apache.http.HttpResponse;
7 | import org.apache.http.ProtocolException;
8 | import org.apache.http.client.methods.HttpGet;
9 | import org.apache.http.client.methods.HttpPost;
10 | import org.apache.http.client.methods.HttpRequestWrapper;
11 | import org.apache.http.client.methods.HttpUriRequest;
12 | import org.apache.http.impl.client.LaxRedirectStrategy;
13 | import org.apache.http.protocol.HttpContext;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 |
17 | /**
18 | *Redirect strategy that supports POST 302 redirects.
19 | *HttpClient's default, httpClientBuilder.setRedirectStrategy(new LaxRedirectStrategy()),
20 | *drops the original request body in a post/redirect/post sequence, so this follows the redirect strategy of the SeimiCrawler project.
21 | *Original source: https://github.com/zhegexiaohuozi/SeimiCrawler/blob/master/project/src/main/java/cn/wanghaomiao/seimi/http/hc/SeimiRedirectStrategy.java
22 | */
23 | public class CustomRedirectStrategy extends LaxRedirectStrategy {
24 | private Logger logger = LoggerFactory.getLogger(getClass());
25 |
26 | @Override
27 | public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException {
28 | URI uri = getLocationURI(request, response, context);
29 | String method = request.getRequestLine().getMethod();
30 | if ("post".equalsIgnoreCase(method)) {
31 | try {
32 | HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request;
33 | httpRequestWrapper.setURI(uri);
34 | httpRequestWrapper.removeHeaders("Content-Length");
35 | return httpRequestWrapper;
36 | } catch (Exception e) {
37 | logger.error("failed to cast the request to HttpRequestWrapper", e);
38 | }
39 | return new HttpPost(uri);
40 | } else {
41 | return new HttpGet(uri);
42 | }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
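
The strategy is plugged in when the HttpClient is built (standard HttpClient 4.x API; the repo's HttpClientFactory, whose source is not shown in this dump, presumably does this wiring):

    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import com.github.xbynet.crawler.http.CustomRedirectStrategy;

    public class RedirectStrategyExample {
        public static void main(String[] args) {
            CloseableHttpClient client = HttpClients.custom()
                    .setRedirectStrategy(new CustomRedirectStrategy()) // POST bodies survive 302s
                    .build();
            // ... use client, then close it
        }
    }
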
/crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/WebDriverManager.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.selenium;
2 |
3 | import java.io.Closeable;
4 | import java.io.IOException;
5 |
6 | import org.openqa.selenium.WebDriver;
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 |
10 | public class WebDriverManager implements Closeable{
11 | private static final Logger log=LoggerFactory.getLogger(WebDriverManager.class);
12 |
13 | private WebDriverPool webDriverPool=null;
14 |
15 | public WebDriverManager(String phantomjsPath){
16 | this.webDriverPool=new PhantomjsWebDriverPool(1,false,phantomjsPath);
17 | }
18 | public WebDriverManager(WebDriverPool webDriverPool){
19 | this.webDriverPool=webDriverPool;
20 | }
21 | public void load(String url,int sleepTimeMillis,SeleniumAction... actions){
22 | WebDriver driver=null;
23 | try {
24 | driver=webDriverPool.get();
25 | driver.get(url);
26 | sleep(sleepTimeMillis);
27 | WebDriver.Options manage = driver.manage();
28 | manage.window().maximize();
29 | for(SeleniumAction action:actions){
30 | action.execute(driver);
31 | }
32 | } catch (InterruptedException e) {
33 | Thread.currentThread().interrupt();
34 | log.error("",e);
35 | }finally{
36 | if(driver!=null){
37 | webDriverPool.returnToPool(driver);
38 | }
39 | }
40 | }
41 | public void load(SeleniumAction... actions){
42 | WebDriver driver=null;
43 | try {
44 | driver=webDriverPool.get();
45 | WebDriver.Options manage = driver.manage();
46 | manage.window().maximize();
47 | for(SeleniumAction action:actions){
48 | action.execute(driver);
49 | }
50 | } catch (InterruptedException e) {
51 | Thread.currentThread().interrupt();
52 | log.error("",e);
53 | }finally{
54 | if(driver!=null){
55 | webDriverPool.returnToPool(driver);
56 | }
57 | }
58 | }
59 | public void shutDown(){
60 | if(webDriverPool!=null){
61 | webDriverPool.shutdown();
62 | }
63 | }
64 | @Override
65 | public void close() throws IOException {
66 | shutDown();
67 | }
68 | public void sleep(long millis){
69 | try {
70 | Thread.sleep(millis);
71 | } catch (InterruptedException e) {
72 | Thread.currentThread().interrupt();
73 | }
74 | }
75 |
76 | }
77 |
--------------------------------------------------------------------------------
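
SeleniumAction has a single abstract method, so on Java 8 it can be written as a lambda. Usage sketch (editor's illustration; the phantomjs path is a placeholder):

    import org.openqa.selenium.WebDriver;
    import com.github.xbynet.crawler.selenium.WebDriverManager;

    public class WebDriverManagerExample {
        public static void main(String[] args) {
            WebDriverManager manager = new WebDriverManager("/usr/local/bin/phantomjs"); // placeholder path
            try {
                manager.load("https://example.com/", 1000,
                        (WebDriver driver) -> System.out.println(driver.getTitle()));
            } finally {
                manager.shutDown();
            }
        }
    }
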
/crawler-server/src/main/webapp/jsp/new-employee.jsp:
--------------------------------------------------------------------------------
1 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c" %>
2 |
3 |
4 |
5 |
6 |
7 |
8 |
40 |
41 |
--------------------------------------------------------------------------------
/crawler-server/src/main/java/com/github/xbynet/crawler/server/monitor/SpiderManager.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.server.monitor;
2 |
3 | import java.util.concurrent.ConcurrentHashMap;
4 |
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | import com.github.xbynet.crawler.Spider;
9 |
10 | public class SpiderManager {
11 | private Logger log=LoggerFactory.getLogger(SpiderManager.class);
12 |
13 | private ConcurrentHashMap<String, Spider> spiders=new ConcurrentHashMap<>();
14 |
15 | private SpiderManager(){
16 |
17 | }
18 |
19 | private static class SingleHolder{
20 | static SpiderManager instance=new SpiderManager();
21 | }
22 |
23 | public static SpiderManager get(){
24 | return SingleHolder.instance;
25 | }
26 |
27 | public synchronized void add(Spider... spiders1){
28 | for(Spider s:spiders1){
29 | getSpiders().put(s.getName(),s);
30 | }
31 | }
32 | public synchronized Spider remove(String name){
33 | return getSpiders().remove(name);
34 | }
35 | public synchronized void stopAll(){
36 | for(String key:getSpiders().keySet()){
37 | stop(key);
38 | }
39 | }
40 | public synchronized void startAll(){
41 | for(String key:getSpiders().keySet()){
42 | start(key);
43 | }
44 | }
45 | public String status(String name){
46 | if(!getSpiders().containsKey(name)){
47 | throw new IllegalArgumentException("the spider of "+name+" is not in manager");
48 | }
49 | Spider spider=getSpiders().get(name);
50 | return spider.getState().name();
51 | }
52 |
53 | public synchronized boolean stop(String name){
54 | if(!getSpiders().containsKey(name)){
55 | throw new IllegalArgumentException("the spider of "+name+" is not in manager");
56 | }
57 | Spider spider=getSpiders().get(name);
58 | if(spider.isRunning()){
59 | spider.stop();
60 | return true;
61 | }else{
62 | log.warn("illegal status "+spider.getState().name()+" for stop");
63 | return false;
64 | }
65 | }
66 | public synchronized boolean start(String name){
67 | if(!getSpiders().containsKey(name)){
68 | throw new IllegalArgumentException("the spider of "+name+" is not in manager");
69 | }
70 | Spider spider=getSpiders().get(name);
71 | if(spider.getState()==Spider.Status.NotRun){
72 | spider.runAsync();
73 | return true;
74 | }
75 | if(spider.isStopped()){
76 | if(spider.isShutdownOnComplete()){
77 | log.warn("spider of "+name+" setShutdownOnComplete=true, so it's not support restart");
78 | return false;
79 | }
80 | spider.runAsync();
81 | return true;
82 | }
83 | log.warn("illegal status "+spider.getState().name()+" for start");
84 | return false;
85 | }
86 |
87 | public ConcurrentHashMap getSpiders() {
88 | return spiders;
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
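
Usage sketch (editor's illustration, mirroring HelloServlet: GithubCrawler.createSpider() builds a Spider, which is then registered and driven by name):

    import com.github.xbynet.crawler.Spider;
    import com.github.xbynet.crawler.server.demo.GithubCrawler;
    import com.github.xbynet.crawler.server.monitor.SpiderManager;

    public class SpiderManagerExample {
        public static void main(String[] args) {
            Spider s = new GithubCrawler().createSpider(); // as in HelloServlet
            SpiderManager.get().add(s);
            SpiderManager.get().start(s.getName());  // NotRun -> runAsync()
            System.out.println(SpiderManager.get().status(s.getName()));
        }
    }
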
/crawler-server/src/main/webapp/jsp/spider-list.jsp:
--------------------------------------------------------------------------------
1 | <%@ page contentType="text/html;charset=utf-8" %>
2 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c" %>
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
14 | 爬虫监控
15 |
16 |
17 |
18 |
19 |
爬虫监控
20 |
21 |
22 |
23 |
24 |
25 | | 标识 |
26 | 页面处理器类名 |
27 | 状态 |
28 | 操作 |
29 | 运行信息 |
30 |
31 |
32 |
33 |
34 |
35 | | ${spider.name} |
36 | ${spider.processor} |
37 | ${spider.status} |
38 |
39 |
41 | |
42 | ${spider.info} |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 | 没有正在运行的爬虫
51 |
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/http/DefaultDownloader.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.http;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.HashMap;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | import org.apache.commons.io.IOUtils;
10 | import org.apache.http.Header;
11 | import org.apache.http.HeaderElement;
12 | import org.apache.http.client.methods.CloseableHttpResponse;
13 | import org.apache.http.client.methods.HttpUriRequest;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 |
17 | import com.github.xbynet.crawler.Const;
18 | import com.github.xbynet.crawler.Request;
19 | import com.github.xbynet.crawler.Response;
20 | import com.github.xbynet.crawler.Site;
21 |
22 | public class DefaultDownloader extends AbsDownloader {
23 | private final Logger log = LoggerFactory.getLogger(DefaultDownloader.class);
24 |
25 | @Override
26 | public void download(Request request){
27 | super.doDownload(request);
28 | }
29 | @Override
30 | protected void process(HttpUriRequest httpUriRequest,
31 | CloseableHttpResponse resp, Request request, Site site,Response response,
32 | Object... extras) {
33 | if (resp == null) {
34 | log.error(request.getUrl() + " request failed");
35 | return ;
36 | }
37 | response.setCode(resp.getStatusLine().getStatusCode());
38 | response.setContentType(resp.getFirstHeader("Content-Type") != null ? resp.getFirstHeader("Content-Type").getValue() : null);
39 | Const.ResponseType type = null;
40 | try {
41 | if (response.getContentType().contains("text")
42 | || response.getContentType().contains("json")) {
43 | type = Const.ResponseType.TEXT;
44 | String raw=IOUtils.toString(resp.getEntity().getContent(),
45 | request.getEncoding() != null ? request.getEncoding()
46 | : site.getEncoding());
47 | response.setRaw(raw);
48 | } else {
49 | type = Const.ResponseType.BIN;
50 | response.setBytes(IOUtils.toByteArray(resp.getEntity()
51 | .getContent()));
52 | }
53 | } catch (UnsupportedOperationException e) {
54 | log.error("", e);
55 | } catch (IOException e) {
56 | log.error("", e);
57 | }
58 | response.setRespType(type);
59 | response.setRequest(request);
60 |
61 | Map<String, List<String>> headers = new HashMap<String, List<String>>();
62 | for(Header header:resp.getAllHeaders()){
63 | List<String> value = new ArrayList<String>();
64 | HeaderElement[] hes=header.getElements();
65 | if(hes!=null && hes.length>1){
66 | for(HeaderElement e:hes){
67 | value.add(e.getValue());
68 | }
69 | }else{
70 | value.add(header.getValue());
71 | }
72 | headers.put(header.getName(), value);
73 | }
74 | response.setHeaders(headers);
75 | try {
76 | getSpider().getProcessor().process(response);
77 | } catch (Exception e) {
78 | log.error("",e);
79 | }
80 | }
81 |
82 | }
83 |
--------------------------------------------------------------------------------
/crawler-core/src/main/java/com/github/xbynet/crawler/utils/CountableThreadPool.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.utils;
2 |
3 | import java.util.concurrent.ExecutorService;
4 | import java.util.concurrent.Executors;
5 | import java.util.concurrent.TimeUnit;
6 | import java.util.concurrent.atomic.AtomicInteger;
7 | import java.util.concurrent.locks.Condition;
8 | import java.util.concurrent.locks.ReentrantLock;
9 |
10 |
11 | public class CountableThreadPool {
12 |
13 | private int threadNum;
14 |
15 | private AtomicInteger threadAlive = new AtomicInteger();
16 |
17 | private ReentrantLock reentrantLock = new ReentrantLock();
18 |
19 | private Condition condition = reentrantLock.newCondition();
20 |
21 | public CountableThreadPool(int threadNum) {
22 | this.threadNum = threadNum;
23 | this.executorService = Executors.newFixedThreadPool(threadNum);
24 | }
25 |
26 | public CountableThreadPool(int threadNum, ExecutorService executorService) {
27 | this.threadNum = threadNum;
28 | this.executorService = executorService;
29 | }
30 |
31 | public void setExecutorService(ExecutorService executorService) {
32 | this.executorService = executorService;
33 | }
34 |
35 | public int getThreadAlive() {
36 | return threadAlive.get();
37 | }
38 |
39 | public int getThreadNum() {
40 | return threadNum;
41 | }
42 |
43 | private ExecutorService executorService;
44 |
45 | public void execute(final Runnable runnable) {
46 |
47 |
48 | if (threadAlive.get() >= threadNum) {
49 | try {
50 | reentrantLock.lock();
51 | while (threadAlive.get() >= threadNum) {
52 | try {
53 | condition.await();
54 | } catch (InterruptedException e) {
55 | }
56 | }
57 | } finally {
58 | reentrantLock.unlock();
59 | }
60 | }
61 | threadAlive.incrementAndGet();
62 | executorService.execute(new Runnable() {
63 | public void run() {
64 | try {
65 | runnable.run();
66 | } finally {
67 | try {
68 | reentrantLock.lock();
69 | threadAlive.decrementAndGet();
70 | condition.signal();
71 | } finally {
72 | reentrantLock.unlock();
73 | }
74 | }
75 | }
76 | });
77 | }
78 |
79 | public boolean isShutdown() {
80 | return executorService.isShutdown();
81 | }
82 |
83 | public void shutdown() {
84 | executorService.shutdown();
85 | }
86 | public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException{
87 | return executorService.awaitTermination(timeout, unit);
88 | }
89 |
90 |
91 | }
92 |
--------------------------------------------------------------------------------
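
CountableThreadPool differs from a plain fixed pool in that execute() blocks the submitting thread once threadNum tasks are in flight, which throttles how fast a spider can enqueue work. Usage sketch (editor's illustration):

    import java.util.concurrent.TimeUnit;
    import com.github.xbynet.crawler.utils.CountableThreadPool;

    public class PoolExample {
        public static void main(String[] args) throws InterruptedException {
            CountableThreadPool pool = new CountableThreadPool(2); // at most 2 tasks in flight
            for (int i = 0; i < 5; i++) {
                final int n = i;
                // blocks here while 2 tasks are already running
                pool.execute(() -> System.out.println("task " + n));
            }
            pool.shutdown();
            pool.awaitTermination(10, TimeUnit.SECONDS);
        }
    }
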
/crawler-server/src/main/java/com/github/xbynet/crawler/server/monitor/MonitorServlet.java:
--------------------------------------------------------------------------------
1 | package com.github.xbynet.crawler.server.monitor;
2 |
3 | import java.io.IOException;
4 | import java.text.SimpleDateFormat;
5 | import java.util.ArrayList;
6 | import java.util.Date;
7 | import java.util.HashMap;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.concurrent.ConcurrentHashMap;
11 |
12 | import javax.servlet.ServletException;
13 | import javax.servlet.ServletOutputStream;
14 | import javax.servlet.annotation.WebServlet;
15 | import javax.servlet.http.HttpServlet;
16 | import javax.servlet.http.HttpServletRequest;
17 | import javax.servlet.http.HttpServletResponse;
18 |
19 | import org.apache.commons.lang3.StringUtils;
20 |
21 | import com.github.xbynet.crawler.Spider;
22 |
23 | @WebServlet(
24 | name = "MonitorServlet",
25 | urlPatterns = {"/monitor"}
26 | )
27 | public class MonitorServlet extends HttpServlet{
28 |
29 | @Override
30 | protected void doGet(HttpServletRequest req, HttpServletResponse resp)
31 | throws ServletException, IOException {
32 | String method=req.getParameter("method");
33 | String name=req.getParameter("name");
34 | String uri=req.getRequestURI();
35 | if(StringUtils.isBlank(method)){
36 | List