├── docs └── tutorial.md ├── crawler-server ├── .gitignore ├── src │ └── main │ │ ├── webapp │ │ ├── META-INF │ │ │ └── context.xml │ │ ├── index.jsp │ │ ├── fonts │ │ │ ├── glyphicons-halflings-regular.eot │ │ │ ├── glyphicons-halflings-regular.ttf │ │ │ ├── glyphicons-halflings-regular.woff │ │ │ └── glyphicons-halflings-regular.woff2 │ │ ├── WEB-INF │ │ │ └── web.xml │ │ ├── js │ │ │ ├── spider-list.js │ │ │ └── bootstrap.min.js │ │ └── jsp │ │ │ ├── new-employee.jsp │ │ │ └── spider-list.jsp │ │ └── java │ │ └── com │ │ └── github │ │ └── xbynet │ │ └── crawler │ │ └── server │ │ ├── Main.java │ │ ├── HelloServlet.java │ │ ├── monitor │ │ ├── SpiderManager.java │ │ └── MonitorServlet.java │ │ └── demo │ │ └── GithubCrawler.java └── pom.xml ├── crawler-core ├── src │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── github │ │ │ └── xbynet │ │ │ └── crawler │ │ │ ├── parser │ │ │ ├── Parser.java │ │ │ ├── JsonPathParser.java │ │ │ ├── XpathParser.java │ │ │ └── JsoupParser.java │ │ │ ├── ISpider.java │ │ │ ├── SpiderListener.java │ │ │ ├── Const.java │ │ │ ├── http │ │ │ ├── Downloader.java │ │ │ ├── FileDownloader.java │ │ │ ├── CustomRedirectStrategy.java │ │ │ ├── DefaultDownloader.java │ │ │ ├── HttpClientFactory.java │ │ │ └── AbsDownloader.java │ │ │ ├── annotation │ │ │ └── Nullable.java │ │ │ ├── scheduler │ │ │ ├── DuplicateRemover.java │ │ │ ├── Scheduler.java │ │ │ ├── DefaultScheduler.java │ │ │ └── RedisScheduler.java │ │ │ ├── IpProxyProvider.java │ │ │ ├── RequestAction.java │ │ │ ├── utils │ │ │ ├── BeanUtil.java │ │ │ ├── CrawlerUtils.java │ │ │ └── CountableThreadPool.java │ │ │ ├── Processor.java │ │ │ ├── Site.java │ │ │ ├── Response.java │ │ │ ├── Request.java │ │ │ └── Spider.java │ └── test │ │ ├── java │ │ └── net │ │ │ └── xby1993 │ │ │ └── crawler │ │ │ ├── StartAllJoke.java │ │ │ ├── AppTest.java │ │ │ ├── ZhihuRecommendCrawler.java │ │ │ ├── OSChinaTweetsCrawler.java │ │ │ ├── QiushibaikeCrawler.java │ │ │ ├── NeihanshequCrawler.java │ │ │ └── GithubCrawler.java │ │ └── resources │ │ └── logback.xml └── pom.xml ├── crawler-selenium ├── src │ └── main │ │ └── java │ │ └── com │ │ └── github │ │ └── xbynet │ │ └── crawler │ │ └── selenium │ │ ├── SeleniumAction.java │ │ ├── WebDriverPool.java │ │ ├── getCssAttr.js │ │ ├── ImageRegion.java │ │ ├── ImageUtil.java │ │ ├── WebDriverManager.java │ │ ├── SeleniumDownloader.java │ │ ├── PhantomjsWebDriverPool.java │ │ └── WindowUtil.java └── pom.xml ├── .gitignore ├── LICENSE ├── README.md └── pom.xml /docs/tutorial.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler-server/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/META-INF/context.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/index.jsp: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello World!

4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/parser/Parser.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.parser; 2 | 3 | public interface Parser { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/ISpider.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | public interface ISpider { 4 | String getName(); 5 | 6 | } 7 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/SpiderListener.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | public interface SpiderListener { 4 | void success(Spider spider,Request request); 5 | void fail(Spider spider,Request request,Exception e); 6 | } 7 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Const.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | public class Const { 4 | public enum HttpMethod{ 5 | GET,POST,HEAD 6 | } 7 | public enum CssAttr{ 8 | innerHtml,text,allText 9 | } 10 | public enum ResponseType{ 11 | TEXT,BIN 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/SeleniumAction.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import org.openqa.selenium.WebDriver; 4 | 5 | /** 6 | * @author taojw 7 | * 8 | */ 9 | public interface SeleniumAction { 10 | void execute(WebDriver driver); 11 | } 12 | 
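A minimal sketch of a SeleniumAction implementation (not part of the repository; the class name and scrolling behaviour are invented for illustration), assuming a page that needs to be scrolled before its source is read:

package com.github.xbynet.crawler.selenium;

import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;

// Hypothetical example: scroll to the bottom of the page so lazily loaded
// content is rendered before the page source is handed to the parser.
public class ScrollToBottomAction implements SeleniumAction {
    @Override
    public void execute(WebDriver driver) {
        ((JavascriptExecutor) driver).executeScript(
                "window.scrollTo(0, document.body.scrollHeight);");
    }
}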
-------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/StartAllJoke.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | public class StartAllJoke { 4 | public static void main(String[] args) { 5 | new OSChinaTweetsCrawler().start(); 6 | new QiushibaikeCrawler().start(); 7 | new NeihanshequCrawler().start(); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/WebDriverPool.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import org.openqa.selenium.WebDriver; 4 | 5 | public interface WebDriverPool { 6 | WebDriver get() throws InterruptedException; 7 | void returnToPool(WebDriver webDriver); 8 | void close(WebDriver webDriver); 9 | void shutdown(); 10 | } 11 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/Downloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.Closeable; 4 | 5 | import com.github.xbynet.crawler.Request; 6 | import com.github.xbynet.crawler.Spider; 7 | 8 | public interface Downloader extends Closeable{ 9 | void init(); 10 | void download(Request request); 11 | void setSpider(Spider spider); 12 | } 13 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/annotation/Nullable.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | @Target(ElementType.PARAMETER) 9 | @Retention(RetentionPolicy.SOURCE) 10 | public @interface Nullable { 11 | 12 | } 13 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 7 | Archetype Created Web Application 8 | 9 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/DuplicateRemover.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.scheduler; 2 | 3 | import com.github.xbynet.crawler.ISpider; 4 | import com.github.xbynet.crawler.Request; 5 | 6 | public interface DuplicateRemover { 7 | public boolean isDuplicate(Request request, ISpider spider); 8 | public void resetDuplicateCheck(ISpider spider); 9 | public int getTotalRequestsCount(ISpider spider); 10 | 11 | } -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/getCssAttr.js: -------------------------------------------------------------------------------- 1 | function getStyle(obj, attr) { 2 | if (obj.currentStyle) { 3 | return obj.currentStyle[attr]; 4 | } else { 5 | return document.defaultView.getComputedStyle(obj, null)[attr]; 6 | } 7 | } 8 | function getCssAttr(sel,attr){ 9 | var 
tmp=document.querySelector(sel); 10 | var res=getStyle(tmp,attr); 11 | return res; 12 | } 13 | return getCssAttr(arguments[0],arguments[1]); -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/js/spider-list.js: -------------------------------------------------------------------------------- 1 | function changeState(name){ 2 | var t=$("#stateBtn").text().trim(); 3 | var method='start'; 4 | if(t=='停止'){ 5 | method='stop'; 6 | } 7 | $.get(baseUrl+"monitor?name="+name+"&method="+method,function(data){ 8 | if(data=='true'){ 9 | $("#stateBtn").text(method=='start'?'停止':'启动'); 10 | $("#status").text(method=='start'?"running":"stopping..."); 11 | }else{ 12 | alert("请求失败:"+data); 13 | } 14 | },"text") 15 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | /.settings 3 | /target 4 | /.classpath 5 | /crawler-core/target 6 | /crawler-selenium/target 7 | /crawler-core/.project 8 | /crawler-core/.settings 9 | /crawler-core/.classpath 10 | /crawler-selenium/.project 11 | /crawler-selenium/.settings 12 | /crawler-selenium/.classpath 13 | /crawler-server/.tern-project 14 | /crawler-server/.settings 15 | /crawler-server/target 16 | /crawler-server/tomcat.8666 17 | /crawler-server/.classpath 18 | /crawler-server/.project 19 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/Scheduler.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.scheduler; 2 | 3 | import com.github.xbynet.crawler.ISpider; 4 | import com.github.xbynet.crawler.Request; 5 | 6 | public interface Scheduler { 7 | public void push(Request request,ISpider spider); 8 | public Request poll(ISpider spider); 9 | public int getLeftRequestsCount(ISpider spider); 10 | public int getTotalRequestsCount(ISpider spider); 11 | public DuplicateRemover getDuplicateRemover(); 12 | } 13 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/IpProxyProvider.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.io.Closeable; 4 | import java.io.IOException; 5 | 6 | import org.apache.http.HttpHost; 7 | 8 | public class IpProxyProvider implements Closeable{ 9 | 10 | public HttpHost getIp(){ 11 | return null; 12 | } 13 | public void invalid(HttpHost host){ 14 | 15 | } 16 | public void valid(HttpHost host){ 17 | 18 | } 19 | @Override 20 | public void close() throws IOException { 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/RequestAction.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.apache.http.client.methods.CloseableHttpResponse; 6 | import org.apache.http.client.methods.HttpUriRequest; 7 | import org.apache.http.impl.client.CloseableHttpClient; 8 | 9 | public interface RequestAction extends Serializable { 10 | void before(CloseableHttpClient client,HttpUriRequest req); 11 | void after(CloseableHttpClient client,CloseableHttpResponse resp); 12 | } 13 | 
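A sketch of one way RequestAction could be implemented (the class and header are invented for illustration, assuming a site that expects a token on every request); before() runs just before the HttpClient call and after() runs once the response is back:

package com.github.xbynet.crawler;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;

// Hypothetical example: attach an Authorization header before the request is
// executed and log the status code after the response arrives.
public class AuthHeaderAction implements RequestAction {
    private static final long serialVersionUID = 1L;
    private final String token;

    public AuthHeaderAction(String token) {
        this.token = token;
    }

    @Override
    public void before(CloseableHttpClient client, HttpUriRequest req) {
        req.addHeader("Authorization", "Bearer " + token);
    }

    @Override
    public void after(CloseableHttpClient client, CloseableHttpResponse resp) {
        System.out.println("response status: " + resp.getStatusLine().getStatusCode());
    }
}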
-------------------------------------------------------------------------------- /crawler-core/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %-5level %msg [%logger{16} %d{HH:mm:ss}]%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/ImageRegion.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | /** 4 | * @author taojw 5 | * 6 | */ 7 | public class ImageRegion { 8 | public int x; 9 | public int y; 10 | public int width; 11 | public int height; 12 | public ImageRegion(int x,int y,int width,int height){ 13 | this.x=x; 14 | this.y=y; 15 | this.width=width; 16 | this.height=height; 17 | } 18 | @Override 19 | public String toString() { 20 | return "ImageRegion [x=" + x + ", y=" + y + ", width=" + width 21 | + ", height=" + height + "]"; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /crawler-core/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.github.xbynet 6 | crawler-parent 7 | 0.3.0 8 | 9 | crawler-core 10 | jar 11 | 12 | 13 | redis.clients 14 | jedis 15 | 16 | 17 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/utils/BeanUtil.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.utils; 2 | 3 | import java.util.concurrent.ConcurrentHashMap; 4 | 5 | import net.sf.cglib.beans.BeanCopier; 6 | 7 | public class BeanUtil { 8 | public static ConcurrentHashMap beanCopierMap = new ConcurrentHashMap(); 9 | 10 | public static void copyProperties(Object source, Object target) { 11 | String beanKey = generateKey(source.getClass(), target.getClass()); 12 | BeanCopier copier = null; 13 | copier = BeanCopier.create(source.getClass(), target.getClass(), false); 14 | beanCopierMap.putIfAbsent(beanKey, copier); 15 | copier = beanCopierMap.get(beanKey); 16 | copier.copy(source, target, null); 17 | } 18 | 19 | private static String generateKey(Class class1, Class class2) { 20 | return class1.toString() + class2.toString(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/ImageUtil.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.io.IOException; 4 | 5 | import net.coobird.thumbnailator.Thumbnails; 6 | 7 | /** 8 | * @author taojw 9 | * 10 | */ 11 | public class ImageUtil { 12 | public static void crop(String srcfile,String destfile,ImageRegion region){ 13 | //指定坐标 14 | try { 15 | Thumbnails.of(srcfile) 16 | .sourceRegion(region.x, region.y, region.width, region.height) 17 | .size(region.width, region.height).outputQuality(1.0) 18 | //.keepAspectRatio(false) //不保持比例 19 | .toFile(destfile); 20 | } catch (IOException e) { 21 | // TODO Auto-generated catch block 22 | e.printStackTrace(); 23 | } 24 | } 25 | public static void main(String[] args) { 26 | crop("D:\\data\\111.png","D:\\data\\1112.png",new ImageRegion(66, 264, 422, 426)); 27 | } 28 | } 29 | 
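A short usage sketch for the BeanUtil helper above (both bean classes are invented for the example); the underlying cglib BeanCopier copies properties whose name and type match on source and target:

import com.github.xbynet.crawler.utils.BeanUtil;

public class BeanUtilDemo {
    // Invented bean types for the example.
    public static class Article {
        private String title;
        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
    }

    public static class ArticleDto {
        private String title;
        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
    }

    public static void main(String[] args) {
        Article source = new Article();
        source.setTitle("hello");
        ArticleDto target = new ArticleDto();
        // Copies the matching "title" property from source to target.
        BeanUtil.copyProperties(source, target);
        System.out.println(target.getTitle()); // prints "hello"
    }
}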
-------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/Main.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server; 2 | 3 | import org.apache.catalina.core.StandardContext; 4 | import org.apache.catalina.startup.Tomcat; 5 | 6 | 7 | /** 8 | *Embeded Tomcat 9 | *http://www.oracle.com/webfolder/technetwork/tutorials/obe/java/basic_app_embedded_tomcat/basic_app-tomcat-embedded.html 10 | *https://github.com/heroku/devcenter-embedded-tomcat 11 | */ 12 | public class Main { 13 | 14 | public static void main(String[] args) throws Exception { 15 | String contextPath = "/"; 16 | String appBase = "."; 17 | Tomcat tomcat = new Tomcat(); 18 | tomcat.setPort(8666); 19 | tomcat.getHost().setAppBase(appBase); 20 | StandardContext ctx=(StandardContext)tomcat.addWebapp(contextPath, appBase);//Context ctx = tomcat.addContext("/", new File(".").getAbsolutePath()); 21 | 22 | tomcat.start(); 23 | tomcat.getServer().await(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import org.apache.http.client.methods.CloseableHttpResponse; 4 | import org.apache.http.client.methods.HttpUriRequest; 5 | import org.apache.http.impl.client.CloseableHttpClient; 6 | 7 | import com.alibaba.fastjson.JSONObject; 8 | 9 | import junit.framework.Test; 10 | import junit.framework.TestCase; 11 | import junit.framework.TestSuite; 12 | 13 | /** 14 | * Unit test for simple App. 15 | */ 16 | public class AppTest 17 | extends TestCase 18 | { 19 | /** 20 | * Create the test case 21 | * 22 | * @param testName name of the test case 23 | */ 24 | public AppTest( String testName ) 25 | { 26 | super( testName ); 27 | } 28 | 29 | /** 30 | * @return the suite of tests being tested 31 | */ 32 | public static Test suite() 33 | { 34 | return new TestSuite( AppTest.class ); 35 | } 36 | 37 | /** 38 | * Rigourous Test :-) 39 | */ 40 | public void testApp() 41 | { 42 | assertTrue( true ); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 xbynet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crawler-selenium/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.github.xbynet 6 | crawler-parent 7 | 0.3.0 8 | 9 | crawler-selenium 10 | jar 11 | 12 | 13 | com.github.xbynet 14 | crawler-core 15 | ${project.version} 16 | 17 | 18 | org.seleniumhq.selenium 19 | selenium-java 20 | 21 | 22 | com.codeborne 23 | phantomjsdriver 24 | 25 | 26 | net.coobird 27 | thumbnailator 28 | 0.4.8 29 | 30 | 31 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Processor.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.io.Closeable; 4 | import java.io.IOException; 5 | 6 | import com.github.xbynet.crawler.http.FileDownloader; 7 | 8 | /** 9 | *爬虫页面处理器,撰写爬虫时需要扩展此类 10 | */ 11 | public abstract class Processor implements Closeable{ 12 | private FileDownloader fileDownloader=null; 13 | private Spider spider=null; 14 | 15 | public abstract void process(Response resp); 16 | 17 | public boolean download(Request req,String savePath){ 18 | return fileDownloader.download(req, savePath); 19 | } 20 | public boolean download(String url,String savePath){ 21 | Request req=new Request(url); 22 | return fileDownloader.download(req, savePath); 23 | } 24 | public FileDownloader getFileDownloader() { 25 | return fileDownloader; 26 | } 27 | 28 | public void setFileDownloader(FileDownloader fileDownloader) { 29 | this.fileDownloader = fileDownloader; 30 | } 31 | @Override 32 | public void close()throws IOException{ 33 | 34 | } 35 | 36 | public Spider getSpider() { 37 | return spider; 38 | } 39 | 40 | public void setSpider(Spider spider) { 41 | this.spider = spider; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/HelloServlet.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.servlet.ServletException; 6 | import javax.servlet.ServletOutputStream; 7 | import javax.servlet.annotation.WebServlet; 8 | import javax.servlet.http.HttpServlet; 9 | import javax.servlet.http.HttpServletRequest; 10 | import javax.servlet.http.HttpServletResponse; 11 | 12 | import com.github.xbynet.crawler.Spider; 13 | import com.github.xbynet.crawler.server.demo.GithubCrawler; 14 | import com.github.xbynet.crawler.server.monitor.SpiderManager; 15 | 16 | @WebServlet( 17 | name = "MyServlet", 18 | urlPatterns = {"/hello"} 19 | ) 20 | public class HelloServlet extends HttpServlet { 21 | 22 | @Override 23 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) 24 | throws ServletException, IOException { 25 | ServletOutputStream out = resp.getOutputStream(); 26 | Spider s=new GithubCrawler().createSpider(); 27 | SpiderManager.get().add(s); 28 | out.write(("add spider of "+s.getName()).getBytes()); 29 | out.flush(); 30 | out.close(); 31 | } 32 | 33 | } 
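Processor (above) is the extension point for writing a crawler: subclass it, implement process(Response), and start a Spider with the builder. A minimal sketch following the pattern of the bundled test crawlers; the URL and CSS selector are placeholders:

import com.github.xbynet.crawler.Processor;
import com.github.xbynet.crawler.Response;
import com.github.xbynet.crawler.Site;
import com.github.xbynet.crawler.Spider;
import com.github.xbynet.crawler.parser.JsoupParser;

public class MinimalCrawler extends Processor {
    @Override
    public void process(Response resp) {
        // Parse the downloaded page with the Jsoup-backed parser and print the
        // text of the first h1 element; the selector is a placeholder.
        JsoupParser parser = resp.html();
        System.out.println(parser.single("h1", "text"));
    }

    public static void main(String[] args) {
        Site site = new Site();
        Spider spider = Spider.builder(new MinimalCrawler())
                .threadNum(1)
                .site(site)
                .urls("https://example.com/")
                .build();
        spider.run();
    }
}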
-------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/utils/CrawlerUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.utils; 2 | 3 | import javax.script.Invocable; 4 | import javax.script.ScriptEngine; 5 | import javax.script.ScriptEngineManager; 6 | 7 | import org.apache.commons.lang3.StringUtils; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import com.github.xbynet.crawler.annotation.Nullable; 12 | 13 | public class CrawlerUtils { 14 | private static final Logger log=LoggerFactory.getLogger(CrawlerUtils.class); 15 | 16 | public static void sleep(int millis){ 17 | try { 18 | Thread.sleep(millis); 19 | } catch (InterruptedException e) { 20 | log.warn("",e); 21 | } 22 | } 23 | 24 | public Object executeJs(String js,@Nullable String funcName,Object... args){ 25 | ScriptEngineManager manager = new ScriptEngineManager(); 26 | ScriptEngine engine = manager.getEngineByName("javascript"); 27 | try { 28 | Object res=engine.eval(js); 29 | if(StringUtils.isNotBlank(funcName)){ 30 | if (engine instanceof Invocable) { 31 | Invocable invoke = (Invocable) engine; 32 | res = invoke.invokeFunction(funcName, args); 33 | } 34 | } 35 | return res; 36 | } catch (Exception e) { 37 | log.error("",e); 38 | } 39 | return null; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/parser/JsonPathParser.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.parser; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.jayway.jsonpath.JsonPath; 7 | import com.jayway.jsonpath.ReadContext; 8 | 9 | public class JsonPathParser implements Parser { 10 | private ReadContext ctx; 11 | 12 | public JsonPathParser(String raw) { 13 | this.ctx = JsonPath.parse(raw); 14 | } 15 | 16 | public String single(String jsonpath) { 17 | Object object = ctx.read(jsonpath); 18 | if (object == null) { 19 | return null; 20 | } 21 | if (object instanceof List) { 22 | List list = (List) object; 23 | if (list != null && list.size() > 0) { 24 | return list.get(0).toString(); 25 | } 26 | } 27 | return object.toString(); 28 | } 29 | 30 | public List list(String jsonpath) { 31 | List reslist = new ArrayList(); 32 | Object object = ctx.read(jsonpath); 33 | if (object == null) { 34 | return reslist; 35 | } 36 | if (object instanceof List) { 37 | List list = (List) object; 38 | for (Object item : list) { 39 | reslist.add(item.toString()); 40 | } 41 | } else { 42 | reslist.add(object.toString()); 43 | } 44 | return reslist; 45 | } 46 | 47 | public ReadContext getCtx() { 48 | return ctx; 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/parser/XpathParser.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.parser; 2 | 3 | import java.util.List; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | 9 | import us.codecraft.xsoup.XPathEvaluator; 10 | import us.codecraft.xsoup.Xsoup; 11 | 12 | public class XpathParser implements Parser{ 13 | 14 | private Document doc; 15 | 16 | public XpathParser(String raw) { 17 | this.doc=Jsoup.parse(raw); 18 
| } 19 | 20 | public String single(String xpathStr) { 21 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr); 22 | return xPathEvaluator.evaluate(doc).get(); 23 | } 24 | 25 | public List list(String xpathStr) { 26 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr); 27 | return xPathEvaluator.evaluate(doc).list(); 28 | } 29 | 30 | public Element element(String xpathStr) { 31 | List elements = elements(xpathStr); 32 | if (elements!=null && elements.size()>0){ 33 | return elements.get(0); 34 | } 35 | return null; 36 | } 37 | 38 | public List elements(String xpathStr) { 39 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr); 40 | return xPathEvaluator.evaluate(doc).getElements(); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/DefaultScheduler.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.scheduler; 2 | 3 | import java.util.Collections; 4 | import java.util.Set; 5 | import java.util.concurrent.BlockingQueue; 6 | import java.util.concurrent.ConcurrentHashMap; 7 | import java.util.concurrent.LinkedBlockingQueue; 8 | 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import com.github.xbynet.crawler.Const; 13 | import com.github.xbynet.crawler.ISpider; 14 | import com.github.xbynet.crawler.Request; 15 | 16 | public class DefaultScheduler implements Scheduler, DuplicateRemover { 17 | private final Logger log = LoggerFactory.getLogger(DefaultScheduler.class); 18 | private Set urls = Collections 19 | .newSetFromMap(new ConcurrentHashMap()); 20 | private BlockingQueue queue = new LinkedBlockingQueue(); 21 | 22 | public void push(Request request, ISpider spider) { 23 | if (Const.HttpMethod.POST == request.getMethod() 24 | || !isDuplicate(request, spider)) { 25 | log.debug("push to queue {}", request.getUrl()); 26 | queue.add(request); 27 | } 28 | } 29 | 30 | public Request poll(ISpider spider) { 31 | return queue.poll(); 32 | } 33 | 34 | public DuplicateRemover getDuplicateRemover(){ 35 | return this; 36 | } 37 | public boolean isDuplicate(Request request, ISpider spider) { 38 | return !urls.add(request.getUrl()); 39 | } 40 | 41 | public void resetDuplicateCheck(ISpider spider) { 42 | urls.clear(); 43 | } 44 | 45 | public int getTotalRequestsCount(ISpider spider) { 46 | return urls.size(); 47 | } 48 | 49 | public int getLeftRequestsCount(ISpider spider) { 50 | return queue.size(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/FileDownloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.IOException; 6 | 7 | import org.apache.commons.io.IOUtils; 8 | import org.apache.http.client.methods.CloseableHttpResponse; 9 | import org.apache.http.client.methods.HttpUriRequest; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import com.github.xbynet.crawler.Request; 14 | import com.github.xbynet.crawler.Response; 15 | import com.github.xbynet.crawler.Site; 16 | 17 | 18 | public class FileDownloader extends AbsDownloader{ 19 | private final Logger log=LoggerFactory.getLogger(FileDownloader.class); 20 | 21 | 22 | public boolean download(Request request,String savePath){ 
23 | log.debug("开始下载文件"+request.getUrl()+"到路径"+savePath); 24 | super.doDownload(request,savePath); 25 | File file=new File(savePath); 26 | return file.exists(); 27 | } 28 | @Override 29 | protected void process(HttpUriRequest httpUriRequest, 30 | CloseableHttpResponse resp, Request request, Site site,Response response,Object... extras) { 31 | if(resp==null){ 32 | log.error("文件"+httpUriRequest.getURI().toString()+"下载失败"); 33 | return; 34 | } 35 | String savePath=extras[0].toString(); 36 | File saveFile=new File(savePath); 37 | if(saveFile.exists()){ 38 | saveFile.delete(); 39 | } 40 | FileOutputStream fous=null; 41 | try { 42 | fous=new FileOutputStream(saveFile); 43 | IOUtils.copy(resp.getEntity().getContent(), fous); 44 | log.debug("文件"+httpUriRequest.getURI().toString()+"下载成功"); 45 | } catch (UnsupportedOperationException e) { 46 | log.error("",e); 47 | } catch (IOException e) { 48 | log.error("",e); 49 | }finally{ 50 | IOUtils.closeQuietly(fous); 51 | } 52 | } 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Site.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class Site { 7 | private String encoding="UTF-8"; 8 | private String ua="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; 9 | private int sleep=20; 10 | private int retry=3; 11 | private int retrySleep=500; 12 | private int timeout=30000; 13 | private Map headers=new HashMap(); 14 | 15 | public Site(){ 16 | getHeaders().put("User-Agent", ua); 17 | } 18 | public String getEncoding() { 19 | return encoding; 20 | } 21 | 22 | public Site setEncoding(String encoding) { 23 | this.encoding = encoding; 24 | return this; 25 | } 26 | 27 | public String getUa() { 28 | return ua; 29 | } 30 | 31 | public Site setUa(String ua) { 32 | getHeaders().put("User-Agent", ua); 33 | return this; 34 | } 35 | 36 | public int getSleep() { 37 | return sleep; 38 | } 39 | 40 | public Site setSleep(int sleep) { 41 | this.sleep = sleep; 42 | return this; 43 | } 44 | 45 | public int getRetry() { 46 | return retry; 47 | } 48 | 49 | public Site setRetry(int retry) { 50 | this.retry = retry; 51 | return this; 52 | } 53 | 54 | public int getRetrySleep() { 55 | return retrySleep; 56 | } 57 | 58 | public Site setRetrySleep(int retrySleep) { 59 | this.retrySleep = retrySleep; 60 | return this; 61 | } 62 | 63 | public int getTimeout() { 64 | return timeout; 65 | } 66 | 67 | public Site setTimeout(int timeout) { 68 | this.timeout = timeout; 69 | return this; 70 | } 71 | 72 | public Site setHeader(String name,String value){ 73 | getHeaders().put(name, value); 74 | return this; 75 | } 76 | public Map getHeaders() { 77 | return headers; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/CustomRedirectStrategy.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.net.URI; 4 | 5 | import org.apache.http.HttpRequest; 6 | import org.apache.http.HttpResponse; 7 | import org.apache.http.ProtocolException; 8 | import org.apache.http.client.methods.HttpGet; 9 | import org.apache.http.client.methods.HttpPost; 10 | import 
org.apache.http.client.methods.HttpRequestWrapper; 11 | import org.apache.http.client.methods.HttpUriRequest; 12 | import org.apache.http.impl.client.LaxRedirectStrategy; 13 | import org.apache.http.protocol.HttpContext; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | /** 18 | *支持post 302跳转策略实现类 19 | *HttpClient默认跳转:httpClientBuilder.setRedirectStrategy(new LaxRedirectStrategy()); 20 | *上述代码在post/redirect/post这种情况下不会传递原有请求的数据信息。所以参考了下SeimiCrawler这个项目的重定向策略。 21 | *原代码地址:https://github.com/zhegexiaohuozi/SeimiCrawler/blob/master/project/src/main/java/cn/wanghaomiao/seimi/http/hc/SeimiRedirectStrategy.java 22 | */ 23 | public class CustomRedirectStrategy extends LaxRedirectStrategy { 24 | private Logger logger = LoggerFactory.getLogger(getClass()); 25 | 26 | @Override 27 | public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException { 28 | URI uri = getLocationURI(request, response, context); 29 | String method = request.getRequestLine().getMethod(); 30 | if ("post".equalsIgnoreCase(method)) { 31 | try { 32 | HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request; 33 | httpRequestWrapper.setURI(uri); 34 | httpRequestWrapper.removeHeaders("Content-Length"); 35 | return httpRequestWrapper; 36 | } catch (Exception e) { 37 | logger.error("强转为HttpRequestWrapper出错"); 38 | } 39 | return new HttpPost(uri); 40 | } else { 41 | return new HttpGet(uri); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/WebDriverManager.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.io.Closeable; 4 | import java.io.IOException; 5 | 6 | import org.openqa.selenium.WebDriver; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | public class WebDriverManager implements Closeable{ 11 | private static final Logger log=LoggerFactory.getLogger(WebDriverManager.class); 12 | 13 | private WebDriverPool webDriverPool=null; 14 | 15 | public WebDriverManager(String phantomjsPath){ 16 | this.webDriverPool=new PhantomjsWebDriverPool(1,false,phantomjsPath); 17 | } 18 | public WebDriverManager(WebDriverPool webDriverPool){ 19 | this.webDriverPool=webDriverPool; 20 | } 21 | public void load(String url,int sleepTimeMillis,SeleniumAction... actions){ 22 | WebDriver driver=null; 23 | try { 24 | driver=webDriverPool.get(); 25 | driver.get(url); 26 | sleep(sleepTimeMillis); 27 | WebDriver.Options manage = driver.manage(); 28 | manage.window().maximize(); 29 | for(SeleniumAction action:actions){ 30 | action.execute(driver); 31 | } 32 | } catch (InterruptedException e) { 33 | e.printStackTrace(); 34 | log.error("",e); 35 | }finally{ 36 | if(driver!=null){ 37 | webDriverPool.returnToPool(driver); 38 | } 39 | } 40 | } 41 | public void load(SeleniumAction... 
actions){ 42 | WebDriver driver=null; 43 | try { 44 | driver=webDriverPool.get(); 45 | WebDriver.Options manage = driver.manage(); 46 | manage.window().maximize(); 47 | for(SeleniumAction action:actions){ 48 | action.execute(driver); 49 | } 50 | } catch (InterruptedException e) { 51 | e.printStackTrace(); 52 | log.error("",e); 53 | }finally{ 54 | if(driver!=null){ 55 | webDriverPool.returnToPool(driver); 56 | } 57 | } 58 | } 59 | public void shutDown(){ 60 | if(webDriverPool!=null){ 61 | webDriverPool.shutdown(); 62 | } 63 | } 64 | @Override 65 | public void close() throws IOException { 66 | shutDown(); 67 | } 68 | public void sleep(long millis){ 69 | try { 70 | Thread.sleep(millis); 71 | } catch (InterruptedException e) { 72 | e.printStackTrace(); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/jsp/new-employee.jsp: -------------------------------------------------------------------------------- 1 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c" %> 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 |
10 | 11 | 12 | 13 | 14 | 15 |

Employee

16 |
17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 |

36 | 37 |
38 |
39 |
40 | 41 | -------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/monitor/SpiderManager.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server.monitor; 2 | 3 | import java.util.concurrent.ConcurrentHashMap; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import com.github.xbynet.crawler.Spider; 9 | 10 | public class SpiderManager { 11 | private Logger log=LoggerFactory.getLogger(SpiderManager.class); 12 | 13 | private ConcurrentHashMap spiders=new ConcurrentHashMap<>(); 14 | 15 | private SpiderManager(){ 16 | 17 | } 18 | 19 | private static class SingleHolder{ 20 | static SpiderManager instance=new SpiderManager(); 21 | } 22 | 23 | public static SpiderManager get(){ 24 | return SingleHolder.instance; 25 | } 26 | 27 | public synchronized void add(Spider... spiders1){ 28 | for(Spider s:spiders1){ 29 | getSpiders().put(s.getName(),s); 30 | } 31 | } 32 | public synchronized Spider remove(String name){ 33 | return getSpiders().remove(name); 34 | } 35 | public synchronized void stopAll(){ 36 | for(String key:getSpiders().keySet()){ 37 | stop(key); 38 | } 39 | } 40 | public synchronized void startAll(){ 41 | for(String key:getSpiders().keySet()){ 42 | start(key); 43 | } 44 | } 45 | public String status(String name){ 46 | if(!getSpiders().containsKey(name)){ 47 | throw new IllegalArgumentException("the spider of "+name+" is not in manager"); 48 | } 49 | Spider spider=getSpiders().get(name); 50 | return spider.getState().name(); 51 | } 52 | 53 | public synchronized boolean stop(String name){ 54 | if(!getSpiders().containsKey(name)){ 55 | throw new IllegalArgumentException("the spider of "+name+" is not in manager"); 56 | } 57 | Spider spider=getSpiders().get(name); 58 | if(spider.isRunning()){ 59 | spider.stop(); 60 | return true; 61 | }else{ 62 | log.warn("illegal status "+spider.getState().name()+" for stop"); 63 | return false; 64 | } 65 | } 66 | public synchronized boolean start(String name){ 67 | if(!getSpiders().containsKey(name)){ 68 | throw new IllegalArgumentException("the spider of "+name+" is not in manager"); 69 | } 70 | Spider spider=getSpiders().get(name); 71 | if(spider.getState()==Spider.Status.NotRun){ 72 | spider.runAsync(); 73 | return true; 74 | } 75 | if(spider.isStopped()){ 76 | if(spider.isShutdownOnComplete()){ 77 | log.warn("spider of "+name+" setShutdownOnComplete=true, so it's not support restart"); 78 | return false; 79 | } 80 | spider.runAsync(); 81 | return true; 82 | } 83 | log.warn("illegal status "+spider.getState().name()+" for start"); 84 | return false; 85 | } 86 | 87 | public ConcurrentHashMap getSpiders() { 88 | return spiders; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/jsp/spider-list.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=utf-8" %> 2 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c" %> 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | 爬虫监控 15 | 16 | 17 | 18 |
19 |

爬虫监控

20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 42 | 43 | 44 | 45 |
标识 | 页面处理器类名 | 状态 | 操作 | 运行信息
${spider.name}${spider.processor}${spider.status} 39 | 41 | ${spider.info}
46 |
47 | 48 |
49 |
50 | 没有正在运行的爬虫 51 |
52 |
53 |
54 |
55 | 56 | 57 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/DefaultDownloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.commons.io.IOUtils; 10 | import org.apache.http.Header; 11 | import org.apache.http.HeaderElement; 12 | import org.apache.http.client.methods.CloseableHttpResponse; 13 | import org.apache.http.client.methods.HttpUriRequest; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import com.github.xbynet.crawler.Const; 18 | import com.github.xbynet.crawler.Request; 19 | import com.github.xbynet.crawler.Response; 20 | import com.github.xbynet.crawler.Site; 21 | 22 | public class DefaultDownloader extends AbsDownloader { 23 | private final Logger log = LoggerFactory.getLogger(DefaultDownloader.class); 24 | 25 | @Override 26 | public void download(Request request){ 27 | super.doDownload(request); 28 | } 29 | @Override 30 | protected void process(HttpUriRequest httpUriRequest, 31 | CloseableHttpResponse resp, Request request, Site site,Response response, 32 | Object... extras) { 33 | if (resp == null) { 34 | log.error(request.getUrl() + "请求失败"); 35 | return ; 36 | } 37 | response.setCode(resp.getStatusLine().getStatusCode()); 38 | response.setContentType(resp.getFirstHeader("Content-Type").getValue()); 39 | Const.ResponseType type = null; 40 | try { 41 | if (response.getContentType().contains("text") 42 | || response.getContentType().contains("json")) { 43 | type = Const.ResponseType.TEXT; 44 | String raw=IOUtils.toString(resp.getEntity().getContent(), 45 | request.getEncoding() != null ? 
request.getEncoding() 46 | : site.getEncoding()); 47 | response.setRaw(raw); 48 | } else { 49 | type = Const.ResponseType.BIN; 50 | response.setBytes(IOUtils.toByteArray(resp.getEntity() 51 | .getContent())); 52 | } 53 | } catch (UnsupportedOperationException e) { 54 | log.error("", e); 55 | } catch (IOException e) { 56 | log.error("", e); 57 | } 58 | response.setRespType(type); 59 | response.setRequest(request); 60 | 61 | Map> headers=new HashMap>(); 62 | for(Header header:resp.getAllHeaders()){ 63 | List value=new ArrayList(); 64 | HeaderElement[] hes=header.getElements(); 65 | if(hes!=null && hes.length>1){ 66 | for(HeaderElement e:hes){ 67 | value.add(e.getValue()); 68 | } 69 | }else{ 70 | value.add(header.getValue()); 71 | } 72 | headers.put(header.getName(), value); 73 | } 74 | response.setHeaders(headers); 75 | try { 76 | getSpider().getProcessor().process(response); 77 | } catch (Exception e) { 78 | log.error("",e); 79 | } 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/utils/CountableThreadPool.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.utils; 2 | 3 | import java.util.concurrent.ExecutorService; 4 | import java.util.concurrent.Executors; 5 | import java.util.concurrent.TimeUnit; 6 | import java.util.concurrent.atomic.AtomicInteger; 7 | import java.util.concurrent.locks.Condition; 8 | import java.util.concurrent.locks.ReentrantLock; 9 | 10 | 11 | public class CountableThreadPool { 12 | 13 | private int threadNum; 14 | 15 | private AtomicInteger threadAlive = new AtomicInteger(); 16 | 17 | private ReentrantLock reentrantLock = new ReentrantLock(); 18 | 19 | private Condition condition = reentrantLock.newCondition(); 20 | 21 | public CountableThreadPool(int threadNum) { 22 | this.threadNum = threadNum; 23 | this.executorService = Executors.newFixedThreadPool(threadNum); 24 | } 25 | 26 | public CountableThreadPool(int threadNum, ExecutorService executorService) { 27 | this.threadNum = threadNum; 28 | this.executorService = executorService; 29 | } 30 | 31 | public void setExecutorService(ExecutorService executorService) { 32 | this.executorService = executorService; 33 | } 34 | 35 | public int getThreadAlive() { 36 | return threadAlive.get(); 37 | } 38 | 39 | public int getThreadNum() { 40 | return threadNum; 41 | } 42 | 43 | private ExecutorService executorService; 44 | 45 | public void execute(final Runnable runnable) { 46 | 47 | 48 | if (threadAlive.get() >= threadNum) { 49 | try { 50 | reentrantLock.lock(); 51 | while (threadAlive.get() >= threadNum) { 52 | try { 53 | condition.await(); 54 | } catch (InterruptedException e) { 55 | } 56 | } 57 | } finally { 58 | reentrantLock.unlock(); 59 | } 60 | } 61 | threadAlive.incrementAndGet(); 62 | executorService.execute(new Runnable() { 63 | public void run() { 64 | try { 65 | runnable.run(); 66 | } finally { 67 | try { 68 | reentrantLock.lock(); 69 | threadAlive.decrementAndGet(); 70 | condition.signal(); 71 | } finally { 72 | reentrantLock.unlock(); 73 | } 74 | } 75 | } 76 | }); 77 | } 78 | 79 | public boolean isShutdown() { 80 | return executorService.isShutdown(); 81 | } 82 | 83 | public void shutdown() { 84 | executorService.shutdown(); 85 | } 86 | public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException{ 87 | return executorService.awaitTermination(timeout, unit); 88 | } 89 | 90 | 91 | } 92 | 
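A small usage sketch for CountableThreadPool above (task bodies invented for illustration): execute() blocks the submitting thread once threadNum tasks are already running, so submissions are throttled to the pool size:

import java.util.concurrent.TimeUnit;

import com.github.xbynet.crawler.utils.CountableThreadPool;

public class PoolDemo {
    public static void main(String[] args) throws InterruptedException {
        CountableThreadPool pool = new CountableThreadPool(2);
        for (int i = 0; i < 5; i++) {
            final int id = i;
            // With 2 worker slots, the third submission waits inside execute()
            // until one of the running tasks finishes.
            pool.execute(() -> {
                try {
                    Thread.sleep(200);
                } catch (InterruptedException ignored) {
                }
                System.out.println("task " + id + " done, alive=" + pool.getThreadAlive());
            });
        }
        pool.shutdown();
        pool.awaitTermination(5, TimeUnit.SECONDS);
    }
}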
-------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/monitor/MonitorServlet.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server.monitor; 2 | 3 | import java.io.IOException; 4 | import java.text.SimpleDateFormat; 5 | import java.util.ArrayList; 6 | import java.util.Date; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.concurrent.ConcurrentHashMap; 11 | 12 | import javax.servlet.ServletException; 13 | import javax.servlet.ServletOutputStream; 14 | import javax.servlet.annotation.WebServlet; 15 | import javax.servlet.http.HttpServlet; 16 | import javax.servlet.http.HttpServletRequest; 17 | import javax.servlet.http.HttpServletResponse; 18 | 19 | import org.apache.commons.lang3.StringUtils; 20 | 21 | import com.github.xbynet.crawler.Spider; 22 | 23 | @WebServlet( 24 | name = "MonitorServlet", 25 | urlPatterns = {"/monitor"} 26 | ) 27 | public class MonitorServlet extends HttpServlet{ 28 | 29 | @Override 30 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) 31 | throws ServletException, IOException { 32 | String method=req.getParameter("method"); 33 | String name=req.getParameter("name"); 34 | String uri=req.getRequestURI(); 35 | if(StringUtils.isBlank(method)){ 36 | List> infolist = new ArrayList<>(); 37 | ConcurrentHashMap spiders = SpiderManager.get() 38 | .getSpiders(); 39 | for (String key : spiders.keySet()) { 40 | Map map = new HashMap<>(); 41 | Spider spider = spiders.get(key); 42 | map.put("name", key); 43 | map.put("processor", spider.getProcessor().getClass().getName()); 44 | map.put("status", spider.getState().name().toLowerCase()); 45 | SimpleDateFormat sdf = new SimpleDateFormat( 46 | "yyyy-MM-dd HH:mm:ss"); 47 | Date start = spider.getStartTime(); 48 | Date end = spider.getEndTime(); 49 | end = end == null ? new Date() : end; 50 | long runsecs = start == null ? 0 : (end.getTime() - start 51 | .getTime()) / 1000; 52 | map.put("info", 53 | "开始时间:" 54 | + (start == null ? 
"无" : sdf.format(start)) 55 | + ",运行时间:" 56 | + runsecs 57 | + "秒," 58 | + "总请求数:" 59 | + spider.getScheduler().getTotalRequestsCount( 60 | spider) 61 | + ",剩余请求数:" 62 | + spider.getScheduler().getLeftRequestsCount( 63 | spider)); 64 | 65 | infolist.add(map); 66 | } 67 | req.setAttribute("root", req.getServletContext().getContextPath()); 68 | req.setAttribute("spiders", infolist); 69 | req.getRequestDispatcher("/jsp/spider-list.jsp").forward(req, resp); 70 | }else if(method.equals("start")){ 71 | outString(resp, String.valueOf(SpiderManager.get().start(name))); 72 | }else if(method.equals("stop")){ 73 | outString(resp, String.valueOf(SpiderManager.get().stop(name))); 74 | } 75 | } 76 | public void outString(HttpServletResponse resp,String content) throws IOException{ 77 | ServletOutputStream out = resp.getOutputStream(); 78 | out.write(content.getBytes()); 79 | out.flush(); 80 | out.close(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/ZhihuRecommendCrawler.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.Date; 8 | import java.util.concurrent.atomic.AtomicInteger; 9 | 10 | import org.apache.commons.io.FileUtils; 11 | import org.apache.commons.io.IOUtils; 12 | import org.jsoup.Jsoup; 13 | import org.jsoup.nodes.Element; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import com.github.xbynet.crawler.Const; 18 | import com.github.xbynet.crawler.Processor; 19 | import com.github.xbynet.crawler.Request; 20 | import com.github.xbynet.crawler.Response; 21 | import com.github.xbynet.crawler.Site; 22 | import com.github.xbynet.crawler.Spider; 23 | import com.github.xbynet.crawler.parser.JsonPathParser; 24 | 25 | public class ZhihuRecommendCrawler extends Processor{ 26 | private Logger log=LoggerFactory.getLogger(ZhihuRecommendCrawler.class); 27 | private AtomicInteger offset=new AtomicInteger(0); 28 | 29 | @Override 30 | public void process(Response resp) { 31 | String curUrl=resp.getRequest().getUrl(); 32 | JsonPathParser parser=resp.json(); 33 | int count=Integer.valueOf(parser.single("$.msg.length()")); 34 | if(count>0){ 35 | resp.addRequest(getPostRequest(offset.addAndGet(20))); 36 | } 37 | StringBuilder sb=new StringBuilder(); 38 | for(int i=0;i

"+title+"

"+authorAndInfo+"查看
"+content+"
\n"); 46 | } 47 | appendToFile(sb.toString()); 48 | 49 | } 50 | public void start() { 51 | Site site = new Site(); 52 | site.setHeader("Referer", "https://www.zhihu.com/explore/recommendations"); 53 | Spider spider = Spider.builder(this).threadNum(5).site(site) 54 | .requests(getPostRequest(0)).build(); 55 | spider.run(); 56 | appendToFile(""); 57 | } 58 | private Request getPostRequest(int offset){ 59 | Request req=new Request("https://www.zhihu.com/node/ExploreRecommendListV2"); 60 | req.setMethod(Const.HttpMethod.POST); 61 | req.setParams("method", "next"); 62 | req.setParams("params", "{\"limit\":20,\"offset\":"+offset+"}"); 63 | return req; 64 | } 65 | private synchronized void appendToFile(String content){ 66 | SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMdd-HH"); 67 | File f=new File("D:\\code\\test\\tweets\\"+sdf.format(new Date())+".zhihu.html"); 68 | if(!f.exists()){ 69 | try { 70 | f.createNewFile(); 71 | FileUtils.write(f, "","UTF-8"); 72 | } catch (IOException e) { 73 | e.printStackTrace(); 74 | } 75 | } 76 | FileWriter writer=null; 77 | try { 78 | writer=new FileWriter(f,true); 79 | writer.write(content); 80 | } catch (IOException e) { 81 | e.printStackTrace(); 82 | }finally{ 83 | IOUtils.closeQuietly(writer); 84 | } 85 | } 86 | public static void main(String[] args) { 87 | new ZhihuRecommendCrawler().start(); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/OSChinaTweetsCrawler.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.ArrayList; 8 | import java.util.Date; 9 | import java.util.List; 10 | import java.util.concurrent.atomic.AtomicInteger; 11 | 12 | import org.apache.commons.io.FileUtils; 13 | import org.apache.commons.io.IOUtils; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | 17 | import com.github.xbynet.crawler.Processor; 18 | import com.github.xbynet.crawler.Request; 19 | import com.github.xbynet.crawler.Response; 20 | import com.github.xbynet.crawler.Site; 21 | import com.github.xbynet.crawler.Spider; 22 | import com.github.xbynet.crawler.parser.JsoupParser; 23 | 24 | public class OSChinaTweetsCrawler extends Processor{ 25 | private final int maxPageCount=20; 26 | private final AtomicInteger count=new AtomicInteger(0); 27 | @Override 28 | public void process(Response resp) { 29 | synchronized (count) { 30 | if(count.get()>maxPageCount) 31 | return; 32 | } 33 | count.addAndGet(1); 34 | String currentUrl=resp.getRequest().getUrl(); 35 | JsoupParser parser=resp.html(); 36 | List lastIds=parser.list("span[data-last]","data-last"); 37 | String lastId=lastIds.get(lastIds.size()-1); 38 | String continueUrl="https://www.oschina.net/tweets?lastLogId="+lastId; 39 | Request req=new Request(continueUrl); 40 | req.setHeader("Referer", currentUrl); 41 | req.setHeader("X-Requested-With", "XMLHttpRequest"); 42 | resp.addRequest(req); 43 | 44 | StringBuilder sb=new StringBuilder(); 45 | List authors=parser.list(".tweetitem .box-fl > a","title"); 46 | List itemUrls=parser.list(".tweetitem .ti-toolbox a[title=\"查看详情\"]","href"); 47 | List itemContents=new ArrayList(itemUrls.size()); 48 | Elements els=parser.elements(".tweetitem"); 49 | for(Element e:els){ 50 | String tmp=e.select(".ti-content > .inner-content").first().html(); 
51 | itemContents.add(tmp.replace("src=\"/", "src=\"https://www.oschina.net/")); 52 | } 53 | for(int i=0;i"+authors.get(i)+"查看"+itemContents.get(i)+"\n"); 55 | } 56 | appendToFile(sb.toString()); 57 | } 58 | public void start() { 59 | Site site = new Site(); 60 | site.setEncoding("UTF-8"); 61 | site.setHeader("Referer", "https://www.oschina.net/"); 62 | Spider spider = Spider.builder(this).threadNum(1).site(site) 63 | .urls("https://www.oschina.net/tweets?nocache="+System.currentTimeMillis()).build(); 64 | spider.run(); 65 | appendToFile(""); 66 | } 67 | public static void main(String[] args) { 68 | new OSChinaTweetsCrawler().start(); 69 | } 70 | private synchronized void appendToFile(String content){ 71 | SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMdd-HH"); 72 | File f=new File("D:\\code\\test\\tweets\\"+sdf.format(new Date())+".oschina.html"); 73 | if(!f.exists()){ 74 | try { 75 | f.createNewFile(); 76 | FileUtils.write(f, "","UTF-8"); 77 | } catch (IOException e) { 78 | e.printStackTrace(); 79 | } 80 | } 81 | FileWriter writer=null; 82 | try { 83 | writer=new FileWriter(f,true); 84 | writer.write(content); 85 | } catch (IOException e) { 86 | e.printStackTrace(); 87 | }finally{ 88 | IOUtils.closeQuietly(writer); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/QiushibaikeCrawler.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.ArrayList; 8 | import java.util.Date; 9 | import java.util.List; 10 | import java.util.concurrent.atomic.AtomicInteger; 11 | 12 | import org.apache.commons.io.FileUtils; 13 | import org.apache.commons.io.IOUtils; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | 17 | import com.github.xbynet.crawler.Processor; 18 | import com.github.xbynet.crawler.Response; 19 | import com.github.xbynet.crawler.Site; 20 | import com.github.xbynet.crawler.Spider; 21 | import com.github.xbynet.crawler.parser.JsoupParser; 22 | 23 | public class QiushibaikeCrawler extends Processor{ 24 | @Override 25 | public void process(Response resp) { 26 | String currentUrl=resp.getRequest().getUrl(); 27 | JsoupParser parser=resp.html(); 28 | if(currentUrl.equals("https://www.qiushibaike.com/")){ 29 | int pageCount=Integer.valueOf(parser.single("ul.pagination > li:nth-last-child(2) .page-numbers","text").trim()); 30 | System.out.println("8hr共有"+pageCount+"页"); 31 | for(int i=2;i<=pageCount;i++){ 32 | resp.addRequest("https://www.qiushibaike.com/8hr/page/"+i+"/", false); 33 | } 34 | }else if(currentUrl.equals("https://www.qiushibaike.com/hot/")){ 35 | int pageCount=Integer.valueOf(parser.single("ul.pagination > li:nth-last-child(2) .page-numbers","text").trim()); 36 | System.out.println("hot共有"+pageCount+"页"); 37 | for(int i=2;i<=pageCount;i++){ 38 | resp.addRequest("https://www.qiushibaike.com/hot/page/"+i+"/", false); 39 | } 40 | } 41 | Elements els=parser.elements("#content-left > div"); 42 | StringBuilder sb=new StringBuilder(); 43 | for(Element e:els){ 44 | String author=e.select(".author > a:nth-child(2)").attr("title").trim(); 45 | String link="https://www.qiushibaike.com"+e.select(".contentHerf").attr("href"); 46 | String content=e.select(".contentHerf .content").html(); 47 | Elements thumbEls=e.select(".thumb"); 48 | 
if(thumbEls!=null && thumbEls.size()>0){ 49 | content+=thumbEls.get(0).outerHtml().replace("src=\"//", "src=\"http://"); 50 | } 51 | sb.append("
"+author+"查看"+content+"
\n"); 52 | 53 | } 54 | appendToFile(sb.toString()); 55 | } 56 | public void start() { 57 | Site site = new Site(); 58 | // site.setEncoding("UTF-8"); 59 | site.setHeader("Referer", "https://www.qiushibaike.com/"); 60 | Spider spider = Spider.builder(this).threadNum(1).site(site) 61 | .urls("https://www.qiushibaike.com/","https://www.qiushibaike.com/hot/").build(); 62 | spider.run(); 63 | appendToFile(""); 64 | } 65 | public static void main(String[] args) { 66 | new QiushibaikeCrawler().start(); 67 | } 68 | private synchronized void appendToFile(String content){ 69 | SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMdd-HH"); 70 | File f=new File("D:\\code\\test\\tweets\\"+sdf.format(new Date())+".qiushibaike.html"); 71 | if(!f.exists()){ 72 | try { 73 | f.createNewFile(); 74 | FileUtils.write(f, "","UTF-8"); 75 | } catch (IOException e) { 76 | e.printStackTrace(); 77 | } 78 | } 79 | FileWriter writer=null; 80 | try { 81 | writer=new FileWriter(f,true); 82 | writer.write(content); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | }finally{ 86 | IOUtils.closeQuietly(writer); 87 | } 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/SeleniumDownloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.openqa.selenium.By; 10 | import org.openqa.selenium.WebDriver; 11 | import org.openqa.selenium.WebElement; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import com.github.xbynet.crawler.Const; 16 | import com.github.xbynet.crawler.Request; 17 | import com.github.xbynet.crawler.Response; 18 | import com.github.xbynet.crawler.Spider; 19 | import com.github.xbynet.crawler.http.Downloader; 20 | 21 | public class SeleniumDownloader implements Downloader { 22 | private static final Logger log = LoggerFactory 23 | .getLogger(SeleniumDownloader.class); 24 | private int sleepTime = 3000;// 3s 25 | private SeleniumAction action = null; 26 | private WebDriverPool webDriverPool; 27 | private Spider spider; 28 | 29 | public SeleniumDownloader(WebDriverPool webDriverPool) { 30 | this.webDriverPool = webDriverPool; 31 | } 32 | 33 | public SeleniumDownloader(int sleepTime, WebDriverPool pool) { 34 | this(sleepTime, pool, null); 35 | } 36 | 37 | public SeleniumDownloader(int sleepTime, WebDriverPool pool, 38 | SeleniumAction action) { 39 | this.sleepTime = sleepTime; 40 | this.action = action; 41 | this.webDriverPool = pool; 42 | } 43 | 44 | public void setOperator(SeleniumAction action) { 45 | this.action = action; 46 | } 47 | 48 | @Override 49 | public void download(Request request) { 50 | WebDriver webDriver; 51 | try { 52 | webDriver = webDriverPool.get(); 53 | } catch (InterruptedException e) { 54 | log.warn("interrupted", e); 55 | return; 56 | } 57 | log.info("downloading page " + request.getUrl()); 58 | Response resp = new Response(); 59 | resp.setRequest(request); 60 | resp.setRespType(Const.ResponseType.TEXT); 61 | try { 62 | webDriver.get(request.getUrl()); 63 | Thread.sleep(sleepTime); 64 | } catch (Exception e) { 65 | log.error("", e); 66 | webDriverPool.close(webDriver); 67 | return; 68 | } 69 | try { 70 | WebDriver.Options manage = webDriver.manage(); 71 | manage.window().maximize(); 72 | 
if (action != null) { 73 | action.execute(webDriver); 74 | } 75 | SeleniumAction reqAction = null; 76 | if (request.getExtras() != null 77 | && request.getExtras().containsKey("action")) { 78 | reqAction = (SeleniumAction) request.getExtras().get("action"); 79 | } 80 | if (reqAction != null) { 81 | reqAction.execute(webDriver); 82 | } 83 | 84 | WebElement webElement = webDriver.findElement(By.xpath("/html")); 85 | String content = webElement.getAttribute("outerHTML"); 86 | 87 | resp.setRaw(content); 88 | Map> headers = new HashMap>(); 89 | List cookielist = new ArrayList(1); 90 | cookielist.add(WindowUtil.getHttpCookieString(webDriver.manage() 91 | .getCookies())); 92 | headers.put("Set-Cookie", cookielist); 93 | resp.setHeaders(headers); 94 | 95 | getSpider().getProcessor().process(resp); 96 | } catch (Exception e) { 97 | log.error("", e); 98 | } finally { 99 | webDriverPool.returnToPool(webDriver); 100 | } 101 | } 102 | public Spider getSpider() { 103 | return spider; 104 | } 105 | 106 | public void setSpider(Spider spider) { 107 | this.spider = spider; 108 | } 109 | 110 | @Override 111 | public void close() throws IOException { 112 | webDriverPool.shutdown(); 113 | } 114 | 115 | @Override 116 | public void init() { 117 | 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/parser/JsoupParser.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.parser; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.nodes.Node; 10 | import org.jsoup.nodes.TextNode; 11 | import org.jsoup.select.Elements; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import com.github.xbynet.crawler.Const; 16 | 17 | public class JsoupParser implements Parser { 18 | private static final Logger log = LoggerFactory 19 | .getLogger(JsoupParser.class); 20 | 21 | private Document doc; 22 | 23 | public JsoupParser(String raw) { 24 | doc=Jsoup.parse(raw); 25 | } 26 | 27 | public String single(String cssSelector) { 28 | Elements els = getDoc().select(cssSelector); 29 | if (els == null || els.size() == 0) { 30 | log.warn("所选元素不存在" + cssSelector); 31 | return null; 32 | } 33 | return getValue(getDoc().select(cssSelector).get(0), null); 34 | } 35 | 36 | public String single(String cssSelector, String attrName) { 37 | Elements els = getDoc().select(cssSelector); 38 | if (els == null || els.size() == 0) { 39 | log.warn("所选元素不存在" + cssSelector); 40 | return null; 41 | } 42 | return getValue(getDoc().select(cssSelector).get(0), attrName); 43 | } 44 | 45 | public List list(String cssSelector) { 46 | List reslist = new ArrayList(); 47 | Elements els = getDoc().select(cssSelector); 48 | if (els == null || els.size() == 0) { 49 | log.warn("所选元素不存在" + cssSelector); 50 | return reslist; 51 | } 52 | for (Element e : els) { 53 | reslist.add(getValue(e, null)); 54 | } 55 | return reslist; 56 | } 57 | 58 | public List list(String cssSelector, String attrName) { 59 | List reslist = new ArrayList(); 60 | Elements els = getDoc().select(cssSelector); 61 | if (els == null || els.size() == 0) { 62 | log.warn("所选元素不存在" + cssSelector); 63 | return reslist; 64 | } 65 | for (Element e : els) { 66 | reslist.add(getValue(e, attrName)); 67 | } 68 | return reslist; 69 | } 70 | 71 | private String getValue(Element 
element, String attrName) { 72 | if (attrName == null) { 73 | return element.outerHtml(); 74 | } else if ("innerHtml".equalsIgnoreCase(attrName)) { 75 | return element.html(); 76 | } else if ("text".equalsIgnoreCase(attrName)) { 77 | return getText(element); 78 | } else if ("allText".equalsIgnoreCase(attrName)) { 79 | return element.text(); 80 | } else { 81 | return element.attr(attrName); 82 | } 83 | } 84 | 85 | protected String getText(Element element) { 86 | StringBuilder accum = new StringBuilder(); 87 | for (Node node : element.childNodes()) { 88 | if (node instanceof TextNode) { 89 | TextNode textNode = (TextNode) node; 90 | accum.append(textNode.text()); 91 | } 92 | } 93 | return accum.toString(); 94 | } 95 | 96 | public Element element(String cssSelector) { 97 | Elements els = getDoc().select(cssSelector); 98 | if (els == null || els.size() == 0) { 99 | log.warn("所选元素不存在" + cssSelector); 100 | return null; 101 | } 102 | return els.get(0); 103 | } 104 | 105 | public Elements elements(String cssSelector) { 106 | Elements els = getDoc().select(cssSelector); 107 | return els; 108 | } 109 | public String script(String cssSelector) { 110 | return single(cssSelector,Const.CssAttr.innerHtml.name()); 111 | } 112 | public List scripts(String cssSelector) { 113 | return list(cssSelector,Const.CssAttr.innerHtml.name()); 114 | } 115 | 116 | public Document getDoc() { 117 | return doc; 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /crawler-server/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.github.xbynet 6 | crawler-parent 7 | 0.3.0 8 | 9 | crawler-server 10 | jar 11 | 12 | 7.0.57 13 | compile 14 | 15 | 16 | 17 | com.github.xbynet 18 | crawler-core 19 | ${project.version} 20 | 21 | 22 | org.apache.tomcat.embed 23 | tomcat-embed-core 24 | ${tomcat.version} 25 | ${myscope} 26 | 27 | 28 | org.apache.tomcat.embed 29 | tomcat-embed-logging-juli 30 | ${tomcat.version} 31 | ${myscope} 32 | 33 | 34 | org.apache.tomcat.embed 35 | tomcat-embed-jasper 36 | ${tomcat.version} 37 | ${myscope} 38 | 39 | 40 | org.apache.tomcat 41 | tomcat-jasper 42 | ${tomcat.version} 43 | ${myscope} 44 | 45 | 46 | org.apache.tomcat 47 | tomcat-jasper-el 48 | ${tomcat.version} 49 | ${myscope} 50 | 51 | 52 | org.apache.tomcat 53 | tomcat-jsp-api 54 | ${tomcat.version} 55 | ${myscope} 56 | 57 | 58 | jstl 59 | jstl 60 | 1.2 61 | 62 | 63 | 64 | 65 | pkg 66 | 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-assembly-plugin 71 | 72 | 73 | jar-with-dependencies 74 | 75 | crawler-server-${project.version} 76 | 77 | 78 | com.github.xbynet.crawler.server.Main 79 | 80 | 81 | 82 | 83 | 84 | package 85 | 86 | single 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | crawler-server 97 | 98 | 99 | src/main/webapp 100 | META-INF/resources 101 | 102 | 103 | 104 | 105 | org.apache.maven.plugins 106 | maven-compiler-plugin 107 | 2.3.2 108 | true 109 | 110 | 1.7 111 | 1.7 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/HttpClientFactory.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.IOException; 4 | import java.security.KeyManagementException; 5 | import java.security.KeyStoreException; 6 | import java.security.NoSuchAlgorithmException; 7 | import java.security.cert.X509Certificate; 8 | 9 | 
import javax.net.ssl.SSLContext; 10 | 11 | import org.apache.http.HttpException; 12 | import org.apache.http.HttpRequest; 13 | import org.apache.http.HttpRequestInterceptor; 14 | import org.apache.http.config.Registry; 15 | import org.apache.http.config.RegistryBuilder; 16 | import org.apache.http.config.SocketConfig; 17 | import org.apache.http.conn.socket.ConnectionSocketFactory; 18 | import org.apache.http.conn.socket.PlainConnectionSocketFactory; 19 | import org.apache.http.conn.ssl.NoopHostnameVerifier; 20 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory; 21 | import org.apache.http.impl.client.CloseableHttpClient; 22 | import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; 23 | import org.apache.http.impl.client.HttpClients; 24 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; 25 | import org.apache.http.protocol.HttpContext; 26 | import org.apache.http.ssl.SSLContexts; 27 | import org.apache.http.ssl.TrustStrategy; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | 31 | 32 | public class HttpClientFactory { 33 | private static final Logger log=LoggerFactory.getLogger(HttpClientFactory.class); 34 | 35 | public CloseableHttpClient getClient(){ 36 | return getClient(30000, 3); 37 | } 38 | public CloseableHttpClient getClient(int timeout,int retry){ 39 | RegistryBuilder registryBuilder = RegistryBuilder.create(); 40 | registryBuilder.register("http", PlainConnectionSocketFactory.INSTANCE); 41 | // Fixing: https://code.google.com/p/crawler4j/issues/detail?id=174 42 | // By always trusting the ssl certificate 43 | SSLContext sslContext=null; 44 | try { 45 | sslContext = SSLContexts.custom().loadTrustMaterial(null, new TrustStrategy() { 46 | public boolean isTrusted(final X509Certificate[] chain, String authType) { 47 | return true; 48 | } 49 | }).build(); 50 | } catch (KeyManagementException e) { 51 | log.error("",e); 52 | } catch (NoSuchAlgorithmException e) { 53 | log.error("",e); 54 | } catch (KeyStoreException e) { 55 | log.error("",e); 56 | } 57 | SSLConnectionSocketFactory sslsf=new SSLConnectionSocketFactory(sslContext, NoopHostnameVerifier.INSTANCE); 58 | registryBuilder.register("https", sslsf); 59 | Registry registry = registryBuilder.build(); 60 | //设置连接管理器 61 | PoolingHttpClientConnectionManager poolingHttpClientConnectionManager = new PoolingHttpClientConnectionManager(registry); 62 | poolingHttpClientConnectionManager.setMaxTotal(500); 63 | poolingHttpClientConnectionManager.setDefaultMaxPerRoute(1000); 64 | 65 | SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); 66 | socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); 67 | socketConfigBuilder.setSoTimeout(timeout); 68 | SocketConfig socketConfig = socketConfigBuilder.build(); 69 | //构建客户端 70 | CloseableHttpClient client= HttpClients.custom().setConnectionManager(poolingHttpClientConnectionManager) 71 | .addInterceptorFirst(new HttpRequestInterceptor() { 72 | 73 | public void process( 74 | final HttpRequest request, 75 | final HttpContext context) throws HttpException, IOException { 76 | if (!request.containsHeader("Accept-Encoding")) { 77 | request.addHeader("Accept-Encoding", "gzip"); 78 | } 79 | } 80 | }) 81 | .setDefaultSocketConfig(socketConfig) 82 | .setRetryHandler(new DefaultHttpRequestRetryHandler(retry, true)) 83 | .build(); 84 | return client; 85 | } 86 | 87 | 88 | } 89 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/NeihanshequCrawler.java: 
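A minimal standalone sketch of using the HttpClientFactory above outside the Spider pipeline: the factory builds a pooled client that trusts all SSL certificates and asks for gzip. The demo class name and URL are illustrative, not part of the repository.

```java
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;

import com.github.xbynet.crawler.http.HttpClientFactory;

public class HttpClientFactoryDemo {
    public static void main(String[] args) throws Exception {
        // 15 s socket timeout, 2 retries; the no-arg getClient() defaults to 30000 ms / 3 retries.
        CloseableHttpClient client = new HttpClientFactory().getClient(15000, 2);
        HttpGet get = new HttpGet("https://github.com/xbynet");
        try (CloseableHttpResponse resp = client.execute(get)) {
            System.out.println("status: " + resp.getStatusLine().getStatusCode());
            System.out.println("body length: " + EntityUtils.toString(resp.getEntity(), "UTF-8").length());
        } finally {
            client.close();
        }
    }
}
```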
-------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.math.BigDecimal; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Date; 9 | import java.util.List; 10 | import java.util.concurrent.atomic.AtomicInteger; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | import org.apache.commons.io.FileUtils; 15 | import org.apache.commons.io.IOUtils; 16 | 17 | import com.github.xbynet.crawler.Processor; 18 | import com.github.xbynet.crawler.Response; 19 | import com.github.xbynet.crawler.Site; 20 | import com.github.xbynet.crawler.Spider; 21 | import com.github.xbynet.crawler.parser.JsonPathParser; 22 | import com.github.xbynet.crawler.parser.JsoupParser; 23 | 24 | public class NeihanshequCrawler extends Processor{ 25 | private static final int maxCount=100; 26 | private AtomicInteger count=new AtomicInteger(0); 27 | 28 | @Override 29 | public void process(Response resp) { 30 | String currentUrl=resp.getRequest().getUrl(); 31 | 32 | if(currentUrl.equals("http://neihanshequ.com/")){ 33 | JsoupParser parser=resp.html(); 34 | List scripts=parser.scripts("script"); 35 | for(String str:scripts){ 36 | if(str.contains("var gListViewConfig")){ 37 | Pattern p=Pattern.compile("max_time: '(.*?)',",Pattern.MULTILINE); 38 | Matcher m=p.matcher(str); 39 | if(m.find()){ 40 | String maxTime=m.group(1); 41 | if(maxTime.contains(".")){ 42 | maxTime=maxTime.split("\\.")[0]; 43 | } 44 | if(count.getAndIncrement()<=maxCount){ 45 | resp.addRequest("http://neihanshequ.com/joke/?is_json=1&app_name=neihanshequ_web&max_time="+maxTime, true); 46 | } 47 | return; 48 | } 49 | break; 50 | } 51 | } 52 | }else{ 53 | JsonPathParser parser=resp.json(); 54 | String maxTime=parser.single("$.data.max_time"); 55 | if(maxTime.contains("E")){ 56 | maxTime=new BigDecimal(maxTime).toPlainString(); 57 | } 58 | if(count.getAndIncrement()<=maxCount){ 59 | resp.addRequest("http://neihanshequ.com/joke/?is_json=1&app_name=neihanshequ_web&max_time="+maxTime, true); 60 | } 61 | StringBuilder sb=new StringBuilder(); 62 | int size=Integer.valueOf(parser.single("$.data.data.length()")); 63 | for(int i=0;i"+author+"查看"+content+"\n"); 68 | } 69 | appendToFile(sb.toString()); 70 | } 71 | } 72 | public void start() { 73 | Site site = new Site(); 74 | // site.setEncoding("UTF-8"); 75 | site.setHeader("Referer", "http://neihanshequ.com/"); 76 | Spider spider = Spider.builder(this).threadNum(1).site(site) 77 | .urls("http://neihanshequ.com/").build(); 78 | spider.run(); 79 | appendToFile(""); 80 | } 81 | public static void main(String[] args) { 82 | new NeihanshequCrawler().start(); 83 | } 84 | private synchronized void appendToFile(String content){ 85 | SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMdd-HH"); 86 | File f=new File("D:\\code\\test\\tweets\\"+sdf.format(new Date())+".neihanshequ.html"); 87 | if(!f.exists()){ 88 | try { 89 | f.createNewFile(); 90 | FileUtils.write(f, "","UTF-8"); 91 | } catch (IOException e) { 92 | e.printStackTrace(); 93 | } 94 | } 95 | FileWriter writer=null; 96 | try { 97 | writer=new FileWriter(f,true); 98 | writer.write(content); 99 | } catch (IOException e) { 100 | e.printStackTrace(); 101 | }finally{ 102 | IOUtils.closeQuietly(writer); 103 | } 104 | } 105 | 106 | } -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Response.java: 
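NeihanshequCrawler above pages through the joke endpoint by pulling max_time out of each JSON response with expressions such as $.data.max_time and $.data.data.length(). A hedged sketch of the same evaluation done directly with the com.jayway.jsonpath library declared in the parent pom; JsonPathParser presumably wraps equivalent calls, and the JSON literal and item paths below are invented for illustration.

```java
import java.util.List;

import com.jayway.jsonpath.JsonPath;

public class JsonPathDemo {
    public static void main(String[] args) {
        String json = "{\"data\":{\"max_time\":1495000000,"
                + "\"data\":[{\"group\":{\"text\":\"hi\"}},{\"group\":{\"text\":\"ho\"}}]}}";
        // Scalar read, as used for the paging cursor.
        Object maxTime = JsonPath.read(json, "$.data.max_time");
        // Array read; the crawler counts entries with $.data.data.length().
        List<Object> items = JsonPath.read(json, "$.data.data");
        // Nested read; this path shape belongs to the demo JSON, not to the real API.
        String firstText = JsonPath.read(json, "$.data.data[0].group.text");
        System.out.println(maxTime + " / " + items.size() + " / " + firstText);
    }
}
```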
-------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import com.github.xbynet.crawler.parser.JsonPathParser; 8 | import com.github.xbynet.crawler.parser.JsoupParser; 9 | import com.github.xbynet.crawler.parser.XpathParser; 10 | import com.github.xbynet.crawler.utils.BeanUtil; 11 | 12 | public class Response { 13 | private int code; 14 | private String contentType; 15 | private Map> headers; 16 | private Const.ResponseType respType; 17 | private String raw;//如果respType为Const.ResponseType.TEXT,则有值 18 | private byte[] bytes;//如果respType为Const.ResponseType.BIN,则有值 19 | private Request request; 20 | private List continueRequest; 21 | private Response parentResponse=null;//用于分块时 22 | 23 | public Response(){ 24 | 25 | } 26 | public Response(Response parent){ 27 | this.parentResponse=parent; 28 | } 29 | public JsoupParser html(){ 30 | return new JsoupParser(raw); 31 | } 32 | public JsoupParser xml(){ 33 | return new JsoupParser(raw); 34 | } 35 | public JsonPathParser json(){ 36 | //处理jsonp的情形 37 | if(!raw.startsWith("{")&&!raw.startsWith("[")){ 38 | raw=raw.substring(raw.indexOf("(")+1,raw.length()-1); 39 | } 40 | return new JsonPathParser(raw); 41 | } 42 | public XpathParser xpath(){ 43 | return new XpathParser(raw); 44 | } 45 | 46 | public String getRaw(){ 47 | return raw; 48 | } 49 | public Response setRaw(String raw) { 50 | this.raw = raw; 51 | return this; 52 | } 53 | public int getCode() { 54 | return code; 55 | } 56 | public Response setCode(int code) { 57 | this.code = code; 58 | return this; 59 | } 60 | public String getContentType() { 61 | return contentType; 62 | } 63 | public Response setContentType(String contentType) { 64 | this.contentType = contentType; 65 | return this; 66 | } 67 | public Map> getHeaders() { 68 | return headers; 69 | } 70 | public Response setHeaders(Map> headers) { 71 | this.headers = headers; 72 | return this; 73 | } 74 | public Const.ResponseType getRespType() { 75 | return respType; 76 | } 77 | public Response setRespType(Const.ResponseType respType) { 78 | this.respType = respType; 79 | return this; 80 | } 81 | public byte[] getBytes() { 82 | return bytes; 83 | } 84 | public Response setBytes(byte[] bytes) { 85 | this.bytes = bytes; 86 | return this; 87 | } 88 | public Request getRequest() { 89 | return request; 90 | } 91 | public Response setRequest(Request request) { 92 | this.request = request; 93 | return this; 94 | } 95 | 96 | public Response addRequest(String url,boolean copyParent){ 97 | if(continueRequest==null){ 98 | continueRequest=new ArrayList(); 99 | } 100 | Request req=new Request(); 101 | if(copyParent){ 102 | BeanUtil.copyProperties(request, req); 103 | } 104 | req.setUrl(url); 105 | continueRequest.add(req); 106 | return this; 107 | } 108 | public Response addRequest(Request req){ 109 | if(continueRequest==null){ 110 | continueRequest=new ArrayList(); 111 | } 112 | continueRequest.add(req); 113 | return this; 114 | } 115 | public List getContinueReqeusts(){ 116 | return continueRequest; 117 | } 118 | public Response addPartRequest(String url,boolean copyParent){ 119 | Request req=new Request(); 120 | if(copyParent){ 121 | //不支持分块嵌套分块 122 | if(parentResponse==null){ 123 | BeanUtil.copyProperties(request, req); 124 | }else{ 125 | BeanUtil.copyProperties(parentResponse.getRequest(),req); 126 | } 127 | } 128 | req.setUrl(url); 129 | req.setPartRequest(null); 130 | return this; 131 | } 
132 | public Response addPartRequest(Request req){ 133 | if(parentResponse==null){ 134 | request.getPartRequest().add(req); 135 | }else{ 136 | parentResponse.getRequest().getPartRequest().add(req); 137 | } 138 | return this; 139 | } 140 | public boolean isPartResponse(){ 141 | return parentResponse!=null; 142 | } 143 | public Response getParentResponse() { 144 | return parentResponse; 145 | } 146 | 147 | } 148 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Request.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.beans.Transient; 4 | import java.io.Serializable; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import org.apache.http.HttpEntity; 11 | import org.apache.http.client.protocol.HttpClientContext; 12 | 13 | import com.alibaba.fastjson.annotation.JSONField; 14 | 15 | public class Request implements Serializable{ 16 | private String url; 17 | private String encoding; 18 | private Const.HttpMethod method=Const.HttpMethod.GET; 19 | 20 | private int retrySleepTime=-1;//millis 21 | private int retryCount=-1;//millis 22 | 23 | private Map headers=new HashMap(); 24 | private Map params=new HashMap(); 25 | /**可以在添加请求时附加额外信息*/ 26 | private Map extras=new HashMap(); 27 | 28 | private transient HttpClientContext ctx; 29 | 30 | /** 31 | * support for json,xml or more,在post时,设置此选项会使params参数失效。 32 | */ 33 | private transient HttpEntity entity; 34 | 35 | private RequestAction action; 36 | 37 | /**支持存在分块请求的情形,(比如一篇文章需要翻多页抓取,歌手信息不分布在多个页面中)*/ 38 | private List partRequest=new ArrayList(); 39 | /**是否分块*/ 40 | private boolean supportPart=false; 41 | 42 | public Request(){ 43 | 44 | } 45 | public Request(String url){ 46 | this.url=url; 47 | } 48 | public Const.HttpMethod getMethod() { 49 | return method; 50 | } 51 | public Request setMethod(Const.HttpMethod method) { 52 | this.method = method; 53 | return this; 54 | } 55 | public Map getHeaders() { 56 | return headers; 57 | } 58 | public Request setHeader(String key,String value) { 59 | headers.put(key, value); 60 | return this; 61 | } 62 | public Map getParams() { 63 | return params; 64 | } 65 | public Request setParams(String key,String value) { 66 | params.put(key, value); 67 | return this; 68 | } 69 | public Map getExtras() { 70 | return extras; 71 | } 72 | public Request setExtras(Map extras) { 73 | this.extras=extras; 74 | return this; 75 | } 76 | public Request putExtra(String key,String value) { 77 | extras.put(key, value); 78 | return this; 79 | } 80 | 81 | public HttpClientContext getCtx() { 82 | return ctx; 83 | } 84 | public Request setCtx(HttpClientContext ctx) { 85 | this.ctx = ctx; 86 | return this; 87 | } 88 | 89 | public HttpEntity getEntity() { 90 | return entity; 91 | } 92 | public Request setEntity(HttpEntity entity) { 93 | this.entity = entity; 94 | return this; 95 | } 96 | public String getEncoding() { 97 | return encoding; 98 | } 99 | public Request setEncoding(String encoding) { 100 | this.encoding = encoding; 101 | return this; 102 | } 103 | 104 | public int getRetryCount() { 105 | return retryCount; 106 | } 107 | public Request setRetryCount(int retryCount) { 108 | this.retryCount = retryCount; 109 | return this; 110 | } 111 | public int getRetrySleepTime() { 112 | return retrySleepTime; 113 | } 114 | public Request setRetrySleepTime(int retrySleepTime) { 115 | 
this.retrySleepTime = retrySleepTime; 116 | return this; 117 | } 118 | public RequestAction getAction() { 119 | return action; 120 | } 121 | public Request setAction(RequestAction action) { 122 | this.action = action; 123 | return this; 124 | } 125 | public String getUrl() { 126 | return url; 127 | } 128 | public void setUrl(String url) { 129 | this.url = url; 130 | } 131 | 132 | public List getPartRequest() { 133 | return partRequest; 134 | } 135 | public Request setPartRequest(List list) { 136 | this.partRequest=list; 137 | return this; 138 | } 139 | public void addPartRequest(Request req) { 140 | this.partRequest.add(req); 141 | supportPart=true; 142 | } 143 | 144 | @Override 145 | public String toString() { 146 | return "Request [url=" + url + ", encoding=" + encoding + ", method=" 147 | + method + ", retrySleepTime=" + retrySleepTime 148 | + ", retryCount=" + retryCount + ", headers=" + headers 149 | + ", params=" + params + ", extras=" + extras + ", ctx=" + ctx 150 | + ", entity=" + entity + ", action=" + action + "]"; 151 | } 152 | public boolean isSupportPart() { 153 | return supportPart; 154 | } 155 | 156 | } 157 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/RedisScheduler.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.scheduler; 2 | 3 | import java.io.UnsupportedEncodingException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.concurrent.ConcurrentHashMap; 8 | 9 | import org.apache.commons.codec.digest.DigestUtils; 10 | import org.apache.commons.lang3.SerializationUtils; 11 | import org.apache.http.HttpEntity; 12 | import org.apache.http.client.protocol.HttpClientContext; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | import redis.clients.jedis.Jedis; 17 | import redis.clients.jedis.JedisPool; 18 | import redis.clients.jedis.JedisPoolConfig; 19 | 20 | import com.alibaba.fastjson.JSON; 21 | import com.github.xbynet.crawler.Const; 22 | import com.github.xbynet.crawler.ISpider; 23 | import com.github.xbynet.crawler.Request; 24 | import com.github.xbynet.crawler.RequestAction; 25 | /** 26 | * Use Redis as url scheduler for distributed crawlers.
27 | * 28 | * @author code4crafter@gmail.com
29 | * @since 0.2.0 30 | */ 31 | public class RedisScheduler implements Scheduler, DuplicateRemover { 32 | private Logger log=LoggerFactory.getLogger(RedisScheduler.class); 33 | 34 | protected JedisPool pool; 35 | 36 | private static final String QUEUE_PREFIX = "queue_"; 37 | 38 | private static final String SET_PREFIX = "set_"; 39 | 40 | private static final String ITEM_PREFIX = "item_"; 41 | 42 | 43 | public RedisScheduler(String host) { 44 | this(new JedisPool(new JedisPoolConfig(), host)); 45 | } 46 | 47 | public RedisScheduler(JedisPool pool) { 48 | this.pool = pool; 49 | } 50 | 51 | @Override 52 | public void resetDuplicateCheck(ISpider spider) { 53 | Jedis jedis = pool.getResource(); 54 | try { 55 | jedis.del(getSetKey(spider)); 56 | } finally { 57 | jedis.close(); 58 | } 59 | } 60 | 61 | @Override 62 | public boolean isDuplicate(Request request, ISpider spider) { 63 | Jedis jedis = pool.getResource(); 64 | try { 65 | return jedis.sadd(getSetKey(spider), request.getUrl()) > 0; 66 | } finally { 67 | jedis.close(); 68 | } 69 | 70 | } 71 | 72 | @Override 73 | public void push(Request request, ISpider spider) { 74 | Jedis jedis = pool.getResource(); 75 | if (Const.HttpMethod.POST == request.getMethod() 76 | || !isDuplicate(request, spider)) { 77 | log.debug("push to queue {}", request.getUrl()); 78 | try { 79 | jedis.rpush(getQueueKey(spider), request.getUrl()); 80 | String field = DigestUtils.md5Hex(request.getUrl()); 81 | byte[] data=SerializationUtils.serialize(request); 82 | jedis.hset((ITEM_PREFIX + spider.getName()).getBytes(), field.getBytes(), data); 83 | } finally { 84 | jedis.close(); 85 | } 86 | } 87 | } 88 | 89 | @Override 90 | public synchronized Request poll(ISpider spider) { 91 | Jedis jedis = pool.getResource(); 92 | try { 93 | String url = jedis.lpop(getQueueKey(spider)); 94 | if (url == null) { 95 | return null; 96 | } 97 | String key = ITEM_PREFIX + spider.getName(); 98 | String field = DigestUtils.md5Hex(url); 99 | byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); 100 | Request request=SerializationUtils.deserialize(bytes); 101 | return request; 102 | } finally { 103 | jedis.close(); 104 | } 105 | } 106 | 107 | protected String getSetKey(ISpider spider) { 108 | return SET_PREFIX + spider.getName(); 109 | } 110 | 111 | protected String getQueueKey(ISpider spider) { 112 | return QUEUE_PREFIX + spider.getName(); 113 | } 114 | 115 | protected String getItemKey(ISpider spider) 116 | { 117 | return ITEM_PREFIX + spider.getName(); 118 | } 119 | 120 | @Override 121 | public int getLeftRequestsCount(ISpider spider) { 122 | Jedis jedis = pool.getResource(); 123 | try { 124 | Long size = jedis.llen(getQueueKey(spider)); 125 | return size.intValue(); 126 | } finally { 127 | jedis.close(); 128 | } 129 | } 130 | 131 | @Override 132 | public int getTotalRequestsCount(ISpider spider) { 133 | Jedis jedis = pool.getResource(); 134 | try { 135 | Long size = jedis.scard(getSetKey(spider)); 136 | return size.intValue(); 137 | } finally { 138 | jedis.close(); 139 | } 140 | } 141 | 142 | 143 | @Override 144 | public DuplicateRemover getDuplicateRemover() { 145 | return this; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/PhantomjsWebDriverPool.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Set; 5 | import 
java.util.concurrent.BlockingDeque; 6 | import java.util.concurrent.LinkedBlockingDeque; 7 | import java.util.concurrent.TimeUnit; 8 | import java.util.concurrent.atomic.AtomicBoolean; 9 | import java.util.concurrent.atomic.AtomicInteger; 10 | 11 | import org.openqa.selenium.WebDriver; 12 | import org.openqa.selenium.phantomjs.PhantomJSDriver; 13 | import org.openqa.selenium.phantomjs.PhantomJSDriverService; 14 | import org.openqa.selenium.remote.DesiredCapabilities; 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | /** 19 | * @author taojw 20 | */ 21 | public class PhantomjsWebDriverPool implements WebDriverPool { 22 | private Logger logger = LoggerFactory.getLogger(getClass()); 23 | 24 | private int CAPACITY = 5; 25 | private AtomicInteger refCount = new AtomicInteger(0); 26 | private static final String DRIVER_PHANTOMJS = "phantomjs"; 27 | 28 | /** 29 | * store webDrivers available 30 | */ 31 | private BlockingDeque innerQueue = new LinkedBlockingDeque( 32 | CAPACITY); 33 | 34 | private AtomicBoolean shutdowned = new AtomicBoolean(false); 35 | 36 | private String PHANTOMJS_PATH; 37 | private DesiredCapabilities caps = DesiredCapabilities.phantomjs(); 38 | 39 | public PhantomjsWebDriverPool(String phantomjsPath) { 40 | this(5, false, phantomjsPath); 41 | } 42 | 43 | /** 44 | * 45 | * @param poolsize 46 | * @param loadImg 47 | * 是否加载图片,默认不加载 48 | */ 49 | public PhantomjsWebDriverPool(int poolsize, boolean loadImg, 50 | String phantomjsPath) { 51 | this.CAPACITY = poolsize; 52 | innerQueue = new LinkedBlockingDeque(poolsize); 53 | PHANTOMJS_PATH = phantomjsPath; 54 | caps.setJavascriptEnabled(true); 55 | caps.setCapability("webStorageEnabled", true); 56 | caps.setCapability( 57 | PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, 58 | PHANTOMJS_PATH); 59 | // caps.setCapability("takesScreenshot", false); 60 | caps.setCapability( 61 | PhantomJSDriverService.PHANTOMJS_PAGE_CUSTOMHEADERS_PREFIX 62 | + "User-Agent", 63 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"); 64 | ArrayList cliArgsCap = new ArrayList(); 65 | // http://phantomjs.org/api/command-line.html 66 | cliArgsCap.add("--web-security=false"); 67 | cliArgsCap.add("--ssl-protocol=any"); 68 | cliArgsCap.add("--ignore-ssl-errors=true"); 69 | if (loadImg) { 70 | cliArgsCap.add("--load-images=true"); 71 | } else { 72 | cliArgsCap.add("--load-images=false"); 73 | } 74 | caps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, 75 | cliArgsCap); 76 | caps.setCapability( 77 | PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS, 78 | new String[] { "--logLevel=INFO" }); 79 | 80 | } 81 | 82 | public WebDriver get() throws InterruptedException { 83 | WebDriver poll = innerQueue.poll(); 84 | if (poll != null) { 85 | return poll; 86 | } 87 | if (refCount.get() < CAPACITY) { 88 | synchronized (innerQueue) { 89 | if (refCount.get() < CAPACITY) { 90 | 91 | WebDriver mDriver = new PhantomJSDriver(caps); 92 | // 尝试性解决:https://github.com/ariya/phantomjs/issues/11526问题 93 | mDriver.manage().timeouts() 94 | .pageLoadTimeout(60, TimeUnit.SECONDS); 95 | // mDriver.manage().window().setSize(new Dimension(1366, 96 | // 768)); 97 | innerQueue.add(mDriver); 98 | refCount.incrementAndGet(); 99 | } 100 | } 101 | } 102 | return innerQueue.take(); 103 | } 104 | 105 | public void returnToPool(WebDriver webDriver) { 106 | if (shutdowned.get()) { 107 | webDriver.quit(); 108 | webDriver = null; 109 | } else { 110 | Set handles = webDriver.getWindowHandles(); 
111 | if (handles.size() > 1) { 112 | int index = 0; 113 | for (String handle : handles) { 114 | if (index == 0) { 115 | index++; 116 | continue; 117 | } 118 | WindowUtil.changeWindowTo(webDriver, handle); 119 | webDriver.close(); 120 | index++; 121 | } 122 | } 123 | synchronized (shutdowned) { 124 | if(!shutdowned.get()){ 125 | innerQueue.add(webDriver); 126 | }else{ 127 | webDriver.quit(); 128 | webDriver = null; 129 | } 130 | } 131 | } 132 | } 133 | 134 | public void close(WebDriver webDriver) { 135 | refCount.decrementAndGet(); 136 | webDriver.quit(); 137 | webDriver = null; 138 | } 139 | 140 | public void shutdown() { 141 | synchronized (shutdowned) { 142 | shutdowned.set(true); 143 | } 144 | try { 145 | for (WebDriver driver : innerQueue) { 146 | close(driver); 147 | } 148 | innerQueue.clear(); 149 | refCount.set(0); 150 | } catch (Exception e) { 151 | logger.warn("webdriverpool关闭失败", e); 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/GithubCrawler.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.nio.file.Paths; 4 | import java.util.List; 5 | import java.util.Map; 6 | import java.util.UUID; 7 | 8 | import org.apache.commons.io.IOUtils; 9 | import org.apache.http.client.methods.CloseableHttpResponse; 10 | import org.apache.http.client.methods.HttpUriRequest; 11 | import org.apache.http.client.protocol.HttpClientContext; 12 | import org.apache.http.impl.client.BasicCookieStore; 13 | import org.apache.http.impl.client.CloseableHttpClient; 14 | 15 | import com.github.xbynet.crawler.Const; 16 | import com.github.xbynet.crawler.Processor; 17 | import com.github.xbynet.crawler.Request; 18 | import com.github.xbynet.crawler.RequestAction; 19 | import com.github.xbynet.crawler.Response; 20 | import com.github.xbynet.crawler.Site; 21 | import com.github.xbynet.crawler.Spider; 22 | import com.github.xbynet.crawler.http.DefaultDownloader; 23 | import com.github.xbynet.crawler.http.FileDownloader; 24 | import com.github.xbynet.crawler.http.HttpClientFactory; 25 | import com.github.xbynet.crawler.parser.JsoupParser; 26 | import com.github.xbynet.crawler.scheduler.DefaultScheduler; 27 | 28 | public class GithubCrawler extends Processor { 29 | @Override 30 | public void process(Response resp) { 31 | String currentUrl = resp.getRequest().getUrl(); 32 | System.out.println("CurrentUrl:" + currentUrl); 33 | int respCode = resp.getCode(); 34 | System.out.println("ResponseCode:" + respCode); 35 | System.out.println("type:" + resp.getRespType().name()); 36 | String contentType = resp.getContentType(); 37 | System.out.println("ContentType:" + contentType); 38 | Map> headers = resp.getHeaders(); 39 | System.out.println("ResonseHeaders:"); 40 | for (String key : headers.keySet()) { 41 | List values=headers.get(key); 42 | for(String str:values){ 43 | System.out.println(key + ":" +str); 44 | } 45 | } 46 | JsoupParser parser = resp.html(); 47 | // suppport parted ,分块抓取是会有个parent response来关联所有分块response 48 | // System.out.println("isParted:"+resp.isPartResponse()); 49 | // Response parent=resp.getParentResponse(); 50 | // resp.addPartRequest(null); 51 | //Map extras=resp.getRequest().getExtras(); 52 | 53 | if (currentUrl.equals("https://github.com/xbynet")) { 54 | String avatar = parser.single("img.avatar", "src"); 55 | String dir = System.getProperty("java.io.tmpdir"); 56 | String savePath = Paths.get(dir, 
UUID.randomUUID().toString()) 57 | .toString(); 58 | boolean avatarDownloaded = download(avatar, savePath); 59 | System.out.println("avatar:" + avatar + ", saved:" + savePath); 60 | // System.out.println("avtar downloaded status:"+avatarDownloaded); 61 | String name = parser.single(".vcard-names > .vcard-fullname", 62 | "text"); 63 | System.out.println("name:" + name); 64 | List reponames = parser.list( 65 | ".pinned-repos-list .repo.js-repo", "text"); 66 | List repoUrls = parser.list( 67 | ".pinned-repo-item .d-block >a", "href"); 68 | System.out.println("reponame:url"); 69 | if (reponames != null) { 70 | for (int i = 0; i < reponames.size(); i++) { 71 | String tmpUrl="https://github.com"+repoUrls.get(i); 72 | System.out.println(reponames.get(i) + ":"+tmpUrl); 73 | Request req=new Request(tmpUrl).putExtra("name", reponames.get(i)); 74 | resp.addRequest(req); 75 | } 76 | } 77 | }else{ 78 | Map extras=resp.getRequest().getExtras(); 79 | String name=extras.get("name").toString(); 80 | System.out.println("repoName:"+name); 81 | String shortDesc=parser.single(".repository-meta-content","allText"); 82 | System.out.println("shortDesc:"+shortDesc); 83 | } 84 | } 85 | 86 | public void start() { 87 | Site site = new Site(); 88 | Spider spider = Spider.builder(this).threadNum(5).site(site) 89 | .urls("https://github.com/xbynet").build(); 90 | spider.run(); 91 | } 92 | 93 | public void startCompleteConfig() { 94 | String pcUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; 95 | String androidUA = "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"; 96 | 97 | Site site = new Site(); 98 | site.setEncoding("UTF-8").setHeader("Referer", "https://github.com/") 99 | .setRetry(3).setRetrySleep(3000).setSleep(50).setTimeout(30000) 100 | .setUa(pcUA); 101 | 102 | Request request = new Request("https://github.com/xbynet"); 103 | HttpClientContext ctx = new HttpClientContext(); 104 | BasicCookieStore cookieStore = new BasicCookieStore(); 105 | ctx.setCookieStore(cookieStore); 106 | request.setAction(new RequestAction() { 107 | @Override 108 | public void before(CloseableHttpClient client, HttpUriRequest req) { 109 | System.out.println("before-haha"); 110 | } 111 | 112 | @Override 113 | public void after(CloseableHttpClient client, 114 | CloseableHttpResponse resp) { 115 | System.out.println("after-haha"); 116 | } 117 | }).setCtx(ctx).setEncoding("GBK") 118 | .putExtra("somekey", "我是可以在response中使用的extras哦") 119 | .setHeader("User-Agent", pcUA).setMethod(Const.HttpMethod.GET) 120 | .setPartRequest(null).setEntity(null) 121 | .setParams("appkeyqqqqqq", "1213131232141").setRetryCount(5) 122 | .setRetrySleepTime(10000); 123 | 124 | Spider spider = Spider.builder(this).threadNum(5) 125 | .name("Spider-github-xbynet") 126 | .defaultDownloader(new DefaultDownloader()) 127 | .fileDownloader(new FileDownloader()) 128 | .httpClientFactory(new HttpClientFactory()).ipProvider(null) 129 | .listener(null).pool(null).scheduler(new DefaultScheduler()) 130 | .shutdownOnComplete(true).site(site).build(); 131 | spider.run(); 132 | } 133 | 134 | public static void main(String[] args) { 135 | new GithubCrawler().start(); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/demo/GithubCrawler.java: 
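The GithubCrawler test above also demonstrates the extras round-trip: putExtra(...) attaches data to a follow-up Request, and the value is read back from getExtras() when that request's Response is processed. A stripped-down sketch of just that pattern, assuming process(Response) is the only method a Processor subclass must implement; the class name and URLs are illustrative.

```java
import com.github.xbynet.crawler.Processor;
import com.github.xbynet.crawler.Request;
import com.github.xbynet.crawler.Response;
import com.github.xbynet.crawler.Site;
import com.github.xbynet.crawler.Spider;

public class ExtrasDemo extends Processor {
    @Override
    public void process(Response resp) {
        if (resp.getRequest().getExtras().containsKey("label")) {
            // Second pass: the value attached below travelled with the queued request.
            System.out.println("label=" + resp.getRequest().getExtras().get("label"));
            return;
        }
        // First pass: queue a follow-up request carrying extra data.
        Request next = new Request("https://github.com/xbynet/crawler").putExtra("label", "repo page");
        resp.addRequest(next);
    }

    public static void main(String[] args) {
        Spider.builder(new ExtrasDemo()).threadNum(1).site(new Site())
                .urls("https://github.com/xbynet").build().run();
    }
}
```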
-------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server.demo; 2 | 3 | import java.nio.file.Paths; 4 | import java.util.List; 5 | import java.util.Map; 6 | import java.util.UUID; 7 | 8 | import org.apache.http.client.methods.CloseableHttpResponse; 9 | import org.apache.http.client.methods.HttpUriRequest; 10 | import org.apache.http.client.protocol.HttpClientContext; 11 | import org.apache.http.impl.client.BasicCookieStore; 12 | import org.apache.http.impl.client.CloseableHttpClient; 13 | 14 | import com.github.xbynet.crawler.Const; 15 | import com.github.xbynet.crawler.Processor; 16 | import com.github.xbynet.crawler.Request; 17 | import com.github.xbynet.crawler.RequestAction; 18 | import com.github.xbynet.crawler.Response; 19 | import com.github.xbynet.crawler.Site; 20 | import com.github.xbynet.crawler.Spider; 21 | import com.github.xbynet.crawler.http.DefaultDownloader; 22 | import com.github.xbynet.crawler.http.FileDownloader; 23 | import com.github.xbynet.crawler.http.HttpClientFactory; 24 | import com.github.xbynet.crawler.parser.JsoupParser; 25 | import com.github.xbynet.crawler.scheduler.DefaultScheduler; 26 | 27 | public class GithubCrawler extends Processor { 28 | @Override 29 | public void process(Response resp) { 30 | String currentUrl = resp.getRequest().getUrl(); 31 | System.out.println("CurrentUrl:" + currentUrl); 32 | int respCode = resp.getCode(); 33 | System.out.println("ResponseCode:" + respCode); 34 | System.out.println("type:" + resp.getRespType().name()); 35 | String contentType = resp.getContentType(); 36 | System.out.println("ContentType:" + contentType); 37 | Map> headers = resp.getHeaders(); 38 | System.out.println("ResonseHeaders:"); 39 | for (String key : headers.keySet()) { 40 | List values=headers.get(key); 41 | for(String str:values){ 42 | System.out.println(key + ":" +str); 43 | } 44 | } 45 | JsoupParser parser = resp.html(); 46 | // suppport parted ,分块抓取是会有个parent response来关联所有分块response 47 | // System.out.println("isParted:"+resp.isPartResponse()); 48 | // Response parent=resp.getParentResponse(); 49 | // resp.addPartRequest(null); 50 | //Map extras=resp.getRequest().getExtras(); 51 | 52 | if (currentUrl.equals("https://github.com/xbynet")) { 53 | String avatar = parser.single("img.avatar", "src"); 54 | String dir = System.getProperty("java.io.tmpdir"); 55 | String savePath = Paths.get(dir, UUID.randomUUID().toString()) 56 | .toString(); 57 | boolean avatarDownloaded = download(avatar, savePath); 58 | System.out.println("avatar:" + avatar + ", saved:" + savePath); 59 | // System.out.println("avtar downloaded status:"+avatarDownloaded); 60 | String name = parser.single(".vcard-names > .vcard-fullname", 61 | "text"); 62 | System.out.println("name:" + name); 63 | List reponames = parser.list( 64 | ".pinned-repos-list .repo.js-repo", "text"); 65 | List repoUrls = parser.list( 66 | ".pinned-repo-item .d-block >a", "href"); 67 | System.out.println("reponame:url"); 68 | if (reponames != null) { 69 | for (int i = 0; i < reponames.size(); i++) { 70 | String tmpUrl="https://github.com"+repoUrls.get(i); 71 | System.out.println(reponames.get(i) + ":"+tmpUrl); 72 | Request req=new Request(tmpUrl).putExtra("name", reponames.get(i)); 73 | resp.addRequest(req); 74 | } 75 | } 76 | }else{ 77 | Map extras=resp.getRequest().getExtras(); 78 | String name=extras.get("name").toString(); 79 | System.out.println("repoName:"+name); 80 | String shortDesc=parser.single(".repository-meta-content","allText"); 81 | 
System.out.println("shortDesc:"+shortDesc); 82 | } 83 | } 84 | 85 | public Spider createSpider() { 86 | Site site = new Site(); 87 | Spider spider = Spider.builder(this).threadNum(5).site(site) 88 | .urls("https://github.com/xbynet").build(); 89 | // spider.run(); 90 | return spider; 91 | } 92 | 93 | public void startCompleteConfig() { 94 | String pcUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; 95 | String androidUA = "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"; 96 | 97 | Site site = new Site(); 98 | site.setEncoding("UTF-8").setHeader("Referer", "https://github.com/") 99 | .setRetry(3).setRetrySleep(3000).setSleep(50).setTimeout(30000) 100 | .setUa(pcUA); 101 | 102 | Request request = new Request("https://github.com/xbynet"); 103 | HttpClientContext ctx = new HttpClientContext(); 104 | BasicCookieStore cookieStore = new BasicCookieStore(); 105 | ctx.setCookieStore(cookieStore); 106 | request.setAction(new RequestAction() { 107 | @Override 108 | public void before(CloseableHttpClient client, HttpUriRequest req) { 109 | System.out.println("before-haha"); 110 | } 111 | 112 | @Override 113 | public void after(CloseableHttpClient client, 114 | CloseableHttpResponse resp) { 115 | System.out.println("after-haha"); 116 | } 117 | }).setCtx(ctx).setEncoding("GBK") 118 | .putExtra("somekey", "我是可以在response中使用的extras哦") 119 | .setHeader("User-Agent", pcUA).setMethod(Const.HttpMethod.GET) 120 | .setPartRequest(null).setEntity(null) 121 | .setParams("appkeyqqqqqq", "1213131232141").setRetryCount(5) 122 | .setRetrySleepTime(10000); 123 | 124 | Spider spider = Spider.builder(this).threadNum(5) 125 | .name("Spider-github-xbynet") 126 | .defaultDownloader(new DefaultDownloader()) 127 | .fileDownloader(new FileDownloader()) 128 | .httpClientFactory(new HttpClientFactory()).ipProvider(null) 129 | .listener(null).pool(null).scheduler(new DefaultScheduler()) 130 | .shutdownOnComplete(true).site(site).build(); 131 | spider.run(); 132 | } 133 | 134 | public static void main(String[] args) { 135 | // new GithubCrawler().start(); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawler 2 | A simple and flexible web crawler framework for java. 
3 | 4 | ## Features: 5 | 1、Code is easy to understand and customized (代码简单易懂,可定制性强) 6 | 2、Api is simple and easy to use 7 | 3、Support File download、Content part fetch.(支持文件下载、分块抓取)     8 | 4、Request And Response support much options、strong customizable.(请求和响应支持的内容和选项比较丰富、每个请求可定制性强) 9 | 5、Support do your own operation before or after network request in downloader(支持网络请求前后执行自定义操作)       10 | 6、Selenium+PhantomJS support   11 | 7、Redis support 12 | 13 | ## Future: 14 | 1、Complete the code comment and test(完善代码注释和完善测试代码)   15 | 16 | ## Install: 17 | 18 | The only module that must be added is crawler-core 19 | ```xml 20 | 21 | com.github.xbynet 22 | crawler-core 23 | 0.3.0 24 | 29 | com.github.xbynet 30 | crawler-selenium 31 | 0.3.0 32 | > headers = resp.getHeaders(); 57 | System.out.println("ResonseHeaders:"); 58 | for (String key : headers.keySet()) { 59 | List values=headers.get(key); 60 | for(String str:values){ 61 | System.out.println(key + ":" +str); 62 | } 63 | } 64 | JsoupParser parser = resp.html(); 65 | // suppport parted ,分块抓取是会有个parent response来关联所有分块response 66 | // System.out.println("isParted:"+resp.isPartResponse()); 67 | // Response parent=resp.getParentResponse(); 68 | // resp.addPartRequest(null); 69 | //Map extras=resp.getRequest().getExtras(); 70 | 71 | if (currentUrl.equals("https://github.com/xbynet")) { 72 | String avatar = parser.single("img.avatar", "src"); 73 | String dir = System.getProperty("java.io.tmpdir"); 74 | String savePath = Paths.get(dir, UUID.randomUUID().toString()) 75 | .toString(); 76 | boolean avatarDownloaded = download(avatar, savePath); 77 | System.out.println("avatar:" + avatar + ", saved:" + savePath); 78 | // System.out.println("avtar downloaded status:"+avatarDownloaded); 79 | String name = parser.single(".vcard-names > .vcard-fullname", 80 | "text"); 81 | System.out.println("name:" + name); 82 | List reponames = parser.list( 83 | ".pinned-repos-list .repo.js-repo", "text"); 84 | List repoUrls = parser.list( 85 | ".pinned-repo-item .d-block >a", "href"); 86 | System.out.println("reponame:url"); 87 | if (reponames != null) { 88 | for (int i = 0; i < reponames.size(); i++) { 89 | String tmpUrl="https://github.com"+repoUrls.get(i); 90 | System.out.println(reponames.get(i) + ":"+tmpUrl); 91 | Request req=new Request(tmpUrl).putExtra("name", reponames.get(i)); 92 | resp.addRequest(req); 93 | } 94 | } 95 | }else{ 96 | Map extras=resp.getRequest().getExtras(); 97 | String name=extras.get("name").toString(); 98 | System.out.println("repoName:"+name); 99 | String shortDesc=parser.single(".repository-meta-content","allText"); 100 | System.out.println("shortDesc:"+shortDesc); 101 | } 102 | } 103 | 104 | public void start() { 105 | Site site = new Site(); 106 | Spider spider = Spider.builder(this).threadNum(5).site(site) 107 | .urls("https://github.com/xbynet").build(); 108 | spider.run(); 109 | } 110 | 111 | public static void main(String[] args) { 112 | new GithubCrawler().start(); 113 | } 114 | 115 | 116 | public void startCompleteConfig() { 117 | String pcUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; 118 | String androidUA = "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"; 119 | 120 | Site site = new Site(); 121 | site.setEncoding("UTF-8").setHeader("Referer", "https://github.com/") 122 | .setRetry(3).setRetrySleep(3000).setSleep(50).setTimeout(30000) 123 | .setUa(pcUA); 124 | 125 | 
Request request = new Request("https://github.com/xbynet"); 126 | HttpClientContext ctx = new HttpClientContext(); 127 | BasicCookieStore cookieStore = new BasicCookieStore(); 128 | ctx.setCookieStore(cookieStore); 129 | request.setAction(new RequestAction() { 130 | @Override 131 | public void before(CloseableHttpClient client, HttpUriRequest req) { 132 | System.out.println("before-haha"); 133 | } 134 | 135 | @Override 136 | public void after(CloseableHttpClient client, 137 | CloseableHttpResponse resp) { 138 | System.out.println("after-haha"); 139 | } 140 | }).setCtx(ctx).setEncoding("UTF-8") 141 | .putExtra("somekey", "I can use in the response by your own") 142 | .setHeader("User-Agent", pcUA).setMethod(Const.HttpMethod.GET) 143 | .setPartRequest(null).setEntity(null) 144 | .setParams("appkeyqqqqqq", "1213131232141").setRetryCount(5) 145 | .setRetrySleepTime(10000); 146 | 147 | Spider spider = Spider.builder(this).threadNum(5) 148 | .name("Spider-github-xbynet") 149 | .defaultDownloader(new DefaultDownloader()) 150 | .fileDownloader(new FileDownloader()) 151 | .httpClientFactory(new HttpClientFactory()).ipProvider(null) 152 | .listener(null).pool(null).scheduler(new DefaultScheduler()) 153 | .shutdownOnComplete(true).site(site).build(); 154 | spider.run(); 155 | } 156 | 157 | 158 | } 159 | 160 | ``` 161 | ## Examples: 162 | 163 | - Github(github个人项目信息) 164 | - OSChinaTweets(开源中国动弹) 165 | - Qiushibaike(醜事百科) 166 | - Neihanshequ(内涵段子) 167 | - ZihuRecommend(知乎推荐) 168 | 169 | **More Examples:** Please see [here](https://github.com/xbynet/crawler/tree/master/crawler-core/src/test/java/net/xby1993/crawler) 170 | 171 | ## Thanks: 172 | [webmagic](https://github.com/code4craft/webmagic):本项目借鉴了webmagic多处代码,设计上也作了较多参考,非常感谢。 173 | [xsoup](https://github.com/code4craft/xsoup):本项目使用xsoup作为底层xpath处理器   174 | [JsonPath](https://github.com/json-path/JsonPath):本项目使用JsonPath作为底层jsonpath处理器 175 | [Jsoup](https://jsoup.org/) 本项目使用Jsoup作为底层HTML/XML处理器 176 | [HttpClient](http://hc.apache.org/) 本项目使用HttpClient作为底层网络请求工具 177 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/WindowUtil.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.util.Set; 7 | import java.util.concurrent.TimeUnit; 8 | 9 | import org.apache.commons.io.FileUtils; 10 | import org.apache.commons.io.IOUtils; 11 | import org.openqa.selenium.By; 12 | import org.openqa.selenium.Cookie; 13 | import org.openqa.selenium.Dimension; 14 | import org.openqa.selenium.JavascriptExecutor; 15 | import org.openqa.selenium.NoSuchElementException; 16 | import org.openqa.selenium.OutputType; 17 | import org.openqa.selenium.TakesScreenshot; 18 | import org.openqa.selenium.WebDriver; 19 | import org.openqa.selenium.WebElement; 20 | import org.openqa.selenium.interactions.Actions; 21 | import org.slf4j.Logger; 22 | import org.slf4j.LoggerFactory; 23 | 24 | 25 | /** 26 | * @author taojw 27 | * 28 | */ 29 | public class WindowUtil { 30 | private static final Logger log=LoggerFactory.getLogger(WindowUtil.class); 31 | /** 32 | * 窗口最大化 33 | * @param driver 34 | */ 35 | public static void maximize(WebDriver driver){ 36 | WebDriver.Options manage = driver.manage(); 37 | // manage.window().maximize(); 38 | manage.window().setSize(new Dimension(1920,1080)); 39 | 
driver.navigate().refresh(); 40 | } 41 | /** 42 | * 滚动窗口。 43 | * @param driver 44 | * @param height 45 | */ 46 | public static void scroll(WebDriver driver,int height){ 47 | ((JavascriptExecutor)driver).executeScript("window.scrollTo(0,"+height+" );"); 48 | } 49 | /** 50 | * 重新调整窗口大小,以适应页面,需要耗费一定时间。建议等待合理的时间。 51 | * @param driver 52 | */ 53 | public static void loadAll(WebDriver driver){ 54 | Dimension od=driver.manage().window().getSize(); 55 | int width=driver.manage().window().getSize().width; 56 | //尝试性解决:https://github.com/ariya/phantomjs/issues/11526问题 57 | driver.manage().timeouts().pageLoadTimeout(60, TimeUnit.SECONDS); 58 | long height=(Long)((JavascriptExecutor)driver).executeScript("return document.body.scrollHeight;"); 59 | driver.manage().window().setSize(new Dimension(width, (int)height)); 60 | driver.navigate().refresh(); 61 | } 62 | public static void refresh(WebDriver driver){ 63 | driver.navigate().refresh(); 64 | } 65 | public static void taskScreenShot(WebDriver driver,File saveFile){ 66 | if(saveFile.exists()){ 67 | saveFile.delete(); 68 | } 69 | byte[] src=((TakesScreenshot)driver).getScreenshotAs(OutputType.BYTES);//.FILE);linux下非root用户,java创建临时文件存在问题 70 | log.info("截图文件字节长度"+src.length); 71 | try { 72 | FileUtils.writeByteArrayToFile(saveFile, src); 73 | } catch (IOException e) { 74 | e.printStackTrace(); 75 | log.error("截图写入失败",e); 76 | } 77 | } 78 | public static void changeWindow(WebDriver driver){ 79 | // 获取当前页面句柄 80 | String handle = driver.getWindowHandle(); 81 | // 获取所有页面的句柄,并循环判断不是当前的句柄,就做选取switchTo() 82 | for (String handles : driver.getWindowHandles()) { 83 | if (handles.equals(handle)) 84 | continue; 85 | driver.switchTo().window(handles); 86 | } 87 | } 88 | public static void changeWindowTo(WebDriver driver,String handle){ 89 | for (String tmp : driver.getWindowHandles()) { 90 | if (tmp.equals(handle)){ 91 | driver.switchTo().window(handle); 92 | break; 93 | } 94 | } 95 | } 96 | 97 | /** 98 | * 打开一个新tab页,返回该tab页的windowhandle 99 | * @param driver 100 | * @param url 101 | * @return 102 | */ 103 | public static String openNewTab(WebDriver driver,String url){ 104 | Set strSet1=driver.getWindowHandles(); 105 | ((JavascriptExecutor)driver).executeScript("window.open('"+url+"','_blank');"); 106 | sleep(1000); 107 | Set strSet2=driver.getWindowHandles(); 108 | for(String tmp:strSet2){ 109 | if(!strSet1.contains(tmp)){ 110 | return tmp; 111 | } 112 | } 113 | return null; 114 | } 115 | public static void sleep(long millis){ 116 | try { 117 | Thread.sleep(millis); 118 | } catch (InterruptedException e) { 119 | e.printStackTrace(); 120 | } 121 | } 122 | /** 123 | * 操作关闭模态窗口 124 | * @param driver 125 | * @param type 如Id,ClassName 126 | * @param sel 选择器 127 | */ 128 | public static void clickModal(WebDriver driver,String type,String sel){ 129 | String js="document.getElementsBy"+type+"('"+sel+"')[0].click();"; 130 | ((JavascriptExecutor)driver).executeScript(js); 131 | } 132 | 133 | /** 134 | * 判断一个元素是否存在 135 | * @param driver 136 | * @param by 137 | * @return 138 | */ 139 | public static boolean checkElementExists(WebDriver driver,By by){ 140 | try{ 141 | driver.findElement(by); 142 | return true; 143 | }catch(NoSuchElementException e){ 144 | return false; 145 | } 146 | } 147 | /** 148 | * 点击一个元素 149 | * @param driver 150 | * @param by 151 | */ 152 | public static void clickElement(WebDriver driver,By by){ 153 | WebElement tmp=driver.findElement(by); 154 | Actions actions=new Actions(driver); 155 | actions.moveToElement(tmp).click().perform(); 156 | } 157 | public 
static void clickElement(WebDriver driver,WebElement tmp){ 158 | Actions actions=new Actions(driver); 159 | actions.moveToElement(tmp).click().perform(); 160 | } 161 | public static Object execJs(WebDriver driver,String js){ 162 | return ((JavascriptExecutor)driver).executeScript(js); 163 | } 164 | public static void clickByJsCssSelector(WebDriver driver,String cssSelector){ 165 | String js="document.querySelector('"+cssSelector+"').click();"; 166 | ((JavascriptExecutor)driver).executeScript(js); 167 | } 168 | 169 | public static Set<Cookie> getCookies(WebDriver driver){ 170 | return driver.manage().getCookies(); 171 | } 172 | public static void setCookies(WebDriver driver,Set<Cookie> cookies){ 173 | if(cookies==null){ 174 | return; 175 | } 176 | // PhantomJS has a bug when setting cookies through the WebDriver API, so they are injected via JavaScript instead. 177 | StringBuilder sb=new StringBuilder(); 178 | for(Cookie cookie:cookies){ 179 | String js="document.cookie=\""+cookie.getName()+"="+cookie.getValue()+";path="+cookie.getPath()+";domain="+cookie.getDomain()+"\";"; 180 | sb.append(js); 181 | } 182 | ((JavascriptExecutor)driver).executeScript(sb.toString()); 183 | } 184 | 185 | public static String getHttpCookieString(Set<Cookie> cookies){ 186 | if(cookies==null){ 187 | return ""; 188 | } 189 | String httpCookie=""; 190 | int index=0; 191 | for(Cookie c:cookies){ 192 | index++; 193 | if(index==cookies.size()){ 194 | httpCookie+=c.getName()+"="+c.getValue(); 195 | }else{ 196 | httpCookie+=c.getName()+"="+c.getValue()+"; "; 197 | } 198 | } 199 | return httpCookie; 200 | } 201 | 202 | /** 203 | * Reads a CSS attribute of an element; the most common use is checking an element's display state. 204 | * @param driver 205 | * @param cssSelector 206 | * @param attr 207 | * @return 208 | */ 209 | public static Object getCssAttr(WebDriver driver,String cssSelector,String attr){ 210 | InputStream ins=WindowUtil.class.getResourceAsStream("getCssAttr.js"); 211 | String externalJS=""; 212 | try { 213 | externalJS = IOUtils.toString(ins,"UTF-8"); 214 | } catch (IOException e) { 215 | e.printStackTrace(); 216 | } 217 | IOUtils.closeQuietly(ins); 218 | Object res = ((JavascriptExecutor) driver).executeScript(externalJS,cssSelector,attr); 219 | return res; 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.github.xbynet 5 | crawler-parent 6 | 0.3.0 7 | pom 8 | crawler-parent 9 | 10 | A simple and flexible web crawler framework for java.
11 | 12 | https://github.com/xbynet/crawler 13 | 14 | UTF-8 15 | UTF-8 16 | 1.6 17 | 18 | 19 | 20 | xbynet 21 | JiaWei Tao 22 | xbynet@outlook.com 23 | 24 | 25 | 26 | scm:git:git@github.com:xbynet/crawler.git 27 | scm:git:git@github.com:xbynet/crawler.git 28 | git@github.com:xbynet/crawler.git 29 | v${project.version} 30 | 31 | 32 | 33 | MIT License 34 | https://mit-license.org/ 35 | 36 | 37 | 38 | crawler-core 39 | crawler-selenium 40 | crawler-server 41 | 42 | 43 | 44 | 45 | aliyun 46 | 47 | true 48 | 49 | Public Repositories 50 | http://maven.aliyun.com/nexus/content/groups/public/ 51 | 52 | 53 | 54 | 55 | aliyun-plugin 56 | Public Repositories 57 | http://maven.aliyun.com/nexus/content/groups/public/ 58 | 59 | 60 | 61 | 62 | 63 | 64 | junit 65 | junit 66 | 4.12 67 | test 68 | 69 | 70 | cglib 71 | cglib 72 | 73 | 3.2.4 74 | 75 | 76 | commons-io 77 | commons-io 78 | 2.5 79 | 80 | 81 | org.apache.commons 82 | commons-lang3 83 | 3.5 84 | 85 | 86 | org.apache.httpcomponents 87 | httpclient 88 | 4.5.3 89 | 90 | 91 | com.alibaba 92 | fastjson 93 | 1.2.28 94 | 95 | 96 | 97 | ch.qos.logback 98 | logback-classic 99 | 1.2.1 100 | 101 | 102 | ch.qos.logback 103 | logback-core 104 | 1.2.1 105 | 106 | 107 | org.slf4j 108 | slf4j-api 109 | 1.7.22 110 | 111 | 112 | org.slf4j 113 | jcl-over-slf4j 114 | 1.7.22 115 | 116 | 117 | org.jsoup 118 | jsoup 119 | 1.10.2 120 | 121 | 122 | com.jayway.jsonpath 123 | json-path 124 | 2.2.0 125 | 126 | 127 | us.codecraft 128 | xsoup 129 | 0.3.1 130 | 131 | 132 | 133 | 134 | 135 | org.springframework.boot 136 | spring-boot-dependencies 137 | 1.5.4.RELEASE 138 | pom 139 | import 140 | 141 | 142 | org.seleniumhq.selenium 143 | selenium-java 144 | 2.53.1 145 | 146 | 147 | htmlunit-driver 148 | org.seleniumhq.selenium 149 | 150 | 151 | 152 | 153 | com.codeborne 154 | phantomjsdriver 155 | 1.3.0 156 | 157 | 158 | redis.clients 159 | jedis 160 | 2.9.0 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | org.apache.maven.plugins 169 | maven-compiler-plugin 170 | 3.1 171 | 172 | 1.6 173 | 1.6 174 | UTF-8 175 | 176 | 177 | 178 | org.apache.maven.plugins 179 | maven-resources-plugin 180 | 2.6 181 | 182 | UTF-8 183 | 184 | 185 | 186 | org.apache.maven.plugins 187 | maven-jar-plugin 188 | 190 | 191 | 192 | 193 | org.apache.maven.plugins 194 | maven-javadoc-plugin 195 | 2.10.4 196 | 197 | UTF-8 198 | crawler-0.3.0 199 | zh_CN 200 | 201 | 202 | 203 | aggregate 204 | 205 | aggregate 206 | 207 | site 208 | 209 | 210 | attach-javadocs 211 | 212 | jar 213 | 214 | 215 | 216 | 217 | 218 | org.apache.maven.plugins 219 | maven-release-plugin 220 | 2.4.1 221 | 222 | v@{project.version} 223 | true 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | release 232 | 233 | 234 | 235 | 236 | org.apache.maven.plugins 237 | maven-source-plugin 238 | 2.2.1 239 | 240 | 241 | package 242 | 243 | jar-no-fork 244 | 245 | 246 | 247 | 248 | 249 | 250 | org.apache.maven.plugins 251 | maven-javadoc-plugin 252 | 2.9.1 253 | 254 | 255 | package 256 | 257 | jar 258 | 259 | 260 | 261 | 262 | 263 | 264 | org.apache.maven.plugins 265 | maven-gpg-plugin 266 | 1.6 267 | 268 | 269 | verify 270 | 271 | sign 272 | 273 | 274 | 275 | 276 | 277 | org.sonatype.plugins 278 | nexus-staging-maven-plugin 279 | 1.6 280 | true 281 | 282 | osscenter 283 | https://oss.sonatype.org/ 284 | true 285 | 286 | 287 | 288 | 289 | 290 | 291 | osscenter 292 | https://oss.sonatype.org/content/repositories/snapshots/ 293 | 294 | 295 | osscenter 296 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 297 | 298 | 299 | 300 | 301 | 
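<!--
  Usage sketch: assuming the artifactId matches the module directory name crawler-core, a consumer
  project would depend on the core module under the groupId and version declared above roughly like this:
  <dependency>
    <groupId>com.github.xbynet</groupId>
    <artifactId>crawler-core</artifactId>
    <version>0.3.0</version>
  </dependency>
-->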
-------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/AbsDownloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.IOException; 4 | import java.io.UnsupportedEncodingException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.http.HttpHost; 10 | import org.apache.http.NameValuePair; 11 | import org.apache.http.client.ClientProtocolException; 12 | import org.apache.http.client.config.CookieSpecs; 13 | import org.apache.http.client.config.RequestConfig; 14 | import org.apache.http.client.entity.UrlEncodedFormEntity; 15 | import org.apache.http.client.methods.CloseableHttpResponse; 16 | import org.apache.http.client.methods.HttpUriRequest; 17 | import org.apache.http.client.methods.RequestBuilder; 18 | import org.apache.http.client.protocol.HttpClientContext; 19 | import org.apache.http.impl.client.CloseableHttpClient; 20 | import org.apache.http.message.BasicNameValuePair; 21 | import org.apache.http.util.EntityUtils; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | import com.github.xbynet.crawler.IpProxyProvider; 26 | import com.github.xbynet.crawler.Request; 27 | import com.github.xbynet.crawler.RequestAction; 28 | import com.github.xbynet.crawler.Response; 29 | import com.github.xbynet.crawler.Site; 30 | import com.github.xbynet.crawler.Spider; 31 | import com.github.xbynet.crawler.SpiderListener; 32 | import com.github.xbynet.crawler.Const.HttpMethod; 33 | import com.github.xbynet.crawler.utils.CrawlerUtils; 34 | 35 | public abstract class AbsDownloader implements Downloader{ 36 | private Logger log=LoggerFactory.getLogger(AbsDownloader.class); 37 | 38 | private CloseableHttpClient client; 39 | private Spider spider; 40 | 41 | public AbsDownloader(){ 42 | 43 | } 44 | public void init(){ 45 | HttpClientFactory clientFactory=spider.getHttpClientFactory(); 46 | if(clientFactory==null){ 47 | clientFactory=new HttpClientFactory(); 48 | } 49 | this.client=clientFactory.getClient(); 50 | } 51 | protected void doDownload(Request request,Object... 
extras){ 52 | String url=request.getUrl(); 53 | Site site=getSpider().getSite(); 54 | IpProxyProvider ipProxyProvider=getSpider().getIpProvider(); 55 | HttpHost proxy=null; 56 | if(ipProxyProvider!=null){ 57 | proxy=ipProxyProvider.getIp(); 58 | } 59 | 60 | log.debug(getSpider().getName()+",开始请求"+url); 61 | HttpUriRequest httpUriRequest=generateHttpRequest(site, request, proxy); 62 | 63 | Response response=new Response(); 64 | boolean state=cycleRequest(httpUriRequest,request,site,response,extras); 65 | 66 | if(!state){ 67 | log.error("no content crawled for "+request.getUrl()); 68 | notifyListener(false,request,null); 69 | return; 70 | } 71 | addContinueRequest(response); 72 | notifyListener(true,request,null); 73 | //循环遍历所有分块请求 74 | List orderReqList=request.getPartRequest(); 75 | while(orderReqList!=null && orderReqList.size()>0){ 76 | Request req=orderReqList.remove(0); 77 | spider.getScheduler().getDuplicateRemover().isDuplicate(req, spider); 78 | Response resp=new Response(response); 79 | state=cycleRequest(generateHttpRequest(site, req, proxy), req, site, resp, extras); 80 | if(!state){ 81 | log.error("no content crawled for "+req.getUrl()); 82 | notifyListener(false, req, null); 83 | }else{ 84 | notifyListener(true,req,null); 85 | } 86 | addContinueRequest(resp); 87 | } 88 | } 89 | protected void addContinueRequest(Response response){ 90 | List reqlist=response.getContinueReqeusts(); 91 | if(reqlist!=null){ 92 | for(Request req:reqlist){ 93 | spider.getScheduler().push(req, spider); 94 | } 95 | } 96 | } 97 | protected boolean cycleRequest(HttpUriRequest httpUriRequest,Request request,Site site,Response response,Object... extras){ 98 | boolean state=false; 99 | try { 100 | state = doRequest(httpUriRequest, request,site,response,extras); 101 | } catch (Exception e) { 102 | log.error("",e); 103 | } 104 | int retryCount=request.getRetryCount()>=0?request.getRetryCount():site.getRetry(); 105 | int retrySleepTimes=request.getRetrySleepTime()>=0?request.getRetrySleepTime():site.getRetrySleep(); 106 | int retryIndex=1; 107 | while(!state && retryIndex headerEntry : site.getHeaders().entrySet()) { 179 | requestBuilder.setHeader(headerEntry.getKey(), headerEntry.getValue()); 180 | } 181 | } 182 | RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); 183 | if (site != null) { 184 | requestConfigBuilder.setConnectionRequestTimeout(site.getTimeout()) 185 | .setSocketTimeout(site.getTimeout()) 186 | .setConnectTimeout(site.getTimeout()) 187 | .setCookieSpec(CookieSpecs.STANDARD); 188 | } 189 | 190 | if (proxy != null) { 191 | requestConfigBuilder.setProxy(proxy); 192 | } 193 | requestBuilder.setConfig(requestConfigBuilder.build()); 194 | HttpUriRequest httpUriRequest = requestBuilder.build(); 195 | if (request.getHeaders() != null && !request.getHeaders().isEmpty()) { 196 | for (Map.Entry header : request.getHeaders().entrySet()) { 197 | httpUriRequest.setHeader(header.getKey(), header.getValue()); 198 | } 199 | } 200 | return httpUriRequest; 201 | } 202 | private RequestBuilder selectRequestMethod(Request request) { 203 | HttpMethod method = request.getMethod(); 204 | if (method == null || method==HttpMethod.GET) { 205 | return addFormParams(RequestBuilder.get(),request); 206 | } else if (method==HttpMethod.POST) { 207 | return addFormParams(RequestBuilder.post(),request); 208 | } else if (method==HttpMethod.HEAD) { 209 | return addFormParams(RequestBuilder.head(),request); 210 | } 211 | throw new IllegalArgumentException("Illegal HTTP Method " + method); 212 | } 213 | 214 | 
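/** Attaches the request body: an explicit HttpEntity is used as-is for POST requests; otherwise the params map, if any, is URL-encoded into a UTF-8 form entity. */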
private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) { 215 | if (request.getEntity() != null && "POST".equalsIgnoreCase(requestBuilder.getMethod())) { 216 | requestBuilder.setEntity(request.getEntity()); 217 | }else if(request.getParams()!=null){ 218 | List<NameValuePair> nameValuePairs=new ArrayList<NameValuePair>(); 219 | for(String key:request.getParams().keySet()){ 220 | BasicNameValuePair pair=new BasicNameValuePair(key, request.getParams().get(key)); 221 | nameValuePairs.add(pair); 222 | } 223 | try { 224 | requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairs, "UTF-8")); 225 | } catch (UnsupportedEncodingException e) { 226 | log.error("",e); 227 | } 228 | } 229 | return requestBuilder; 230 | } 231 | 232 | public Spider getSpider() { 233 | return spider; 234 | } 235 | 236 | public void setSpider(Spider spider) { 237 | this.spider = spider; 238 | } 239 | 240 | @Override 241 | public void close() throws IOException { 242 | spider=null; 243 | client.close(); 244 | client=null; 245 | } 246 | 247 | @Override 248 | public void download(Request request) { 249 | throw new RuntimeException("not supported!"); 250 | } 251 | 252 | } 253 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Spider.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.io.Closeable; 4 | import java.io.IOException; 5 | import java.util.Date; 6 | import java.util.UUID; 7 | import java.util.concurrent.TimeUnit; 8 | import java.util.concurrent.atomic.AtomicLong; 9 | import java.util.concurrent.locks.Condition; 10 | import java.util.concurrent.locks.ReentrantLock; 11 | 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import com.github.xbynet.crawler.http.DefaultDownloader; 16 | import com.github.xbynet.crawler.http.Downloader; 17 | import com.github.xbynet.crawler.http.FileDownloader; 18 | import com.github.xbynet.crawler.http.HttpClientFactory; 19 | import com.github.xbynet.crawler.scheduler.DefaultScheduler; 20 | import com.github.xbynet.crawler.scheduler.Scheduler; 21 | import com.github.xbynet.crawler.utils.CountableThreadPool; 22 | import com.github.xbynet.crawler.utils.CrawlerUtils; 23 | 24 | public class Spider implements ISpider, Runnable { 25 | private static final Logger log=LoggerFactory.getLogger(Spider.class); 26 | 27 | private String name; 28 | private Site site; 29 | private Scheduler scheduler = new DefaultScheduler(); 30 | private IpProxyProvider ipProvider; 31 | private HttpClientFactory httpClientFactory = new HttpClientFactory(); 32 | private FileDownloader fileDownloader = null; 33 | private Downloader defaultDownloader=null; 34 | private Processor processor; 35 | private SpiderListener spiderListener; 36 | /** Whether to release all resources and terminate once the crawl is complete. */ 37 | private boolean shutdownOnComplete = true; 38 | /** Idle wait time in milliseconds; once exceeded, the spider shuts down automatically. */ 39 | private int idleWaitTime=1*60*1000; 40 | private Date startTime; 41 | private Date endTime; 42 | private AtomicLong processUrlCount=new AtomicLong(0L); 43 | 44 | private ReentrantLock newUrlLock = new ReentrantLock(); 45 | 46 | private Condition newUrlCondition = newUrlLock.newCondition(); 47 | 48 | public enum Status { 49 | NotRun, Running, Stopped, Destroyed 50 | } 51 | 52 | private Status state = Status.NotRun; 53 | private int threadNum = 1; 54 | 55 | private CountableThreadPool pool; 56 | 57 | private Spider() { 58 | this.name = "Spider-" + UUID.randomUUID().toString(); 59
| this.fileDownloader = new FileDownloader(); 60 | this.fileDownloader.setSpider(this); 61 | this.fileDownloader.init(); 62 | this.defaultDownloader=new DefaultDownloader(); 63 | this.defaultDownloader.setSpider(this); 64 | this.defaultDownloader.init(); 65 | } 66 | 67 | 68 | public static class Builder{ 69 | private Spider spider; 70 | private Builder(Spider spider1,Processor p){ 71 | this.spider=spider1; 72 | p.setSpider(spider); 73 | p.setFileDownloader(spider.fileDownloader); 74 | this.spider.processor=p; 75 | } 76 | 77 | public Spider build(){ 78 | return spider; 79 | } 80 | 81 | public Builder urls(String... urls){ 82 | for(String url:urls){ 83 | Request req=new Request(url); 84 | spider.scheduler.push(req, spider); 85 | } 86 | return this; 87 | } 88 | public Builder requests(Request... requestlist){ 89 | for(Request req:requestlist){ 90 | spider.scheduler.push(req, spider); 91 | } 92 | return this; 93 | } 94 | public Builder site(Site site) { 95 | spider.site = site; 96 | return this; 97 | } 98 | public Builder scheduler(Scheduler scheduler) { 99 | Scheduler old=spider.scheduler; 100 | spider.scheduler = scheduler; 101 | Request req=null; 102 | while((req=old.poll(spider))!=null){ 103 | spider.scheduler.push(req, spider); 104 | } 105 | return this; 106 | } 107 | public Builder name(String name) { 108 | spider.name = name; 109 | return this; 110 | } 111 | public Builder ipProvider(IpProxyProvider ipProvider) { 112 | spider.ipProvider = ipProvider; 113 | return this; 114 | } 115 | public Builder httpClientFactory(HttpClientFactory httpClientFactory) { 116 | spider.httpClientFactory = httpClientFactory; 117 | return this; 118 | } 119 | public Builder fileDownloader(FileDownloader fileDownloader1) { 120 | fileDownloader1.setSpider(spider); 121 | fileDownloader1.init(); 122 | spider.fileDownloader=fileDownloader1; 123 | return this; 124 | } 125 | public Builder listener(SpiderListener spiderListener) { 126 | spider.spiderListener = spiderListener; 127 | return this; 128 | } 129 | public Builder threadNum(int threadNum) { 130 | spider.threadNum = threadNum; 131 | return this; 132 | } 133 | public Builder pool(CountableThreadPool pool) { 134 | spider.pool = pool; 135 | return this; 136 | } 137 | public Builder shutdownOnComplete(boolean shutdownOnComplete) { 138 | spider.shutdownOnComplete = shutdownOnComplete; 139 | return this; 140 | } 141 | 142 | public Builder defaultDownloader(Downloader downloader) { 143 | downloader.setSpider(spider); 144 | downloader.init(); 145 | spider.defaultDownloader=downloader; 146 | return this; 147 | } 148 | 149 | } 150 | public static Builder builder(Processor p) { 151 | return new Builder(new Spider(),p); 152 | } 153 | 154 | public String getName() { 155 | return this.name; 156 | } 157 | 158 | 159 | public Site getSite() { 160 | return site; 161 | } 162 | 163 | 164 | public Scheduler getScheduler() { 165 | return scheduler; 166 | } 167 | 168 | 169 | public IpProxyProvider getIpProvider() { 170 | return ipProvider; 171 | } 172 | 173 | public HttpClientFactory getHttpClientFactory() { 174 | return httpClientFactory; 175 | } 176 | 177 | 178 | public FileDownloader getFileDownloader() { 179 | return fileDownloader; 180 | } 181 | 182 | 183 | public Processor getProcessor() { 184 | return processor; 185 | } 186 | 187 | public SpiderListener getSpiderListener() { 188 | return spiderListener; 189 | } 190 | 191 | public int getThreadNum() { 192 | return threadNum; 193 | } 194 | 195 | public void run() { 196 | setStatus(Status.Running); 197 | init(); 198 | 
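// Main crawl loop: poll the scheduler for the next Request; when it runs dry and no worker threads are alive, wait up to idleWaitTime, poll once more and exit if shutdownOnComplete is set; otherwise submit the Request to the thread pool and signal waiting threads after it has been processed.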
log.debug("Spider "+getName()+" start!"); 199 | System.out.println("--------------------------------------------------------------"); 200 | System.out.println("### 不要问我为什么,你要记住,在你最落寞的时候,有个人对你说过,你可以的!###"); 201 | System.out.println("### 为什么要写爬虫呢?因为我们爬的是寂寞;因为泡妹子需要笑话;因为找工作需要筛选职位;因为老板要求;也许因为要装x才是正解 ###"); 202 | System.out.println("--------------------------------------------------------------"); 203 | while (!Thread.currentThread().isInterrupted() && state==Status.Running) { 204 | Request request = scheduler.poll(this); 205 | if (request == null) { 206 | if (pool.getThreadAlive() == 0) { 207 | CrawlerUtils.sleep(idleWaitTime); 208 | request = scheduler.poll(this); 209 | if(request==null && shutdownOnComplete){ 210 | break; 211 | } 212 | } 213 | // wait until new url added 214 | waitNewUrl(); 215 | } else { 216 | final Request tmpReq=request; 217 | pool.execute(new Runnable() { 218 | @Override 219 | public void run() { 220 | try { 221 | defaultDownloader.download(tmpReq); 222 | } catch (Exception e) { 223 | log.error("process request " + tmpReq + " error", e); 224 | } finally { 225 | processUrlCount.incrementAndGet(); 226 | signalNewUrl(); 227 | } 228 | } 229 | }); 230 | } 231 | } 232 | setStatus(Status.Stopped); 233 | if(shutdownOnComplete){ 234 | shutdown(); 235 | } 236 | 237 | } 238 | private void waitNewUrl() { 239 | newUrlLock.lock(); 240 | try { 241 | //double check 242 | if (pool.getThreadAlive() == 0 && shutdownOnComplete) { 243 | return; 244 | } 245 | newUrlCondition.await(idleWaitTime, TimeUnit.MILLISECONDS); 246 | } catch (InterruptedException e) { 247 | log.warn("waitNewUrl - interrupted, error {}", e); 248 | } finally { 249 | newUrlLock.unlock(); 250 | } 251 | } 252 | 253 | private void signalNewUrl() { 254 | try { 255 | newUrlLock.lock(); 256 | newUrlCondition.signalAll(); 257 | } finally { 258 | newUrlLock.unlock(); 259 | } 260 | } 261 | public void runAsync() { 262 | Thread thread = new Thread(this); 263 | thread.setDaemon(false); 264 | thread.start(); 265 | } 266 | 267 | public void stop() { 268 | setStatus(Status.Stopped); 269 | } 270 | 271 | public synchronized void shutdown() { 272 | if(state==Status.Destroyed || state==Status.NotRun){ 273 | throw new IllegalStateException("Spider has never start or already destroyed"); 274 | } 275 | setStatus(Status.Destroyed); 276 | endTime=new Date(); 277 | if(pool!=null){ 278 | pool.shutdown(); 279 | try { 280 | pool.awaitTermination(idleWaitTime<60000?60000:idleWaitTime, TimeUnit.MILLISECONDS); 281 | } catch (InterruptedException e) { 282 | log.warn("thread pool termination interrupted",e); 283 | } 284 | } 285 | closeQuietly(defaultDownloader); 286 | closeQuietly(fileDownloader); 287 | closeQuietly(ipProvider); 288 | closeQuietly(ipProvider); 289 | 290 | } 291 | private void closeQuietly(Closeable clo){ 292 | if(clo!=null){ 293 | try { 294 | clo.close(); 295 | } catch (IOException e) { 296 | log.error("", e); 297 | } 298 | } 299 | } 300 | 301 | protected synchronized void init() { 302 | if (pool == null) { 303 | if (state != Status.Destroyed) { 304 | pool = new CountableThreadPool(threadNum); 305 | } else { 306 | throw new IllegalStateException("current spider is destroyed!"); 307 | } 308 | } 309 | startTime=new Date(); 310 | } 311 | 312 | public CountableThreadPool getPool() { 313 | return pool; 314 | } 315 | 316 | 317 | 318 | public boolean isShutdownOnComplete() { 319 | return shutdownOnComplete; 320 | } 321 | 322 | public Status getState() { 323 | return state; 324 | } 325 | 326 | private synchronized void setStatus(Status s) { 327 
| state = s; 328 | } 329 | 330 | public boolean isRunning() { 331 | return state == Status.Running; 332 | } 333 | 334 | public boolean isStopped() { 335 | return state == Status.Stopped; 336 | } 337 | 338 | public boolean isDestroyed() { 339 | return state == Status.Destroyed; 340 | } 341 | 342 | public Date getStartTime() { 343 | return startTime; 344 | } 345 | 346 | private void setStartTime(Date startTime) { 347 | this.startTime = startTime; 348 | } 349 | 350 | public Date getEndTime() { 351 | return endTime; 352 | } 353 | 354 | private void setEndTime(Date endTime) { 355 | this.endTime = endTime; 356 | } 357 | 358 | public Downloader getDefaultDownloader() { 359 | return defaultDownloader; 360 | } 361 | 362 | public AtomicLong getProcessUrlCount() { 363 | return processUrlCount; 364 | } 365 | /** 366 | * 是否处于空闲状态 367 | */ 368 | public boolean isIdle(){ 369 | return pool.getThreadAlive() == 0; 370 | } 371 | } 372 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/js/bootstrap.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap v3.3.4 (http://getbootstrap.com) 3 | * Copyright 2011-2015 Twitter, Inc. 4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | if("undefined"==typeof jQuery)throw new Error("Bootstrap's JavaScript requires jQuery");+function(a){"use strict";var b=a.fn.jquery.split(" ")[0].split(".");if(b[0]<2&&b[1]<9||1==b[0]&&9==b[1]&&b[2]<1)throw new Error("Bootstrap's JavaScript requires jQuery version 1.9.1 or higher")}(jQuery),+function(a){"use strict";function b(){var a=document.createElement("bootstrap"),b={WebkitTransition:"webkitTransitionEnd",MozTransition:"transitionend",OTransition:"oTransitionEnd otransitionend",transition:"transitionend"};for(var c in b)if(void 0!==a.style[c])return{end:b[c]};return!1}a.fn.emulateTransitionEnd=function(b){var c=!1,d=this;a(this).one("bsTransitionEnd",function(){c=!0});var e=function(){c||a(d).trigger(a.support.transition.end)};return setTimeout(e,b),this},a(function(){a.support.transition=b(),a.support.transition&&(a.event.special.bsTransitionEnd={bindType:a.support.transition.end,delegateType:a.support.transition.end,handle:function(b){return a(b.target).is(this)?b.handleObj.handler.apply(this,arguments):void 0}})})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var c=a(this),e=c.data("bs.alert");e||c.data("bs.alert",e=new d(this)),"string"==typeof b&&e[b].call(c)})}var c='[data-dismiss="alert"]',d=function(b){a(b).on("click",c,this.close)};d.VERSION="3.3.4",d.TRANSITION_DURATION=150,d.prototype.close=function(b){function c(){g.detach().trigger("closed.bs.alert").remove()}var e=a(this),f=e.attr("data-target");f||(f=e.attr("href"),f=f&&f.replace(/.*(?=#[^\s]*$)/,""));var g=a(f);b&&b.preventDefault(),g.length||(g=e.closest(".alert")),g.trigger(b=a.Event("close.bs.alert")),b.isDefaultPrevented()||(g.removeClass("in"),a.support.transition&&g.hasClass("fade")?g.one("bsTransitionEnd",c).emulateTransitionEnd(d.TRANSITION_DURATION):c())};var e=a.fn.alert;a.fn.alert=b,a.fn.alert.Constructor=d,a.fn.alert.noConflict=function(){return a.fn.alert=e,this},a(document).on("click.bs.alert.data-api",c,d.prototype.close)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.button"),f="object"==typeof b&&b;e||d.data("bs.button",e=new c(this,f)),"toggle"==b?e.toggle():b&&e.setState(b)})}var 
c=function(b,d){this.$element=a(b),this.options=a.extend({},c.DEFAULTS,d),this.isLoading=!1};c.VERSION="3.3.4",c.DEFAULTS={loadingText:"loading..."},c.prototype.setState=function(b){var c="disabled",d=this.$element,e=d.is("input")?"val":"html",f=d.data();b+="Text",null==f.resetText&&d.data("resetText",d[e]()),setTimeout(a.proxy(function(){d[e](null==f[b]?this.options[b]:f[b]),"loadingText"==b?(this.isLoading=!0,d.addClass(c).attr(c,c)):this.isLoading&&(this.isLoading=!1,d.removeClass(c).removeAttr(c))},this),0)},c.prototype.toggle=function(){var a=!0,b=this.$element.closest('[data-toggle="buttons"]');if(b.length){var c=this.$element.find("input");"radio"==c.prop("type")&&(c.prop("checked")&&this.$element.hasClass("active")?a=!1:b.find(".active").removeClass("active")),a&&c.prop("checked",!this.$element.hasClass("active")).trigger("change")}else this.$element.attr("aria-pressed",!this.$element.hasClass("active"));a&&this.$element.toggleClass("active")};var d=a.fn.button;a.fn.button=b,a.fn.button.Constructor=c,a.fn.button.noConflict=function(){return a.fn.button=d,this},a(document).on("click.bs.button.data-api",'[data-toggle^="button"]',function(c){var d=a(c.target);d.hasClass("btn")||(d=d.closest(".btn")),b.call(d,"toggle"),c.preventDefault()}).on("focus.bs.button.data-api blur.bs.button.data-api",'[data-toggle^="button"]',function(b){a(b.target).closest(".btn").toggleClass("focus",/^focus(in)?$/.test(b.type))})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.carousel"),f=a.extend({},c.DEFAULTS,d.data(),"object"==typeof b&&b),g="string"==typeof b?b:f.slide;e||d.data("bs.carousel",e=new c(this,f)),"number"==typeof b?e.to(b):g?e[g]():f.interval&&e.pause().cycle()})}var c=function(b,c){this.$element=a(b),this.$indicators=this.$element.find(".carousel-indicators"),this.options=c,this.paused=null,this.sliding=null,this.interval=null,this.$active=null,this.$items=null,this.options.keyboard&&this.$element.on("keydown.bs.carousel",a.proxy(this.keydown,this)),"hover"==this.options.pause&&!("ontouchstart"in document.documentElement)&&this.$element.on("mouseenter.bs.carousel",a.proxy(this.pause,this)).on("mouseleave.bs.carousel",a.proxy(this.cycle,this))};c.VERSION="3.3.4",c.TRANSITION_DURATION=600,c.DEFAULTS={interval:5e3,pause:"hover",wrap:!0,keyboard:!0},c.prototype.keydown=function(a){if(!/input|textarea/i.test(a.target.tagName)){switch(a.which){case 37:this.prev();break;case 39:this.next();break;default:return}a.preventDefault()}},c.prototype.cycle=function(b){return b||(this.paused=!1),this.interval&&clearInterval(this.interval),this.options.interval&&!this.paused&&(this.interval=setInterval(a.proxy(this.next,this),this.options.interval)),this},c.prototype.getItemIndex=function(a){return this.$items=a.parent().children(".item"),this.$items.index(a||this.$active)},c.prototype.getItemForDirection=function(a,b){var c=this.getItemIndex(b),d="prev"==a&&0===c||"next"==a&&c==this.$items.length-1;if(d&&!this.options.wrap)return b;var e="prev"==a?-1:1,f=(c+e)%this.$items.length;return this.$items.eq(f)},c.prototype.to=function(a){var b=this,c=this.getItemIndex(this.$active=this.$element.find(".item.active"));return a>this.$items.length-1||0>a?void 0:this.sliding?this.$element.one("slid.bs.carousel",function(){b.to(a)}):c==a?this.pause().cycle():this.slide(a>c?"next":"prev",this.$items.eq(a))},c.prototype.pause=function(b){return b||(this.paused=!0),this.$element.find(".next, 
.prev").length&&a.support.transition&&(this.$element.trigger(a.support.transition.end),this.cycle(!0)),this.interval=clearInterval(this.interval),this},c.prototype.next=function(){return this.sliding?void 0:this.slide("next")},c.prototype.prev=function(){return this.sliding?void 0:this.slide("prev")},c.prototype.slide=function(b,d){var e=this.$element.find(".item.active"),f=d||this.getItemForDirection(b,e),g=this.interval,h="next"==b?"left":"right",i=this;if(f.hasClass("active"))return this.sliding=!1;var j=f[0],k=a.Event("slide.bs.carousel",{relatedTarget:j,direction:h});if(this.$element.trigger(k),!k.isDefaultPrevented()){if(this.sliding=!0,g&&this.pause(),this.$indicators.length){this.$indicators.find(".active").removeClass("active");var l=a(this.$indicators.children()[this.getItemIndex(f)]);l&&l.addClass("active")}var m=a.Event("slid.bs.carousel",{relatedTarget:j,direction:h});return a.support.transition&&this.$element.hasClass("slide")?(f.addClass(b),f[0].offsetWidth,e.addClass(h),f.addClass(h),e.one("bsTransitionEnd",function(){f.removeClass([b,h].join(" ")).addClass("active"),e.removeClass(["active",h].join(" ")),i.sliding=!1,setTimeout(function(){i.$element.trigger(m)},0)}).emulateTransitionEnd(c.TRANSITION_DURATION)):(e.removeClass("active"),f.addClass("active"),this.sliding=!1,this.$element.trigger(m)),g&&this.cycle(),this}};var d=a.fn.carousel;a.fn.carousel=b,a.fn.carousel.Constructor=c,a.fn.carousel.noConflict=function(){return a.fn.carousel=d,this};var e=function(c){var d,e=a(this),f=a(e.attr("data-target")||(d=e.attr("href"))&&d.replace(/.*(?=#[^\s]+$)/,""));if(f.hasClass("carousel")){var g=a.extend({},f.data(),e.data()),h=e.attr("data-slide-to");h&&(g.interval=!1),b.call(f,g),h&&f.data("bs.carousel").to(h),c.preventDefault()}};a(document).on("click.bs.carousel.data-api","[data-slide]",e).on("click.bs.carousel.data-api","[data-slide-to]",e),a(window).on("load",function(){a('[data-ride="carousel"]').each(function(){var c=a(this);b.call(c,c.data())})})}(jQuery),+function(a){"use strict";function b(b){var c,d=b.attr("data-target")||(c=b.attr("href"))&&c.replace(/.*(?=#[^\s]+$)/,"");return a(d)}function c(b){return this.each(function(){var c=a(this),e=c.data("bs.collapse"),f=a.extend({},d.DEFAULTS,c.data(),"object"==typeof b&&b);!e&&f.toggle&&/show|hide/.test(b)&&(f.toggle=!1),e||c.data("bs.collapse",e=new d(this,f)),"string"==typeof b&&e[b]()})}var d=function(b,c){this.$element=a(b),this.options=a.extend({},d.DEFAULTS,c),this.$trigger=a('[data-toggle="collapse"][href="#'+b.id+'"],[data-toggle="collapse"][data-target="#'+b.id+'"]'),this.transitioning=null,this.options.parent?this.$parent=this.getParent():this.addAriaAndCollapsedClass(this.$element,this.$trigger),this.options.toggle&&this.toggle()};d.VERSION="3.3.4",d.TRANSITION_DURATION=350,d.DEFAULTS={toggle:!0},d.prototype.dimension=function(){var a=this.$element.hasClass("width");return a?"width":"height"},d.prototype.show=function(){if(!this.transitioning&&!this.$element.hasClass("in")){var b,e=this.$parent&&this.$parent.children(".panel").children(".in, .collapsing");if(!(e&&e.length&&(b=e.data("bs.collapse"),b&&b.transitioning))){var f=a.Event("show.bs.collapse");if(this.$element.trigger(f),!f.isDefaultPrevented()){e&&e.length&&(c.call(e,"hide"),b||e.data("bs.collapse",null));var g=this.dimension();this.$element.removeClass("collapse").addClass("collapsing")[g](0).attr("aria-expanded",!0),this.$trigger.removeClass("collapsed").attr("aria-expanded",!0),this.transitioning=1;var 
h=function(){this.$element.removeClass("collapsing").addClass("collapse in")[g](""),this.transitioning=0,this.$element.trigger("shown.bs.collapse")};if(!a.support.transition)return h.call(this);var i=a.camelCase(["scroll",g].join("-"));this.$element.one("bsTransitionEnd",a.proxy(h,this)).emulateTransitionEnd(d.TRANSITION_DURATION)[g](this.$element[0][i])}}}},d.prototype.hide=function(){if(!this.transitioning&&this.$element.hasClass("in")){var b=a.Event("hide.bs.collapse");if(this.$element.trigger(b),!b.isDefaultPrevented()){var c=this.dimension();this.$element[c](this.$element[c]())[0].offsetHeight,this.$element.addClass("collapsing").removeClass("collapse in").attr("aria-expanded",!1),this.$trigger.addClass("collapsed").attr("aria-expanded",!1),this.transitioning=1;var e=function(){this.transitioning=0,this.$element.removeClass("collapsing").addClass("collapse").trigger("hidden.bs.collapse")};return a.support.transition?void this.$element[c](0).one("bsTransitionEnd",a.proxy(e,this)).emulateTransitionEnd(d.TRANSITION_DURATION):e.call(this)}}},d.prototype.toggle=function(){this[this.$element.hasClass("in")?"hide":"show"]()},d.prototype.getParent=function(){return a(this.options.parent).find('[data-toggle="collapse"][data-parent="'+this.options.parent+'"]').each(a.proxy(function(c,d){var e=a(d);this.addAriaAndCollapsedClass(b(e),e)},this)).end()},d.prototype.addAriaAndCollapsedClass=function(a,b){var c=a.hasClass("in");a.attr("aria-expanded",c),b.toggleClass("collapsed",!c).attr("aria-expanded",c)};var e=a.fn.collapse;a.fn.collapse=c,a.fn.collapse.Constructor=d,a.fn.collapse.noConflict=function(){return a.fn.collapse=e,this},a(document).on("click.bs.collapse.data-api",'[data-toggle="collapse"]',function(d){var e=a(this);e.attr("data-target")||d.preventDefault();var f=b(e),g=f.data("bs.collapse"),h=g?"toggle":e.data();c.call(f,h)})}(jQuery),+function(a){"use strict";function b(b){b&&3===b.which||(a(e).remove(),a(f).each(function(){var d=a(this),e=c(d),f={relatedTarget:this};e.hasClass("open")&&(e.trigger(b=a.Event("hide.bs.dropdown",f)),b.isDefaultPrevented()||(d.attr("aria-expanded","false"),e.removeClass("open").trigger("hidden.bs.dropdown",f)))}))}function c(b){var c=b.attr("data-target");c||(c=b.attr("href"),c=c&&/#[A-Za-z]/.test(c)&&c.replace(/.*(?=#[^\s]*$)/,""));var d=c&&a(c);return d&&d.length?d:b.parent()}function d(b){return this.each(function(){var c=a(this),d=c.data("bs.dropdown");d||c.data("bs.dropdown",d=new g(this)),"string"==typeof b&&d[b].call(c)})}var e=".dropdown-backdrop",f='[data-toggle="dropdown"]',g=function(b){a(b).on("click.bs.dropdown",this.toggle)};g.VERSION="3.3.4",g.prototype.toggle=function(d){var e=a(this);if(!e.is(".disabled, :disabled")){var f=c(e),g=f.hasClass("open");if(b(),!g){"ontouchstart"in document.documentElement&&!f.closest(".navbar-nav").length&&a('',trigger:"hover focus",title:"",delay:0,html:!1,container:!1,viewport:{selector:"body",padding:0}},c.prototype.init=function(b,c,d){if(this.enabled=!0,this.type=b,this.$element=a(c),this.options=this.getOptions(d),this.$viewport=this.options.viewport&&a(this.options.viewport.selector||this.options.viewport),this.$element[0]instanceof document.constructor&&!this.options.selector)throw new Error("`selector` option must be specified when initializing "+this.type+" on the window.document object!");for(var e=this.options.trigger.split(" "),f=e.length;f--;){var g=e[f];if("click"==g)this.$element.on("click."+this.type,this.options.selector,a.proxy(this.toggle,this));else if("manual"!=g){var 
h="hover"==g?"mouseenter":"focusin",i="hover"==g?"mouseleave":"focusout";this.$element.on(h+"."+this.type,this.options.selector,a.proxy(this.enter,this)),this.$element.on(i+"."+this.type,this.options.selector,a.proxy(this.leave,this))}}this.options.selector?this._options=a.extend({},this.options,{trigger:"manual",selector:""}):this.fixTitle()},c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.getOptions=function(b){return b=a.extend({},this.getDefaults(),this.$element.data(),b),b.delay&&"number"==typeof b.delay&&(b.delay={show:b.delay,hide:b.delay}),b},c.prototype.getDelegateOptions=function(){var b={},c=this.getDefaults();return this._options&&a.each(this._options,function(a,d){c[a]!=d&&(b[a]=d)}),b},c.prototype.enter=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);return c&&c.$tip&&c.$tip.is(":visible")?void(c.hoverState="in"):(c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),clearTimeout(c.timeout),c.hoverState="in",c.options.delay&&c.options.delay.show?void(c.timeout=setTimeout(function(){"in"==c.hoverState&&c.show()},c.options.delay.show)):c.show())},c.prototype.leave=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);return c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),clearTimeout(c.timeout),c.hoverState="out",c.options.delay&&c.options.delay.hide?void(c.timeout=setTimeout(function(){"out"==c.hoverState&&c.hide()},c.options.delay.hide)):c.hide()},c.prototype.show=function(){var b=a.Event("show.bs."+this.type);if(this.hasContent()&&this.enabled){this.$element.trigger(b);var d=a.contains(this.$element[0].ownerDocument.documentElement,this.$element[0]);if(b.isDefaultPrevented()||!d)return;var e=this,f=this.tip(),g=this.getUID(this.type);this.setContent(),f.attr("id",g),this.$element.attr("aria-describedby",g),this.options.animation&&f.addClass("fade");var h="function"==typeof this.options.placement?this.options.placement.call(this,f[0],this.$element[0]):this.options.placement,i=/\s?auto?\s?/i,j=i.test(h);j&&(h=h.replace(i,"")||"top"),f.detach().css({top:0,left:0,display:"block"}).addClass(h).data("bs."+this.type,this),this.options.container?f.appendTo(this.options.container):f.insertAfter(this.$element);var k=this.getPosition(),l=f[0].offsetWidth,m=f[0].offsetHeight;if(j){var n=h,o=this.options.container?a(this.options.container):this.$element.parent(),p=this.getPosition(o);h="bottom"==h&&k.bottom+m>p.bottom?"top":"top"==h&&k.top-mp.width?"left":"left"==h&&k.left-lg.top+g.height&&(e.top=g.top+g.height-i)}else{var j=b.left-f,k=b.left+f+c;jg.width&&(e.left=g.left+g.width-k)}return e},c.prototype.getTitle=function(){var a,b=this.$element,c=this.options;return a=b.attr("data-original-title")||("function"==typeof c.title?c.title.call(b[0]):c.title)},c.prototype.getUID=function(a){do a+=~~(1e6*Math.random());while(document.getElementById(a));return a},c.prototype.tip=function(){return this.$tip=this.$tip||a(this.options.template)},c.prototype.arrow=function(){return this.$arrow=this.$arrow||this.tip().find(".tooltip-arrow")},c.prototype.enable=function(){this.enabled=!0},c.prototype.disable=function(){this.enabled=!1},c.prototype.toggleEnabled=function(){this.enabled=!this.enabled},c.prototype.toggle=function(b){var c=this;b&&(c=a(b.currentTarget).data("bs."+this.type),c||(c=new 
this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c))),c.tip().hasClass("in")?c.leave(c):c.enter(c)},c.prototype.destroy=function(){var a=this;clearTimeout(this.timeout),this.hide(function(){a.$element.off("."+a.type).removeData("bs."+a.type)})};var d=a.fn.tooltip;a.fn.tooltip=b,a.fn.tooltip.Constructor=c,a.fn.tooltip.noConflict=function(){return a.fn.tooltip=d,this}}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.popover"),f="object"==typeof b&&b;(e||!/destroy|hide/.test(b))&&(e||d.data("bs.popover",e=new c(this,f)),"string"==typeof b&&e[b]())})}var c=function(a,b){this.init("popover",a,b)};if(!a.fn.tooltip)throw new Error("Popover requires tooltip.js");c.VERSION="3.3.4",c.DEFAULTS=a.extend({},a.fn.tooltip.Constructor.DEFAULTS,{placement:"right",trigger:"click",content:"",template:''}),c.prototype=a.extend({},a.fn.tooltip.Constructor.prototype),c.prototype.constructor=c,c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.setContent=function(){var a=this.tip(),b=this.getTitle(),c=this.getContent();a.find(".popover-title")[this.options.html?"html":"text"](b),a.find(".popover-content").children().detach().end()[this.options.html?"string"==typeof c?"html":"append":"text"](c),a.removeClass("fade top bottom left right in"),a.find(".popover-title").html()||a.find(".popover-title").hide()},c.prototype.hasContent=function(){return this.getTitle()||this.getContent()},c.prototype.getContent=function(){var a=this.$element,b=this.options;return a.attr("data-content")||("function"==typeof b.content?b.content.call(a[0]):b.content)},c.prototype.arrow=function(){return this.$arrow=this.$arrow||this.tip().find(".arrow")};var d=a.fn.popover;a.fn.popover=b,a.fn.popover.Constructor=c,a.fn.popover.noConflict=function(){return a.fn.popover=d,this}}(jQuery),+function(a){"use strict";function b(c,d){this.$body=a(document.body),this.$scrollElement=a(a(c).is(document.body)?window:c),this.options=a.extend({},b.DEFAULTS,d),this.selector=(this.options.target||"")+" .nav li > a",this.offsets=[],this.targets=[],this.activeTarget=null,this.scrollHeight=0,this.$scrollElement.on("scroll.bs.scrollspy",a.proxy(this.process,this)),this.refresh(),this.process()}function c(c){return this.each(function(){var d=a(this),e=d.data("bs.scrollspy"),f="object"==typeof c&&c;e||d.data("bs.scrollspy",e=new b(this,f)),"string"==typeof c&&e[c]()})}b.VERSION="3.3.4",b.DEFAULTS={offset:10},b.prototype.getScrollHeight=function(){return this.$scrollElement[0].scrollHeight||Math.max(this.$body[0].scrollHeight,document.documentElement.scrollHeight)},b.prototype.refresh=function(){var b=this,c="offset",d=0;this.offsets=[],this.targets=[],this.scrollHeight=this.getScrollHeight(),a.isWindow(this.$scrollElement[0])||(c="position",d=this.$scrollElement.scrollTop()),this.$body.find(this.selector).map(function(){var b=a(this),e=b.data("target")||b.attr("href"),f=/^#./.test(e)&&a(e);return f&&f.length&&f.is(":visible")&&[[f[c]().top+d,e]]||null}).sort(function(a,b){return a[0]-b[0]}).each(function(){b.offsets.push(this[0]),b.targets.push(this[1])})},b.prototype.process=function(){var a,b=this.$scrollElement.scrollTop()+this.options.offset,c=this.getScrollHeight(),d=this.options.offset+c-this.$scrollElement.height(),e=this.offsets,f=this.targets,g=this.activeTarget;if(this.scrollHeight!=c&&this.refresh(),b>=d)return g!=(a=f[f.length-1])&&this.activate(a);if(g&&b=e[a]&&(void 0===e[a+1]||b .dropdown-menu > 
.active").removeClass("active").end().find('[data-toggle="tab"]').attr("aria-expanded",!1),b.addClass("active").find('[data-toggle="tab"]').attr("aria-expanded",!0),h?(b[0].offsetWidth,b.addClass("in")):b.removeClass("fade"),b.parent(".dropdown-menu").length&&b.closest("li.dropdown").addClass("active").end().find('[data-toggle="tab"]').attr("aria-expanded",!0),e&&e()}var g=d.find("> .active"),h=e&&a.support.transition&&(g.length&&g.hasClass("fade")||!!d.find("> .fade").length);g.length&&h?g.one("bsTransitionEnd",f).emulateTransitionEnd(c.TRANSITION_DURATION):f(),g.removeClass("in")};var d=a.fn.tab;a.fn.tab=b,a.fn.tab.Constructor=c,a.fn.tab.noConflict=function(){return a.fn.tab=d,this};var e=function(c){c.preventDefault(),b.call(a(this),"show")};a(document).on("click.bs.tab.data-api",'[data-toggle="tab"]',e).on("click.bs.tab.data-api",'[data-toggle="pill"]',e)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.affix"),f="object"==typeof b&&b;e||d.data("bs.affix",e=new c(this,f)),"string"==typeof b&&e[b]()})}var c=function(b,d){this.options=a.extend({},c.DEFAULTS,d),this.$target=a(this.options.target).on("scroll.bs.affix.data-api",a.proxy(this.checkPosition,this)).on("click.bs.affix.data-api",a.proxy(this.checkPositionWithEventLoop,this)),this.$element=a(b),this.affixed=null,this.unpin=null,this.pinnedOffset=null,this.checkPosition()};c.VERSION="3.3.4",c.RESET="affix affix-top affix-bottom",c.DEFAULTS={offset:0,target:window},c.prototype.getState=function(a,b,c,d){var e=this.$target.scrollTop(),f=this.$element.offset(),g=this.$target.height();if(null!=c&&"top"==this.affixed)return c>e?"top":!1;if("bottom"==this.affixed)return null!=c?e+this.unpin<=f.top?!1:"bottom":a-d>=e+g?!1:"bottom";var h=null==this.affixed,i=h?e:f.top,j=h?g:b;return null!=c&&c>=e?"top":null!=d&&i+j>=a-d?"bottom":!1},c.prototype.getPinnedOffset=function(){if(this.pinnedOffset)return this.pinnedOffset;this.$element.removeClass(c.RESET).addClass("affix");var a=this.$target.scrollTop(),b=this.$element.offset();return this.pinnedOffset=b.top-a},c.prototype.checkPositionWithEventLoop=function(){setTimeout(a.proxy(this.checkPosition,this),1)},c.prototype.checkPosition=function(){if(this.$element.is(":visible")){var b=this.$element.height(),d=this.options.offset,e=d.top,f=d.bottom,g=a(document.body).height();"object"!=typeof d&&(f=e=d),"function"==typeof e&&(e=d.top(this.$element)),"function"==typeof f&&(f=d.bottom(this.$element));var h=this.getState(g,b,e,f);if(this.affixed!=h){null!=this.unpin&&this.$element.css("top","");var i="affix"+(h?"-"+h:""),j=a.Event(i+".bs.affix");if(this.$element.trigger(j),j.isDefaultPrevented())return;this.affixed=h,this.unpin="bottom"==h?this.getPinnedOffset():null,this.$element.removeClass(c.RESET).addClass(i).trigger(i.replace("affix","affixed")+".bs.affix")}"bottom"==h&&this.$element.offset({top:g-b-f})}};var d=a.fn.affix;a.fn.affix=b,a.fn.affix.Constructor=c,a.fn.affix.noConflict=function(){return a.fn.affix=d,this},a(window).on("load",function(){a('[data-spy="affix"]').each(function(){var c=a(this),d=c.data();d.offset=d.offset||{},null!=d.offsetBottom&&(d.offset.bottom=d.offsetBottom),null!=d.offsetTop&&(d.offset.top=d.offsetTop),b.call(c,d)})})}(jQuery); --------------------------------------------------------------------------------