├── docs └── tutorial.md ├── crawler-server ├── .gitignore ├── src │ └── main │ │ ├── webapp │ │ ├── META-INF │ │ │ └── context.xml │ │ ├── index.jsp │ │ ├── fonts │ │ │ ├── glyphicons-halflings-regular.eot │ │ │ ├── glyphicons-halflings-regular.ttf │ │ │ ├── glyphicons-halflings-regular.woff │ │ │ └── glyphicons-halflings-regular.woff2 │ │ ├── WEB-INF │ │ │ └── web.xml │ │ ├── js │ │ │ ├── spider-list.js │ │ │ └── bootstrap.min.js │ │ └── jsp │ │ │ ├── new-employee.jsp │ │ │ └── spider-list.jsp │ │ └── java │ │ └── com │ │ └── github │ │ └── xbynet │ │ └── crawler │ │ └── server │ │ ├── Main.java │ │ ├── HelloServlet.java │ │ ├── monitor │ │ ├── SpiderManager.java │ │ └── MonitorServlet.java │ │ └── demo │ │ └── GithubCrawler.java └── pom.xml ├── crawler-core ├── src │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── github │ │ │ └── xbynet │ │ │ └── crawler │ │ │ ├── parser │ │ │ ├── Parser.java │ │ │ ├── JsonPathParser.java │ │ │ ├── XpathParser.java │ │ │ └── JsoupParser.java │ │ │ ├── ISpider.java │ │ │ ├── SpiderListener.java │ │ │ ├── Const.java │ │ │ ├── http │ │ │ ├── Downloader.java │ │ │ ├── FileDownloader.java │ │ │ ├── CustomRedirectStrategy.java │ │ │ ├── DefaultDownloader.java │ │ │ ├── HttpClientFactory.java │ │ │ └── AbsDownloader.java │ │ │ ├── annotation │ │ │ └── Nullable.java │ │ │ ├── scheduler │ │ │ ├── DuplicateRemover.java │ │ │ ├── Scheduler.java │ │ │ ├── DefaultScheduler.java │ │ │ └── RedisScheduler.java │ │ │ ├── IpProxyProvider.java │ │ │ ├── RequestAction.java │ │ │ ├── utils │ │ │ ├── BeanUtil.java │ │ │ ├── CrawlerUtils.java │ │ │ └── CountableThreadPool.java │ │ │ ├── Processor.java │ │ │ ├── Site.java │ │ │ ├── Response.java │ │ │ ├── Request.java │ │ │ └── Spider.java │ └── test │ │ ├── java │ │ └── net │ │ │ └── xby1993 │ │ │ └── crawler │ │ │ ├── StartAllJoke.java │ │ │ ├── AppTest.java │ │ │ ├── ZhihuRecommendCrawler.java │ │ │ ├── OSChinaTweetsCrawler.java │ │ │ ├── QiushibaikeCrawler.java │ │ │ ├── NeihanshequCrawler.java │ │ │ └── GithubCrawler.java │ │ └── resources │ │ └── logback.xml └── pom.xml ├── crawler-selenium ├── src │ └── main │ │ └── java │ │ └── com │ │ └── github │ │ └── xbynet │ │ └── crawler │ │ └── selenium │ │ ├── SeleniumAction.java │ │ ├── WebDriverPool.java │ │ ├── getCssAttr.js │ │ ├── ImageRegion.java │ │ ├── ImageUtil.java │ │ ├── WebDriverManager.java │ │ ├── SeleniumDownloader.java │ │ ├── PhantomjsWebDriverPool.java │ │ └── WindowUtil.java └── pom.xml ├── .gitignore ├── LICENSE ├── README.md └── pom.xml /docs/tutorial.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler-server/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/META-INF/context.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/index.jsp: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello World!

4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/parser/Parser.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.parser; 2 | 3 | public interface Parser { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/ISpider.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | public interface ISpider { 4 | String getName(); 5 | 6 | } 7 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xbynet/crawler/HEAD/crawler-server/src/main/webapp/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/SpiderListener.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | public interface SpiderListener { 4 | void success(Spider spider,Request request); 5 | void fail(Spider spider,Request request,Exception e); 6 | } 7 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Const.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | public class Const { 4 | public enum HttpMethod{ 5 | GET,POST,HEAD 6 | } 7 | public enum CssAttr{ 8 | innerHtml,text,allText 9 | } 10 | public enum ResponseType{ 11 | TEXT,BIN 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/SeleniumAction.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import org.openqa.selenium.WebDriver; 4 | 5 | /** 6 | * @author taojw 7 | * 8 | */ 9 | public interface SeleniumAction { 10 | void execute(WebDriver driver); 11 | } 12 | 
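A minimal sketch of a SeleniumAction implementation (not part of the repository; the class name and scrolling behaviour are invented for illustration), assuming a page that needs to be scrolled before its source is read:

package com.github.xbynet.crawler.selenium;

import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;

// Hypothetical example: scroll to the bottom of the page so lazily loaded
// content is rendered before the page source is handed to the parser.
public class ScrollToBottomAction implements SeleniumAction {
    @Override
    public void execute(WebDriver driver) {
        ((JavascriptExecutor) driver).executeScript(
                "window.scrollTo(0, document.body.scrollHeight);");
    }
}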
-------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/StartAllJoke.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | public class StartAllJoke { 4 | public static void main(String[] args) { 5 | new OSChinaTweetsCrawler().start(); 6 | new QiushibaikeCrawler().start(); 7 | new NeihanshequCrawler().start(); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/WebDriverPool.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import org.openqa.selenium.WebDriver; 4 | 5 | public interface WebDriverPool { 6 | WebDriver get() throws InterruptedException; 7 | void returnToPool(WebDriver webDriver); 8 | void close(WebDriver webDriver); 9 | void shutdown(); 10 | } 11 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/Downloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.Closeable; 4 | 5 | import com.github.xbynet.crawler.Request; 6 | import com.github.xbynet.crawler.Spider; 7 | 8 | public interface Downloader extends Closeable{ 9 | void init(); 10 | void download(Request request); 11 | void setSpider(Spider spider); 12 | } 13 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/annotation/Nullable.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | @Target(ElementType.PARAMETER) 9 | @Retention(RetentionPolicy.SOURCE) 10 | public @interface Nullable { 11 | 12 | } 13 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 7 | Archetype Created Web Application 8 | 9 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/DuplicateRemover.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.scheduler; 2 | 3 | import com.github.xbynet.crawler.ISpider; 4 | import com.github.xbynet.crawler.Request; 5 | 6 | public interface DuplicateRemover { 7 | public boolean isDuplicate(Request request, ISpider spider); 8 | public void resetDuplicateCheck(ISpider spider); 9 | public int getTotalRequestsCount(ISpider spider); 10 | 11 | } -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/getCssAttr.js: -------------------------------------------------------------------------------- 1 | function getStyle(obj, attr) { 2 | if (obj.currentStyle) { 3 | return obj.currentStyle[attr]; 4 | } else { 5 | return document.defaultView.getComputedStyle(obj, null)[attr]; 6 | } 7 | } 8 | function getCssAttr(sel,attr){ 9 | var 
tmp=document.querySelector(sel); 10 | var res=getStyle(tmp,attr); 11 | return res; 12 | } 13 | return getCssAttr(arguments[0],arguments[1]); -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/js/spider-list.js: -------------------------------------------------------------------------------- 1 | function changeState(name){ 2 | var t=$("#stateBtn").text().trim(); 3 | var method='start'; 4 | if(t=='停止'){ 5 | method='stop'; 6 | } 7 | $.get(baseUrl+"monitor?name="+name+"&method="+method,function(data){ 8 | if(data=='true'){ 9 | $("#stateBtn").text(method=='start'?'停止':'启动'); 10 | $("#status").text(method=='start'?"running":"stopping..."); 11 | }else{ 12 | alert("请求失败:"+data); 13 | } 14 | },"text") 15 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | /.settings 3 | /target 4 | /.classpath 5 | /crawler-core/target 6 | /crawler-selenium/target 7 | /crawler-core/.project 8 | /crawler-core/.settings 9 | /crawler-core/.classpath 10 | /crawler-selenium/.project 11 | /crawler-selenium/.settings 12 | /crawler-selenium/.classpath 13 | /crawler-server/.tern-project 14 | /crawler-server/.settings 15 | /crawler-server/target 16 | /crawler-server/tomcat.8666 17 | /crawler-server/.classpath 18 | /crawler-server/.project 19 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/Scheduler.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.scheduler; 2 | 3 | import com.github.xbynet.crawler.ISpider; 4 | import com.github.xbynet.crawler.Request; 5 | 6 | public interface Scheduler { 7 | public void push(Request request,ISpider spider); 8 | public Request poll(ISpider spider); 9 | public int getLeftRequestsCount(ISpider spider); 10 | public int getTotalRequestsCount(ISpider spider); 11 | public DuplicateRemover getDuplicateRemover(); 12 | } 13 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/IpProxyProvider.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.io.Closeable; 4 | import java.io.IOException; 5 | 6 | import org.apache.http.HttpHost; 7 | 8 | public class IpProxyProvider implements Closeable{ 9 | 10 | public HttpHost getIp(){ 11 | return null; 12 | } 13 | public void invalid(HttpHost host){ 14 | 15 | } 16 | public void valid(HttpHost host){ 17 | 18 | } 19 | @Override 20 | public void close() throws IOException { 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/RequestAction.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.apache.http.client.methods.CloseableHttpResponse; 6 | import org.apache.http.client.methods.HttpUriRequest; 7 | import org.apache.http.impl.client.CloseableHttpClient; 8 | 9 | public interface RequestAction extends Serializable { 10 | void before(CloseableHttpClient client,HttpUriRequest req); 11 | void after(CloseableHttpClient client,CloseableHttpResponse resp); 12 | } 13 | 
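A sketch of one way RequestAction could be implemented (the class and header are invented for illustration, assuming a site that expects a token on every request); before() runs just before the HttpClient call and after() runs once the response is back:

package com.github.xbynet.crawler;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;

// Hypothetical example: attach an Authorization header before the request is
// executed and log the status code after the response arrives.
public class AuthHeaderAction implements RequestAction {
    private static final long serialVersionUID = 1L;
    private final String token;

    public AuthHeaderAction(String token) {
        this.token = token;
    }

    @Override
    public void before(CloseableHttpClient client, HttpUriRequest req) {
        req.addHeader("Authorization", "Bearer " + token);
    }

    @Override
    public void after(CloseableHttpClient client, CloseableHttpResponse resp) {
        System.out.println("response status: " + resp.getStatusLine().getStatusCode());
    }
}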
-------------------------------------------------------------------------------- /crawler-core/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %-5level %msg [%logger{16} %d{HH:mm:ss}]%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/ImageRegion.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | /** 4 | * @author taojw 5 | * 6 | */ 7 | public class ImageRegion { 8 | public int x; 9 | public int y; 10 | public int width; 11 | public int height; 12 | public ImageRegion(int x,int y,int width,int height){ 13 | this.x=x; 14 | this.y=y; 15 | this.width=width; 16 | this.height=height; 17 | } 18 | @Override 19 | public String toString() { 20 | return "ImageRegion [x=" + x + ", y=" + y + ", width=" + width 21 | + ", height=" + height + "]"; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /crawler-core/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.github.xbynet 6 | crawler-parent 7 | 0.3.0 8 | 9 | crawler-core 10 | jar 11 | 12 | 13 | redis.clients 14 | jedis 15 | 16 | 17 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/utils/BeanUtil.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.utils; 2 | 3 | import java.util.concurrent.ConcurrentHashMap; 4 | 5 | import net.sf.cglib.beans.BeanCopier; 6 | 7 | public class BeanUtil { 8 | public static ConcurrentHashMap beanCopierMap = new ConcurrentHashMap(); 9 | 10 | public static void copyProperties(Object source, Object target) { 11 | String beanKey = generateKey(source.getClass(), target.getClass()); 12 | BeanCopier copier = null; 13 | copier = BeanCopier.create(source.getClass(), target.getClass(), false); 14 | beanCopierMap.putIfAbsent(beanKey, copier); 15 | copier = beanCopierMap.get(beanKey); 16 | copier.copy(source, target, null); 17 | } 18 | 19 | private static String generateKey(Class class1, Class class2) { 20 | return class1.toString() + class2.toString(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/ImageUtil.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.io.IOException; 4 | 5 | import net.coobird.thumbnailator.Thumbnails; 6 | 7 | /** 8 | * @author taojw 9 | * 10 | */ 11 | public class ImageUtil { 12 | public static void crop(String srcfile,String destfile,ImageRegion region){ 13 | //指定坐标 14 | try { 15 | Thumbnails.of(srcfile) 16 | .sourceRegion(region.x, region.y, region.width, region.height) 17 | .size(region.width, region.height).outputQuality(1.0) 18 | //.keepAspectRatio(false) //不保持比例 19 | .toFile(destfile); 20 | } catch (IOException e) { 21 | // TODO Auto-generated catch block 22 | e.printStackTrace(); 23 | } 24 | } 25 | public static void main(String[] args) { 26 | crop("D:\\data\\111.png","D:\\data\\1112.png",new ImageRegion(66, 264, 422, 426)); 27 | } 28 | } 29 | 
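A short usage sketch for the BeanUtil helper above (both bean classes are invented for the example); the underlying cglib BeanCopier copies properties whose name and type match on source and target:

import com.github.xbynet.crawler.utils.BeanUtil;

public class BeanUtilDemo {
    // Invented bean types for the example.
    public static class Article {
        private String title;
        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
    }

    public static class ArticleDto {
        private String title;
        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
    }

    public static void main(String[] args) {
        Article source = new Article();
        source.setTitle("hello");
        ArticleDto target = new ArticleDto();
        // Copies the matching "title" property from source to target.
        BeanUtil.copyProperties(source, target);
        System.out.println(target.getTitle()); // prints "hello"
    }
}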
-------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/Main.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server; 2 | 3 | import org.apache.catalina.core.StandardContext; 4 | import org.apache.catalina.startup.Tomcat; 5 | 6 | 7 | /** 8 | *Embeded Tomcat 9 | *http://www.oracle.com/webfolder/technetwork/tutorials/obe/java/basic_app_embedded_tomcat/basic_app-tomcat-embedded.html 10 | *https://github.com/heroku/devcenter-embedded-tomcat 11 | */ 12 | public class Main { 13 | 14 | public static void main(String[] args) throws Exception { 15 | String contextPath = "/"; 16 | String appBase = "."; 17 | Tomcat tomcat = new Tomcat(); 18 | tomcat.setPort(8666); 19 | tomcat.getHost().setAppBase(appBase); 20 | StandardContext ctx=(StandardContext)tomcat.addWebapp(contextPath, appBase);//Context ctx = tomcat.addContext("/", new File(".").getAbsolutePath()); 21 | 22 | tomcat.start(); 23 | tomcat.getServer().await(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import org.apache.http.client.methods.CloseableHttpResponse; 4 | import org.apache.http.client.methods.HttpUriRequest; 5 | import org.apache.http.impl.client.CloseableHttpClient; 6 | 7 | import com.alibaba.fastjson.JSONObject; 8 | 9 | import junit.framework.Test; 10 | import junit.framework.TestCase; 11 | import junit.framework.TestSuite; 12 | 13 | /** 14 | * Unit test for simple App. 15 | */ 16 | public class AppTest 17 | extends TestCase 18 | { 19 | /** 20 | * Create the test case 21 | * 22 | * @param testName name of the test case 23 | */ 24 | public AppTest( String testName ) 25 | { 26 | super( testName ); 27 | } 28 | 29 | /** 30 | * @return the suite of tests being tested 31 | */ 32 | public static Test suite() 33 | { 34 | return new TestSuite( AppTest.class ); 35 | } 36 | 37 | /** 38 | * Rigourous Test :-) 39 | */ 40 | public void testApp() 41 | { 42 | assertTrue( true ); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 xbynet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crawler-selenium/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.github.xbynet 6 | crawler-parent 7 | 0.3.0 8 | 9 | crawler-selenium 10 | jar 11 | 12 | 13 | com.github.xbynet 14 | crawler-core 15 | ${project.version} 16 | 17 | 18 | org.seleniumhq.selenium 19 | selenium-java 20 | 21 | 22 | com.codeborne 23 | phantomjsdriver 24 | 25 | 26 | net.coobird 27 | thumbnailator 28 | 0.4.8 29 | 30 | 31 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Processor.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.io.Closeable; 4 | import java.io.IOException; 5 | 6 | import com.github.xbynet.crawler.http.FileDownloader; 7 | 8 | /** 9 | *爬虫页面处理器,撰写爬虫时需要扩展此类 10 | */ 11 | public abstract class Processor implements Closeable{ 12 | private FileDownloader fileDownloader=null; 13 | private Spider spider=null; 14 | 15 | public abstract void process(Response resp); 16 | 17 | public boolean download(Request req,String savePath){ 18 | return fileDownloader.download(req, savePath); 19 | } 20 | public boolean download(String url,String savePath){ 21 | Request req=new Request(url); 22 | return fileDownloader.download(req, savePath); 23 | } 24 | public FileDownloader getFileDownloader() { 25 | return fileDownloader; 26 | } 27 | 28 | public void setFileDownloader(FileDownloader fileDownloader) { 29 | this.fileDownloader = fileDownloader; 30 | } 31 | @Override 32 | public void close()throws IOException{ 33 | 34 | } 35 | 36 | public Spider getSpider() { 37 | return spider; 38 | } 39 | 40 | public void setSpider(Spider spider) { 41 | this.spider = spider; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/HelloServlet.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.servlet.ServletException; 6 | import javax.servlet.ServletOutputStream; 7 | import javax.servlet.annotation.WebServlet; 8 | import javax.servlet.http.HttpServlet; 9 | import javax.servlet.http.HttpServletRequest; 10 | import javax.servlet.http.HttpServletResponse; 11 | 12 | import com.github.xbynet.crawler.Spider; 13 | import com.github.xbynet.crawler.server.demo.GithubCrawler; 14 | import com.github.xbynet.crawler.server.monitor.SpiderManager; 15 | 16 | @WebServlet( 17 | name = "MyServlet", 18 | urlPatterns = {"/hello"} 19 | ) 20 | public class HelloServlet extends HttpServlet { 21 | 22 | @Override 23 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) 24 | throws ServletException, IOException { 25 | ServletOutputStream out = resp.getOutputStream(); 26 | Spider s=new GithubCrawler().createSpider(); 27 | SpiderManager.get().add(s); 28 | out.write(("add spider of "+s.getName()).getBytes()); 29 | out.flush(); 30 | out.close(); 31 | } 32 | 33 | } 
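Processor (above) is the extension point for writing a crawler: subclass it, implement process(Response), and start a Spider with the builder. A minimal sketch following the pattern of the bundled test crawlers; the URL and CSS selector are placeholders:

import com.github.xbynet.crawler.Processor;
import com.github.xbynet.crawler.Response;
import com.github.xbynet.crawler.Site;
import com.github.xbynet.crawler.Spider;
import com.github.xbynet.crawler.parser.JsoupParser;

public class MinimalCrawler extends Processor {
    @Override
    public void process(Response resp) {
        // Parse the downloaded page with the Jsoup-backed parser and print the
        // text of the first h1 element; the selector is a placeholder.
        JsoupParser parser = resp.html();
        System.out.println(parser.single("h1", "text"));
    }

    public static void main(String[] args) {
        Site site = new Site();
        Spider spider = Spider.builder(new MinimalCrawler())
                .threadNum(1)
                .site(site)
                .urls("https://example.com/")
                .build();
        spider.run();
    }
}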
-------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/utils/CrawlerUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.utils; 2 | 3 | import javax.script.Invocable; 4 | import javax.script.ScriptEngine; 5 | import javax.script.ScriptEngineManager; 6 | 7 | import org.apache.commons.lang3.StringUtils; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import com.github.xbynet.crawler.annotation.Nullable; 12 | 13 | public class CrawlerUtils { 14 | private static final Logger log=LoggerFactory.getLogger(CrawlerUtils.class); 15 | 16 | public static void sleep(int millis){ 17 | try { 18 | Thread.sleep(millis); 19 | } catch (InterruptedException e) { 20 | log.warn("",e); 21 | } 22 | } 23 | 24 | public Object executeJs(String js,@Nullable String funcName,Object... args){ 25 | ScriptEngineManager manager = new ScriptEngineManager(); 26 | ScriptEngine engine = manager.getEngineByName("javascript"); 27 | try { 28 | Object res=engine.eval(js); 29 | if(StringUtils.isNotBlank(funcName)){ 30 | if (engine instanceof Invocable) { 31 | Invocable invoke = (Invocable) engine; 32 | res = invoke.invokeFunction(funcName, args); 33 | } 34 | } 35 | return res; 36 | } catch (Exception e) { 37 | log.error("",e); 38 | } 39 | return null; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/parser/JsonPathParser.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.parser; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.jayway.jsonpath.JsonPath; 7 | import com.jayway.jsonpath.ReadContext; 8 | 9 | public class JsonPathParser implements Parser { 10 | private ReadContext ctx; 11 | 12 | public JsonPathParser(String raw) { 13 | this.ctx = JsonPath.parse(raw); 14 | } 15 | 16 | public String single(String jsonpath) { 17 | Object object = ctx.read(jsonpath); 18 | if (object == null) { 19 | return null; 20 | } 21 | if (object instanceof List) { 22 | List list = (List) object; 23 | if (list != null && list.size() > 0) { 24 | return list.get(0).toString(); 25 | } 26 | } 27 | return object.toString(); 28 | } 29 | 30 | public List list(String jsonpath) { 31 | List reslist = new ArrayList(); 32 | Object object = ctx.read(jsonpath); 33 | if (object == null) { 34 | return reslist; 35 | } 36 | if (object instanceof List) { 37 | List list = (List) object; 38 | for (Object item : list) { 39 | reslist.add(item.toString()); 40 | } 41 | } else { 42 | reslist.add(object.toString()); 43 | } 44 | return reslist; 45 | } 46 | 47 | public ReadContext getCtx() { 48 | return ctx; 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/parser/XpathParser.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.parser; 2 | 3 | import java.util.List; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | 9 | import us.codecraft.xsoup.XPathEvaluator; 10 | import us.codecraft.xsoup.Xsoup; 11 | 12 | public class XpathParser implements Parser{ 13 | 14 | private Document doc; 15 | 16 | public XpathParser(String raw) { 17 | this.doc=Jsoup.parse(raw); 18 
| } 19 | 20 | public String single(String xpathStr) { 21 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr); 22 | return xPathEvaluator.evaluate(doc).get(); 23 | } 24 | 25 | public List list(String xpathStr) { 26 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr); 27 | return xPathEvaluator.evaluate(doc).list(); 28 | } 29 | 30 | public Element element(String xpathStr) { 31 | List elements = elements(xpathStr); 32 | if (elements!=null && elements.size()>0){ 33 | return elements.get(0); 34 | } 35 | return null; 36 | } 37 | 38 | public List elements(String xpathStr) { 39 | XPathEvaluator xPathEvaluator = Xsoup.compile(xpathStr); 40 | return xPathEvaluator.evaluate(doc).getElements(); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/DefaultScheduler.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.scheduler; 2 | 3 | import java.util.Collections; 4 | import java.util.Set; 5 | import java.util.concurrent.BlockingQueue; 6 | import java.util.concurrent.ConcurrentHashMap; 7 | import java.util.concurrent.LinkedBlockingQueue; 8 | 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import com.github.xbynet.crawler.Const; 13 | import com.github.xbynet.crawler.ISpider; 14 | import com.github.xbynet.crawler.Request; 15 | 16 | public class DefaultScheduler implements Scheduler, DuplicateRemover { 17 | private final Logger log = LoggerFactory.getLogger(DefaultScheduler.class); 18 | private Set urls = Collections 19 | .newSetFromMap(new ConcurrentHashMap()); 20 | private BlockingQueue queue = new LinkedBlockingQueue(); 21 | 22 | public void push(Request request, ISpider spider) { 23 | if (Const.HttpMethod.POST == request.getMethod() 24 | || !isDuplicate(request, spider)) { 25 | log.debug("push to queue {}", request.getUrl()); 26 | queue.add(request); 27 | } 28 | } 29 | 30 | public Request poll(ISpider spider) { 31 | return queue.poll(); 32 | } 33 | 34 | public DuplicateRemover getDuplicateRemover(){ 35 | return this; 36 | } 37 | public boolean isDuplicate(Request request, ISpider spider) { 38 | return !urls.add(request.getUrl()); 39 | } 40 | 41 | public void resetDuplicateCheck(ISpider spider) { 42 | urls.clear(); 43 | } 44 | 45 | public int getTotalRequestsCount(ISpider spider) { 46 | return urls.size(); 47 | } 48 | 49 | public int getLeftRequestsCount(ISpider spider) { 50 | return queue.size(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/FileDownloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.IOException; 6 | 7 | import org.apache.commons.io.IOUtils; 8 | import org.apache.http.client.methods.CloseableHttpResponse; 9 | import org.apache.http.client.methods.HttpUriRequest; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import com.github.xbynet.crawler.Request; 14 | import com.github.xbynet.crawler.Response; 15 | import com.github.xbynet.crawler.Site; 16 | 17 | 18 | public class FileDownloader extends AbsDownloader{ 19 | private final Logger log=LoggerFactory.getLogger(FileDownloader.class); 20 | 21 | 22 | public boolean download(Request request,String savePath){ 
23 | log.debug("开始下载文件"+request.getUrl()+"到路径"+savePath); 24 | super.doDownload(request,savePath); 25 | File file=new File(savePath); 26 | return file.exists(); 27 | } 28 | @Override 29 | protected void process(HttpUriRequest httpUriRequest, 30 | CloseableHttpResponse resp, Request request, Site site,Response response,Object... extras) { 31 | if(resp==null){ 32 | log.error("文件"+httpUriRequest.getURI().toString()+"下载失败"); 33 | return; 34 | } 35 | String savePath=extras[0].toString(); 36 | File saveFile=new File(savePath); 37 | if(saveFile.exists()){ 38 | saveFile.delete(); 39 | } 40 | FileOutputStream fous=null; 41 | try { 42 | fous=new FileOutputStream(saveFile); 43 | IOUtils.copy(resp.getEntity().getContent(), fous); 44 | log.debug("文件"+httpUriRequest.getURI().toString()+"下载成功"); 45 | } catch (UnsupportedOperationException e) { 46 | log.error("",e); 47 | } catch (IOException e) { 48 | log.error("",e); 49 | }finally{ 50 | IOUtils.closeQuietly(fous); 51 | } 52 | } 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Site.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class Site { 7 | private String encoding="UTF-8"; 8 | private String ua="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; 9 | private int sleep=20; 10 | private int retry=3; 11 | private int retrySleep=500; 12 | private int timeout=30000; 13 | private Map headers=new HashMap(); 14 | 15 | public Site(){ 16 | getHeaders().put("User-Agent", ua); 17 | } 18 | public String getEncoding() { 19 | return encoding; 20 | } 21 | 22 | public Site setEncoding(String encoding) { 23 | this.encoding = encoding; 24 | return this; 25 | } 26 | 27 | public String getUa() { 28 | return ua; 29 | } 30 | 31 | public Site setUa(String ua) { 32 | getHeaders().put("User-Agent", ua); 33 | return this; 34 | } 35 | 36 | public int getSleep() { 37 | return sleep; 38 | } 39 | 40 | public Site setSleep(int sleep) { 41 | this.sleep = sleep; 42 | return this; 43 | } 44 | 45 | public int getRetry() { 46 | return retry; 47 | } 48 | 49 | public Site setRetry(int retry) { 50 | this.retry = retry; 51 | return this; 52 | } 53 | 54 | public int getRetrySleep() { 55 | return retrySleep; 56 | } 57 | 58 | public Site setRetrySleep(int retrySleep) { 59 | this.retrySleep = retrySleep; 60 | return this; 61 | } 62 | 63 | public int getTimeout() { 64 | return timeout; 65 | } 66 | 67 | public Site setTimeout(int timeout) { 68 | this.timeout = timeout; 69 | return this; 70 | } 71 | 72 | public Site setHeader(String name,String value){ 73 | getHeaders().put(name, value); 74 | return this; 75 | } 76 | public Map getHeaders() { 77 | return headers; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/CustomRedirectStrategy.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.net.URI; 4 | 5 | import org.apache.http.HttpRequest; 6 | import org.apache.http.HttpResponse; 7 | import org.apache.http.ProtocolException; 8 | import org.apache.http.client.methods.HttpGet; 9 | import org.apache.http.client.methods.HttpPost; 10 | import 
org.apache.http.client.methods.HttpRequestWrapper; 11 | import org.apache.http.client.methods.HttpUriRequest; 12 | import org.apache.http.impl.client.LaxRedirectStrategy; 13 | import org.apache.http.protocol.HttpContext; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | /** 18 | *支持post 302跳转策略实现类 19 | *HttpClient默认跳转:httpClientBuilder.setRedirectStrategy(new LaxRedirectStrategy()); 20 | *上述代码在post/redirect/post这种情况下不会传递原有请求的数据信息。所以参考了下SeimiCrawler这个项目的重定向策略。 21 | *原代码地址:https://github.com/zhegexiaohuozi/SeimiCrawler/blob/master/project/src/main/java/cn/wanghaomiao/seimi/http/hc/SeimiRedirectStrategy.java 22 | */ 23 | public class CustomRedirectStrategy extends LaxRedirectStrategy { 24 | private Logger logger = LoggerFactory.getLogger(getClass()); 25 | 26 | @Override 27 | public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException { 28 | URI uri = getLocationURI(request, response, context); 29 | String method = request.getRequestLine().getMethod(); 30 | if ("post".equalsIgnoreCase(method)) { 31 | try { 32 | HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request; 33 | httpRequestWrapper.setURI(uri); 34 | httpRequestWrapper.removeHeaders("Content-Length"); 35 | return httpRequestWrapper; 36 | } catch (Exception e) { 37 | logger.error("强转为HttpRequestWrapper出错"); 38 | } 39 | return new HttpPost(uri); 40 | } else { 41 | return new HttpGet(uri); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/WebDriverManager.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.io.Closeable; 4 | import java.io.IOException; 5 | 6 | import org.openqa.selenium.WebDriver; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | public class WebDriverManager implements Closeable{ 11 | private static final Logger log=LoggerFactory.getLogger(WebDriverManager.class); 12 | 13 | private WebDriverPool webDriverPool=null; 14 | 15 | public WebDriverManager(String phantomjsPath){ 16 | this.webDriverPool=new PhantomjsWebDriverPool(1,false,phantomjsPath); 17 | } 18 | public WebDriverManager(WebDriverPool webDriverPool){ 19 | this.webDriverPool=webDriverPool; 20 | } 21 | public void load(String url,int sleepTimeMillis,SeleniumAction... actions){ 22 | WebDriver driver=null; 23 | try { 24 | driver=webDriverPool.get(); 25 | driver.get(url); 26 | sleep(sleepTimeMillis); 27 | WebDriver.Options manage = driver.manage(); 28 | manage.window().maximize(); 29 | for(SeleniumAction action:actions){ 30 | action.execute(driver); 31 | } 32 | } catch (InterruptedException e) { 33 | e.printStackTrace(); 34 | log.error("",e); 35 | }finally{ 36 | if(driver!=null){ 37 | webDriverPool.returnToPool(driver); 38 | } 39 | } 40 | } 41 | public void load(SeleniumAction... 
actions){ 42 | WebDriver driver=null; 43 | try { 44 | driver=webDriverPool.get(); 45 | WebDriver.Options manage = driver.manage(); 46 | manage.window().maximize(); 47 | for(SeleniumAction action:actions){ 48 | action.execute(driver); 49 | } 50 | } catch (InterruptedException e) { 51 | e.printStackTrace(); 52 | log.error("",e); 53 | }finally{ 54 | if(driver!=null){ 55 | webDriverPool.returnToPool(driver); 56 | } 57 | } 58 | } 59 | public void shutDown(){ 60 | if(webDriverPool!=null){ 61 | webDriverPool.shutdown(); 62 | } 63 | } 64 | @Override 65 | public void close() throws IOException { 66 | shutDown(); 67 | } 68 | public void sleep(long millis){ 69 | try { 70 | Thread.sleep(millis); 71 | } catch (InterruptedException e) { 72 | e.printStackTrace(); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/jsp/new-employee.jsp: -------------------------------------------------------------------------------- 1 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c" %> 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 |
10 | 11 | 12 | 13 | 14 | 15 |

Employee

16 |
17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 |

36 | 37 |
38 |
39 |
40 | 41 | -------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/monitor/SpiderManager.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server.monitor; 2 | 3 | import java.util.concurrent.ConcurrentHashMap; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import com.github.xbynet.crawler.Spider; 9 | 10 | public class SpiderManager { 11 | private Logger log=LoggerFactory.getLogger(SpiderManager.class); 12 | 13 | private ConcurrentHashMap spiders=new ConcurrentHashMap<>(); 14 | 15 | private SpiderManager(){ 16 | 17 | } 18 | 19 | private static class SingleHolder{ 20 | static SpiderManager instance=new SpiderManager(); 21 | } 22 | 23 | public static SpiderManager get(){ 24 | return SingleHolder.instance; 25 | } 26 | 27 | public synchronized void add(Spider... spiders1){ 28 | for(Spider s:spiders1){ 29 | getSpiders().put(s.getName(),s); 30 | } 31 | } 32 | public synchronized Spider remove(String name){ 33 | return getSpiders().remove(name); 34 | } 35 | public synchronized void stopAll(){ 36 | for(String key:getSpiders().keySet()){ 37 | stop(key); 38 | } 39 | } 40 | public synchronized void startAll(){ 41 | for(String key:getSpiders().keySet()){ 42 | start(key); 43 | } 44 | } 45 | public String status(String name){ 46 | if(!getSpiders().containsKey(name)){ 47 | throw new IllegalArgumentException("the spider of "+name+" is not in manager"); 48 | } 49 | Spider spider=getSpiders().get(name); 50 | return spider.getState().name(); 51 | } 52 | 53 | public synchronized boolean stop(String name){ 54 | if(!getSpiders().containsKey(name)){ 55 | throw new IllegalArgumentException("the spider of "+name+" is not in manager"); 56 | } 57 | Spider spider=getSpiders().get(name); 58 | if(spider.isRunning()){ 59 | spider.stop(); 60 | return true; 61 | }else{ 62 | log.warn("illegal status "+spider.getState().name()+" for stop"); 63 | return false; 64 | } 65 | } 66 | public synchronized boolean start(String name){ 67 | if(!getSpiders().containsKey(name)){ 68 | throw new IllegalArgumentException("the spider of "+name+" is not in manager"); 69 | } 70 | Spider spider=getSpiders().get(name); 71 | if(spider.getState()==Spider.Status.NotRun){ 72 | spider.runAsync(); 73 | return true; 74 | } 75 | if(spider.isStopped()){ 76 | if(spider.isShutdownOnComplete()){ 77 | log.warn("spider of "+name+" setShutdownOnComplete=true, so it's not support restart"); 78 | return false; 79 | } 80 | spider.runAsync(); 81 | return true; 82 | } 83 | log.warn("illegal status "+spider.getState().name()+" for start"); 84 | return false; 85 | } 86 | 87 | public ConcurrentHashMap getSpiders() { 88 | return spiders; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/jsp/spider-list.jsp: -------------------------------------------------------------------------------- 1 | <%@ page contentType="text/html;charset=utf-8" %> 2 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c" %> 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | 爬虫监控 15 | 16 | 17 | 18 |
19 |

爬虫监控

20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 42 | 43 | 44 | 45 |
标识 | 页面处理器类名 | 状态 | 操作 | 运行信息
${spider.name}${spider.processor}${spider.status} 39 | 41 | ${spider.info}
46 |
47 | 48 |
49 |
50 | 没有正在运行的爬虫 51 |
52 |
53 |
54 |
55 | 56 | 57 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/DefaultDownloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.commons.io.IOUtils; 10 | import org.apache.http.Header; 11 | import org.apache.http.HeaderElement; 12 | import org.apache.http.client.methods.CloseableHttpResponse; 13 | import org.apache.http.client.methods.HttpUriRequest; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import com.github.xbynet.crawler.Const; 18 | import com.github.xbynet.crawler.Request; 19 | import com.github.xbynet.crawler.Response; 20 | import com.github.xbynet.crawler.Site; 21 | 22 | public class DefaultDownloader extends AbsDownloader { 23 | private final Logger log = LoggerFactory.getLogger(DefaultDownloader.class); 24 | 25 | @Override 26 | public void download(Request request){ 27 | super.doDownload(request); 28 | } 29 | @Override 30 | protected void process(HttpUriRequest httpUriRequest, 31 | CloseableHttpResponse resp, Request request, Site site,Response response, 32 | Object... extras) { 33 | if (resp == null) { 34 | log.error(request.getUrl() + "请求失败"); 35 | return ; 36 | } 37 | response.setCode(resp.getStatusLine().getStatusCode()); 38 | response.setContentType(resp.getFirstHeader("Content-Type").getValue()); 39 | Const.ResponseType type = null; 40 | try { 41 | if (response.getContentType().contains("text") 42 | || response.getContentType().contains("json")) { 43 | type = Const.ResponseType.TEXT; 44 | String raw=IOUtils.toString(resp.getEntity().getContent(), 45 | request.getEncoding() != null ? 
request.getEncoding() 46 | : site.getEncoding()); 47 | response.setRaw(raw); 48 | } else { 49 | type = Const.ResponseType.BIN; 50 | response.setBytes(IOUtils.toByteArray(resp.getEntity() 51 | .getContent())); 52 | } 53 | } catch (UnsupportedOperationException e) { 54 | log.error("", e); 55 | } catch (IOException e) { 56 | log.error("", e); 57 | } 58 | response.setRespType(type); 59 | response.setRequest(request); 60 | 61 | Map> headers=new HashMap>(); 62 | for(Header header:resp.getAllHeaders()){ 63 | List value=new ArrayList(); 64 | HeaderElement[] hes=header.getElements(); 65 | if(hes!=null && hes.length>1){ 66 | for(HeaderElement e:hes){ 67 | value.add(e.getValue()); 68 | } 69 | }else{ 70 | value.add(header.getValue()); 71 | } 72 | headers.put(header.getName(), value); 73 | } 74 | response.setHeaders(headers); 75 | try { 76 | getSpider().getProcessor().process(response); 77 | } catch (Exception e) { 78 | log.error("",e); 79 | } 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/utils/CountableThreadPool.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.utils; 2 | 3 | import java.util.concurrent.ExecutorService; 4 | import java.util.concurrent.Executors; 5 | import java.util.concurrent.TimeUnit; 6 | import java.util.concurrent.atomic.AtomicInteger; 7 | import java.util.concurrent.locks.Condition; 8 | import java.util.concurrent.locks.ReentrantLock; 9 | 10 | 11 | public class CountableThreadPool { 12 | 13 | private int threadNum; 14 | 15 | private AtomicInteger threadAlive = new AtomicInteger(); 16 | 17 | private ReentrantLock reentrantLock = new ReentrantLock(); 18 | 19 | private Condition condition = reentrantLock.newCondition(); 20 | 21 | public CountableThreadPool(int threadNum) { 22 | this.threadNum = threadNum; 23 | this.executorService = Executors.newFixedThreadPool(threadNum); 24 | } 25 | 26 | public CountableThreadPool(int threadNum, ExecutorService executorService) { 27 | this.threadNum = threadNum; 28 | this.executorService = executorService; 29 | } 30 | 31 | public void setExecutorService(ExecutorService executorService) { 32 | this.executorService = executorService; 33 | } 34 | 35 | public int getThreadAlive() { 36 | return threadAlive.get(); 37 | } 38 | 39 | public int getThreadNum() { 40 | return threadNum; 41 | } 42 | 43 | private ExecutorService executorService; 44 | 45 | public void execute(final Runnable runnable) { 46 | 47 | 48 | if (threadAlive.get() >= threadNum) { 49 | try { 50 | reentrantLock.lock(); 51 | while (threadAlive.get() >= threadNum) { 52 | try { 53 | condition.await(); 54 | } catch (InterruptedException e) { 55 | } 56 | } 57 | } finally { 58 | reentrantLock.unlock(); 59 | } 60 | } 61 | threadAlive.incrementAndGet(); 62 | executorService.execute(new Runnable() { 63 | public void run() { 64 | try { 65 | runnable.run(); 66 | } finally { 67 | try { 68 | reentrantLock.lock(); 69 | threadAlive.decrementAndGet(); 70 | condition.signal(); 71 | } finally { 72 | reentrantLock.unlock(); 73 | } 74 | } 75 | } 76 | }); 77 | } 78 | 79 | public boolean isShutdown() { 80 | return executorService.isShutdown(); 81 | } 82 | 83 | public void shutdown() { 84 | executorService.shutdown(); 85 | } 86 | public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException{ 87 | return executorService.awaitTermination(timeout, unit); 88 | } 89 | 90 | 91 | } 92 | 
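A small usage sketch for CountableThreadPool above (task bodies invented for illustration): execute() blocks the submitting thread once threadNum tasks are already running, so submissions are throttled to the pool size:

import java.util.concurrent.TimeUnit;

import com.github.xbynet.crawler.utils.CountableThreadPool;

public class PoolDemo {
    public static void main(String[] args) throws InterruptedException {
        CountableThreadPool pool = new CountableThreadPool(2);
        for (int i = 0; i < 5; i++) {
            final int id = i;
            // With 2 worker slots, the third submission waits inside execute()
            // until one of the running tasks finishes.
            pool.execute(() -> {
                try {
                    Thread.sleep(200);
                } catch (InterruptedException ignored) {
                }
                System.out.println("task " + id + " done, alive=" + pool.getThreadAlive());
            });
        }
        pool.shutdown();
        pool.awaitTermination(5, TimeUnit.SECONDS);
    }
}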
-------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/monitor/MonitorServlet.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server.monitor; 2 | 3 | import java.io.IOException; 4 | import java.text.SimpleDateFormat; 5 | import java.util.ArrayList; 6 | import java.util.Date; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.concurrent.ConcurrentHashMap; 11 | 12 | import javax.servlet.ServletException; 13 | import javax.servlet.ServletOutputStream; 14 | import javax.servlet.annotation.WebServlet; 15 | import javax.servlet.http.HttpServlet; 16 | import javax.servlet.http.HttpServletRequest; 17 | import javax.servlet.http.HttpServletResponse; 18 | 19 | import org.apache.commons.lang3.StringUtils; 20 | 21 | import com.github.xbynet.crawler.Spider; 22 | 23 | @WebServlet( 24 | name = "MonitorServlet", 25 | urlPatterns = {"/monitor"} 26 | ) 27 | public class MonitorServlet extends HttpServlet{ 28 | 29 | @Override 30 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) 31 | throws ServletException, IOException { 32 | String method=req.getParameter("method"); 33 | String name=req.getParameter("name"); 34 | String uri=req.getRequestURI(); 35 | if(StringUtils.isBlank(method)){ 36 | List> infolist = new ArrayList<>(); 37 | ConcurrentHashMap spiders = SpiderManager.get() 38 | .getSpiders(); 39 | for (String key : spiders.keySet()) { 40 | Map map = new HashMap<>(); 41 | Spider spider = spiders.get(key); 42 | map.put("name", key); 43 | map.put("processor", spider.getProcessor().getClass().getName()); 44 | map.put("status", spider.getState().name().toLowerCase()); 45 | SimpleDateFormat sdf = new SimpleDateFormat( 46 | "yyyy-MM-dd HH:mm:ss"); 47 | Date start = spider.getStartTime(); 48 | Date end = spider.getEndTime(); 49 | end = end == null ? new Date() : end; 50 | long runsecs = start == null ? 0 : (end.getTime() - start 51 | .getTime()) / 1000; 52 | map.put("info", 53 | "开始时间:" 54 | + (start == null ? 
"无" : sdf.format(start)) 55 | + ",运行时间:" 56 | + runsecs 57 | + "秒," 58 | + "总请求数:" 59 | + spider.getScheduler().getTotalRequestsCount( 60 | spider) 61 | + ",剩余请求数:" 62 | + spider.getScheduler().getLeftRequestsCount( 63 | spider)); 64 | 65 | infolist.add(map); 66 | } 67 | req.setAttribute("root", req.getServletContext().getContextPath()); 68 | req.setAttribute("spiders", infolist); 69 | req.getRequestDispatcher("/jsp/spider-list.jsp").forward(req, resp); 70 | }else if(method.equals("start")){ 71 | outString(resp, String.valueOf(SpiderManager.get().start(name))); 72 | }else if(method.equals("stop")){ 73 | outString(resp, String.valueOf(SpiderManager.get().stop(name))); 74 | } 75 | } 76 | public void outString(HttpServletResponse resp,String content) throws IOException{ 77 | ServletOutputStream out = resp.getOutputStream(); 78 | out.write(content.getBytes()); 79 | out.flush(); 80 | out.close(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/ZhihuRecommendCrawler.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.Date; 8 | import java.util.concurrent.atomic.AtomicInteger; 9 | 10 | import org.apache.commons.io.FileUtils; 11 | import org.apache.commons.io.IOUtils; 12 | import org.jsoup.Jsoup; 13 | import org.jsoup.nodes.Element; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import com.github.xbynet.crawler.Const; 18 | import com.github.xbynet.crawler.Processor; 19 | import com.github.xbynet.crawler.Request; 20 | import com.github.xbynet.crawler.Response; 21 | import com.github.xbynet.crawler.Site; 22 | import com.github.xbynet.crawler.Spider; 23 | import com.github.xbynet.crawler.parser.JsonPathParser; 24 | 25 | public class ZhihuRecommendCrawler extends Processor{ 26 | private Logger log=LoggerFactory.getLogger(ZhihuRecommendCrawler.class); 27 | private AtomicInteger offset=new AtomicInteger(0); 28 | 29 | @Override 30 | public void process(Response resp) { 31 | String curUrl=resp.getRequest().getUrl(); 32 | JsonPathParser parser=resp.json(); 33 | int count=Integer.valueOf(parser.single("$.msg.length()")); 34 | if(count>0){ 35 | resp.addRequest(getPostRequest(offset.addAndGet(20))); 36 | } 37 | StringBuilder sb=new StringBuilder(); 38 | for(int i=0;i

"+title+"

"+authorAndInfo+"查看
"+content+"
\n"); 46 | } 47 | appendToFile(sb.toString()); 48 | 49 | } 50 | public void start() { 51 | Site site = new Site(); 52 | site.setHeader("Referer", "https://www.zhihu.com/explore/recommendations"); 53 | Spider spider = Spider.builder(this).threadNum(5).site(site) 54 | .requests(getPostRequest(0)).build(); 55 | spider.run(); 56 | appendToFile(""); 57 | } 58 | private Request getPostRequest(int offset){ 59 | Request req=new Request("https://www.zhihu.com/node/ExploreRecommendListV2"); 60 | req.setMethod(Const.HttpMethod.POST); 61 | req.setParams("method", "next"); 62 | req.setParams("params", "{\"limit\":20,\"offset\":"+offset+"}"); 63 | return req; 64 | } 65 | private synchronized void appendToFile(String content){ 66 | SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMdd-HH"); 67 | File f=new File("D:\\code\\test\\tweets\\"+sdf.format(new Date())+".zhihu.html"); 68 | if(!f.exists()){ 69 | try { 70 | f.createNewFile(); 71 | FileUtils.write(f, "","UTF-8"); 72 | } catch (IOException e) { 73 | e.printStackTrace(); 74 | } 75 | } 76 | FileWriter writer=null; 77 | try { 78 | writer=new FileWriter(f,true); 79 | writer.write(content); 80 | } catch (IOException e) { 81 | e.printStackTrace(); 82 | }finally{ 83 | IOUtils.closeQuietly(writer); 84 | } 85 | } 86 | public static void main(String[] args) { 87 | new ZhihuRecommendCrawler().start(); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/OSChinaTweetsCrawler.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.ArrayList; 8 | import java.util.Date; 9 | import java.util.List; 10 | import java.util.concurrent.atomic.AtomicInteger; 11 | 12 | import org.apache.commons.io.FileUtils; 13 | import org.apache.commons.io.IOUtils; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | 17 | import com.github.xbynet.crawler.Processor; 18 | import com.github.xbynet.crawler.Request; 19 | import com.github.xbynet.crawler.Response; 20 | import com.github.xbynet.crawler.Site; 21 | import com.github.xbynet.crawler.Spider; 22 | import com.github.xbynet.crawler.parser.JsoupParser; 23 | 24 | public class OSChinaTweetsCrawler extends Processor{ 25 | private final int maxPageCount=20; 26 | private final AtomicInteger count=new AtomicInteger(0); 27 | @Override 28 | public void process(Response resp) { 29 | synchronized (count) { 30 | if(count.get()>maxPageCount) 31 | return; 32 | } 33 | count.addAndGet(1); 34 | String currentUrl=resp.getRequest().getUrl(); 35 | JsoupParser parser=resp.html(); 36 | List lastIds=parser.list("span[data-last]","data-last"); 37 | String lastId=lastIds.get(lastIds.size()-1); 38 | String continueUrl="https://www.oschina.net/tweets?lastLogId="+lastId; 39 | Request req=new Request(continueUrl); 40 | req.setHeader("Referer", currentUrl); 41 | req.setHeader("X-Requested-With", "XMLHttpRequest"); 42 | resp.addRequest(req); 43 | 44 | StringBuilder sb=new StringBuilder(); 45 | List authors=parser.list(".tweetitem .box-fl > a","title"); 46 | List itemUrls=parser.list(".tweetitem .ti-toolbox a[title=\"查看详情\"]","href"); 47 | List itemContents=new ArrayList(itemUrls.size()); 48 | Elements els=parser.elements(".tweetitem"); 49 | for(Element e:els){ 50 | String tmp=e.select(".ti-content > .inner-content").first().html(); 
51 | itemContents.add(tmp.replace("src=\"/", "src=\"https://www.oschina.net/")); 52 | } 53 | for(int i=0;i"+authors.get(i)+"查看"+itemContents.get(i)+"\n"); 55 | } 56 | appendToFile(sb.toString()); 57 | } 58 | public void start() { 59 | Site site = new Site(); 60 | site.setEncoding("UTF-8"); 61 | site.setHeader("Referer", "https://www.oschina.net/"); 62 | Spider spider = Spider.builder(this).threadNum(1).site(site) 63 | .urls("https://www.oschina.net/tweets?nocache="+System.currentTimeMillis()).build(); 64 | spider.run(); 65 | appendToFile(""); 66 | } 67 | public static void main(String[] args) { 68 | new OSChinaTweetsCrawler().start(); 69 | } 70 | private synchronized void appendToFile(String content){ 71 | SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMdd-HH"); 72 | File f=new File("D:\\code\\test\\tweets\\"+sdf.format(new Date())+".oschina.html"); 73 | if(!f.exists()){ 74 | try { 75 | f.createNewFile(); 76 | FileUtils.write(f, "","UTF-8"); 77 | } catch (IOException e) { 78 | e.printStackTrace(); 79 | } 80 | } 81 | FileWriter writer=null; 82 | try { 83 | writer=new FileWriter(f,true); 84 | writer.write(content); 85 | } catch (IOException e) { 86 | e.printStackTrace(); 87 | }finally{ 88 | IOUtils.closeQuietly(writer); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/QiushibaikeCrawler.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.ArrayList; 8 | import java.util.Date; 9 | import java.util.List; 10 | import java.util.concurrent.atomic.AtomicInteger; 11 | 12 | import org.apache.commons.io.FileUtils; 13 | import org.apache.commons.io.IOUtils; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | 17 | import com.github.xbynet.crawler.Processor; 18 | import com.github.xbynet.crawler.Response; 19 | import com.github.xbynet.crawler.Site; 20 | import com.github.xbynet.crawler.Spider; 21 | import com.github.xbynet.crawler.parser.JsoupParser; 22 | 23 | public class QiushibaikeCrawler extends Processor{ 24 | @Override 25 | public void process(Response resp) { 26 | String currentUrl=resp.getRequest().getUrl(); 27 | JsoupParser parser=resp.html(); 28 | if(currentUrl.equals("https://www.qiushibaike.com/")){ 29 | int pageCount=Integer.valueOf(parser.single("ul.pagination > li:nth-last-child(2) .page-numbers","text").trim()); 30 | System.out.println("8hr共有"+pageCount+"页"); 31 | for(int i=2;i<=pageCount;i++){ 32 | resp.addRequest("https://www.qiushibaike.com/8hr/page/"+i+"/", false); 33 | } 34 | }else if(currentUrl.equals("https://www.qiushibaike.com/hot/")){ 35 | int pageCount=Integer.valueOf(parser.single("ul.pagination > li:nth-last-child(2) .page-numbers","text").trim()); 36 | System.out.println("hot共有"+pageCount+"页"); 37 | for(int i=2;i<=pageCount;i++){ 38 | resp.addRequest("https://www.qiushibaike.com/hot/page/"+i+"/", false); 39 | } 40 | } 41 | Elements els=parser.elements("#content-left > div"); 42 | StringBuilder sb=new StringBuilder(); 43 | for(Element e:els){ 44 | String author=e.select(".author > a:nth-child(2)").attr("title").trim(); 45 | String link="https://www.qiushibaike.com"+e.select(".contentHerf").attr("href"); 46 | String content=e.select(".contentHerf .content").html(); 47 | Elements thumbEls=e.select(".thumb"); 48 | 
if(thumbEls!=null && thumbEls.size()>0){ 49 | content+=thumbEls.get(0).outerHtml().replace("src=\"//", "src=\"http://"); 50 | } 51 | sb.append("
"+author+"查看"+content+"
\n"); 52 | 53 | } 54 | appendToFile(sb.toString()); 55 | } 56 | public void start() { 57 | Site site = new Site(); 58 | // site.setEncoding("UTF-8"); 59 | site.setHeader("Referer", "https://www.qiushibaike.com/"); 60 | Spider spider = Spider.builder(this).threadNum(1).site(site) 61 | .urls("https://www.qiushibaike.com/","https://www.qiushibaike.com/hot/").build(); 62 | spider.run(); 63 | appendToFile(""); 64 | } 65 | public static void main(String[] args) { 66 | new QiushibaikeCrawler().start(); 67 | } 68 | private synchronized void appendToFile(String content){ 69 | SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMdd-HH"); 70 | File f=new File("D:\\code\\test\\tweets\\"+sdf.format(new Date())+".qiushibaike.html"); 71 | if(!f.exists()){ 72 | try { 73 | f.createNewFile(); 74 | FileUtils.write(f, "","UTF-8"); 75 | } catch (IOException e) { 76 | e.printStackTrace(); 77 | } 78 | } 79 | FileWriter writer=null; 80 | try { 81 | writer=new FileWriter(f,true); 82 | writer.write(content); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | }finally{ 86 | IOUtils.closeQuietly(writer); 87 | } 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/SeleniumDownloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.openqa.selenium.By; 10 | import org.openqa.selenium.WebDriver; 11 | import org.openqa.selenium.WebElement; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import com.github.xbynet.crawler.Const; 16 | import com.github.xbynet.crawler.Request; 17 | import com.github.xbynet.crawler.Response; 18 | import com.github.xbynet.crawler.Spider; 19 | import com.github.xbynet.crawler.http.Downloader; 20 | 21 | public class SeleniumDownloader implements Downloader { 22 | private static final Logger log = LoggerFactory 23 | .getLogger(SeleniumDownloader.class); 24 | private int sleepTime = 3000;// 3s 25 | private SeleniumAction action = null; 26 | private WebDriverPool webDriverPool; 27 | private Spider spider; 28 | 29 | public SeleniumDownloader(WebDriverPool webDriverPool) { 30 | this.webDriverPool = webDriverPool; 31 | } 32 | 33 | public SeleniumDownloader(int sleepTime, WebDriverPool pool) { 34 | this(sleepTime, pool, null); 35 | } 36 | 37 | public SeleniumDownloader(int sleepTime, WebDriverPool pool, 38 | SeleniumAction action) { 39 | this.sleepTime = sleepTime; 40 | this.action = action; 41 | this.webDriverPool = pool; 42 | } 43 | 44 | public void setOperator(SeleniumAction action) { 45 | this.action = action; 46 | } 47 | 48 | @Override 49 | public void download(Request request) { 50 | WebDriver webDriver; 51 | try { 52 | webDriver = webDriverPool.get(); 53 | } catch (InterruptedException e) { 54 | log.warn("interrupted", e); 55 | return; 56 | } 57 | log.info("downloading page " + request.getUrl()); 58 | Response resp = new Response(); 59 | resp.setRequest(request); 60 | resp.setRespType(Const.ResponseType.TEXT); 61 | try { 62 | webDriver.get(request.getUrl()); 63 | Thread.sleep(sleepTime); 64 | } catch (Exception e) { 65 | log.error("", e); 66 | webDriverPool.close(webDriver); 67 | return; 68 | } 69 | try { 70 | WebDriver.Options manage = webDriver.manage(); 71 | manage.window().maximize(); 72 | 
if (action != null) { 73 | action.execute(webDriver); 74 | } 75 | SeleniumAction reqAction = null; 76 | if (request.getExtras() != null 77 | && request.getExtras().containsKey("action")) { 78 | reqAction = (SeleniumAction) request.getExtras().get("action"); 79 | } 80 | if (reqAction != null) { 81 | reqAction.execute(webDriver); 82 | } 83 | 84 | WebElement webElement = webDriver.findElement(By.xpath("/html")); 85 | String content = webElement.getAttribute("outerHTML"); 86 | 87 | resp.setRaw(content); 88 | Map> headers = new HashMap>(); 89 | List cookielist = new ArrayList(1); 90 | cookielist.add(WindowUtil.getHttpCookieString(webDriver.manage() 91 | .getCookies())); 92 | headers.put("Set-Cookie", cookielist); 93 | resp.setHeaders(headers); 94 | 95 | getSpider().getProcessor().process(resp); 96 | } catch (Exception e) { 97 | log.error("", e); 98 | } finally { 99 | webDriverPool.returnToPool(webDriver); 100 | } 101 | } 102 | public Spider getSpider() { 103 | return spider; 104 | } 105 | 106 | public void setSpider(Spider spider) { 107 | this.spider = spider; 108 | } 109 | 110 | @Override 111 | public void close() throws IOException { 112 | webDriverPool.shutdown(); 113 | } 114 | 115 | @Override 116 | public void init() { 117 | 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/parser/JsoupParser.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.parser; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.nodes.Node; 10 | import org.jsoup.nodes.TextNode; 11 | import org.jsoup.select.Elements; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import com.github.xbynet.crawler.Const; 16 | 17 | public class JsoupParser implements Parser { 18 | private static final Logger log = LoggerFactory 19 | .getLogger(JsoupParser.class); 20 | 21 | private Document doc; 22 | 23 | public JsoupParser(String raw) { 24 | doc=Jsoup.parse(raw); 25 | } 26 | 27 | public String single(String cssSelector) { 28 | Elements els = getDoc().select(cssSelector); 29 | if (els == null || els.size() == 0) { 30 | log.warn("所选元素不存在" + cssSelector); 31 | return null; 32 | } 33 | return getValue(getDoc().select(cssSelector).get(0), null); 34 | } 35 | 36 | public String single(String cssSelector, String attrName) { 37 | Elements els = getDoc().select(cssSelector); 38 | if (els == null || els.size() == 0) { 39 | log.warn("所选元素不存在" + cssSelector); 40 | return null; 41 | } 42 | return getValue(getDoc().select(cssSelector).get(0), attrName); 43 | } 44 | 45 | public List list(String cssSelector) { 46 | List reslist = new ArrayList(); 47 | Elements els = getDoc().select(cssSelector); 48 | if (els == null || els.size() == 0) { 49 | log.warn("所选元素不存在" + cssSelector); 50 | return reslist; 51 | } 52 | for (Element e : els) { 53 | reslist.add(getValue(e, null)); 54 | } 55 | return reslist; 56 | } 57 | 58 | public List list(String cssSelector, String attrName) { 59 | List reslist = new ArrayList(); 60 | Elements els = getDoc().select(cssSelector); 61 | if (els == null || els.size() == 0) { 62 | log.warn("所选元素不存在" + cssSelector); 63 | return reslist; 64 | } 65 | for (Element e : els) { 66 | reslist.add(getValue(e, attrName)); 67 | } 68 | return reslist; 69 | } 70 | 71 | private String getValue(Element 
element, String attrName) { 72 | if (attrName == null) { 73 | return element.outerHtml(); 74 | } else if ("innerHtml".equalsIgnoreCase(attrName)) { 75 | return element.html(); 76 | } else if ("text".equalsIgnoreCase(attrName)) { 77 | return getText(element); 78 | } else if ("allText".equalsIgnoreCase(attrName)) { 79 | return element.text(); 80 | } else { 81 | return element.attr(attrName); 82 | } 83 | } 84 | 85 | protected String getText(Element element) { 86 | StringBuilder accum = new StringBuilder(); 87 | for (Node node : element.childNodes()) { 88 | if (node instanceof TextNode) { 89 | TextNode textNode = (TextNode) node; 90 | accum.append(textNode.text()); 91 | } 92 | } 93 | return accum.toString(); 94 | } 95 | 96 | public Element element(String cssSelector) { 97 | Elements els = getDoc().select(cssSelector); 98 | if (els == null || els.size() == 0) { 99 | log.warn("所选元素不存在" + cssSelector); 100 | return null; 101 | } 102 | return els.get(0); 103 | } 104 | 105 | public Elements elements(String cssSelector) { 106 | Elements els = getDoc().select(cssSelector); 107 | return els; 108 | } 109 | public String script(String cssSelector) { 110 | return single(cssSelector,Const.CssAttr.innerHtml.name()); 111 | } 112 | public List scripts(String cssSelector) { 113 | return list(cssSelector,Const.CssAttr.innerHtml.name()); 114 | } 115 | 116 | public Document getDoc() { 117 | return doc; 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /crawler-server/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.github.xbynet 6 | crawler-parent 7 | 0.3.0 8 | 9 | crawler-server 10 | jar 11 | 12 | 7.0.57 13 | compile 14 | 15 | 16 | 17 | com.github.xbynet 18 | crawler-core 19 | ${project.version} 20 | 21 | 22 | org.apache.tomcat.embed 23 | tomcat-embed-core 24 | ${tomcat.version} 25 | ${myscope} 26 | 27 | 28 | org.apache.tomcat.embed 29 | tomcat-embed-logging-juli 30 | ${tomcat.version} 31 | ${myscope} 32 | 33 | 34 | org.apache.tomcat.embed 35 | tomcat-embed-jasper 36 | ${tomcat.version} 37 | ${myscope} 38 | 39 | 40 | org.apache.tomcat 41 | tomcat-jasper 42 | ${tomcat.version} 43 | ${myscope} 44 | 45 | 46 | org.apache.tomcat 47 | tomcat-jasper-el 48 | ${tomcat.version} 49 | ${myscope} 50 | 51 | 52 | org.apache.tomcat 53 | tomcat-jsp-api 54 | ${tomcat.version} 55 | ${myscope} 56 | 57 | 58 | jstl 59 | jstl 60 | 1.2 61 | 62 | 63 | 64 | 65 | pkg 66 | 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-assembly-plugin 71 | 72 | 73 | jar-with-dependencies 74 | 75 | crawler-server-${project.version} 76 | 77 | 78 | com.github.xbynet.crawler.server.Main 79 | 80 | 81 | 82 | 83 | 84 | package 85 | 86 | single 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | crawler-server 97 | 98 | 99 | src/main/webapp 100 | META-INF/resources 101 | 102 | 103 | 104 | 105 | org.apache.maven.plugins 106 | maven-compiler-plugin 107 | 2.3.2 108 | true 109 | 110 | 1.7 111 | 1.7 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/HttpClientFactory.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.IOException; 4 | import java.security.KeyManagementException; 5 | import java.security.KeyStoreException; 6 | import java.security.NoSuchAlgorithmException; 7 | import java.security.cert.X509Certificate; 8 | 9 | 
import javax.net.ssl.SSLContext; 10 | 11 | import org.apache.http.HttpException; 12 | import org.apache.http.HttpRequest; 13 | import org.apache.http.HttpRequestInterceptor; 14 | import org.apache.http.config.Registry; 15 | import org.apache.http.config.RegistryBuilder; 16 | import org.apache.http.config.SocketConfig; 17 | import org.apache.http.conn.socket.ConnectionSocketFactory; 18 | import org.apache.http.conn.socket.PlainConnectionSocketFactory; 19 | import org.apache.http.conn.ssl.NoopHostnameVerifier; 20 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory; 21 | import org.apache.http.impl.client.CloseableHttpClient; 22 | import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; 23 | import org.apache.http.impl.client.HttpClients; 24 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; 25 | import org.apache.http.protocol.HttpContext; 26 | import org.apache.http.ssl.SSLContexts; 27 | import org.apache.http.ssl.TrustStrategy; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | 31 | 32 | public class HttpClientFactory { 33 | private static final Logger log=LoggerFactory.getLogger(HttpClientFactory.class); 34 | 35 | public CloseableHttpClient getClient(){ 36 | return getClient(30000, 3); 37 | } 38 | public CloseableHttpClient getClient(int timeout,int retry){ 39 | RegistryBuilder registryBuilder = RegistryBuilder.create(); 40 | registryBuilder.register("http", PlainConnectionSocketFactory.INSTANCE); 41 | // Fixing: https://code.google.com/p/crawler4j/issues/detail?id=174 42 | // By always trusting the ssl certificate 43 | SSLContext sslContext=null; 44 | try { 45 | sslContext = SSLContexts.custom().loadTrustMaterial(null, new TrustStrategy() { 46 | public boolean isTrusted(final X509Certificate[] chain, String authType) { 47 | return true; 48 | } 49 | }).build(); 50 | } catch (KeyManagementException e) { 51 | log.error("",e); 52 | } catch (NoSuchAlgorithmException e) { 53 | log.error("",e); 54 | } catch (KeyStoreException e) { 55 | log.error("",e); 56 | } 57 | SSLConnectionSocketFactory sslsf=new SSLConnectionSocketFactory(sslContext, NoopHostnameVerifier.INSTANCE); 58 | registryBuilder.register("https", sslsf); 59 | Registry registry = registryBuilder.build(); 60 | //设置连接管理器 61 | PoolingHttpClientConnectionManager poolingHttpClientConnectionManager = new PoolingHttpClientConnectionManager(registry); 62 | poolingHttpClientConnectionManager.setMaxTotal(500); 63 | poolingHttpClientConnectionManager.setDefaultMaxPerRoute(1000); 64 | 65 | SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); 66 | socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); 67 | socketConfigBuilder.setSoTimeout(timeout); 68 | SocketConfig socketConfig = socketConfigBuilder.build(); 69 | //构建客户端 70 | CloseableHttpClient client= HttpClients.custom().setConnectionManager(poolingHttpClientConnectionManager) 71 | .addInterceptorFirst(new HttpRequestInterceptor() { 72 | 73 | public void process( 74 | final HttpRequest request, 75 | final HttpContext context) throws HttpException, IOException { 76 | if (!request.containsHeader("Accept-Encoding")) { 77 | request.addHeader("Accept-Encoding", "gzip"); 78 | } 79 | } 80 | }) 81 | .setDefaultSocketConfig(socketConfig) 82 | .setRetryHandler(new DefaultHttpRequestRetryHandler(retry, true)) 83 | .build(); 84 | return client; 85 | } 86 | 87 | 88 | } 89 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/NeihanshequCrawler.java: 
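A minimal standalone sketch of using the HttpClientFactory above outside the Spider pipeline: the factory builds a pooled client that trusts all SSL certificates and asks for gzip. The demo class name and URL are illustrative, not part of the repository.

```java
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;

import com.github.xbynet.crawler.http.HttpClientFactory;

public class HttpClientFactoryDemo {
    public static void main(String[] args) throws Exception {
        // 15 s socket timeout, 2 retries; the no-arg getClient() defaults to 30000 ms / 3 retries.
        CloseableHttpClient client = new HttpClientFactory().getClient(15000, 2);
        HttpGet get = new HttpGet("https://github.com/xbynet");
        try (CloseableHttpResponse resp = client.execute(get)) {
            System.out.println("status: " + resp.getStatusLine().getStatusCode());
            System.out.println("body length: " + EntityUtils.toString(resp.getEntity(), "UTF-8").length());
        } finally {
            client.close();
        }
    }
}
```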
-------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.math.BigDecimal; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Date; 9 | import java.util.List; 10 | import java.util.concurrent.atomic.AtomicInteger; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | import org.apache.commons.io.FileUtils; 15 | import org.apache.commons.io.IOUtils; 16 | 17 | import com.github.xbynet.crawler.Processor; 18 | import com.github.xbynet.crawler.Response; 19 | import com.github.xbynet.crawler.Site; 20 | import com.github.xbynet.crawler.Spider; 21 | import com.github.xbynet.crawler.parser.JsonPathParser; 22 | import com.github.xbynet.crawler.parser.JsoupParser; 23 | 24 | public class NeihanshequCrawler extends Processor{ 25 | private static final int maxCount=100; 26 | private AtomicInteger count=new AtomicInteger(0); 27 | 28 | @Override 29 | public void process(Response resp) { 30 | String currentUrl=resp.getRequest().getUrl(); 31 | 32 | if(currentUrl.equals("http://neihanshequ.com/")){ 33 | JsoupParser parser=resp.html(); 34 | List scripts=parser.scripts("script"); 35 | for(String str:scripts){ 36 | if(str.contains("var gListViewConfig")){ 37 | Pattern p=Pattern.compile("max_time: '(.*?)',",Pattern.MULTILINE); 38 | Matcher m=p.matcher(str); 39 | if(m.find()){ 40 | String maxTime=m.group(1); 41 | if(maxTime.contains(".")){ 42 | maxTime=maxTime.split("\\.")[0]; 43 | } 44 | if(count.getAndIncrement()<=maxCount){ 45 | resp.addRequest("http://neihanshequ.com/joke/?is_json=1&app_name=neihanshequ_web&max_time="+maxTime, true); 46 | } 47 | return; 48 | } 49 | break; 50 | } 51 | } 52 | }else{ 53 | JsonPathParser parser=resp.json(); 54 | String maxTime=parser.single("$.data.max_time"); 55 | if(maxTime.contains("E")){ 56 | maxTime=new BigDecimal(maxTime).toPlainString(); 57 | } 58 | if(count.getAndIncrement()<=maxCount){ 59 | resp.addRequest("http://neihanshequ.com/joke/?is_json=1&app_name=neihanshequ_web&max_time="+maxTime, true); 60 | } 61 | StringBuilder sb=new StringBuilder(); 62 | int size=Integer.valueOf(parser.single("$.data.data.length()")); 63 | for(int i=0;i"+author+"查看"+content+"\n"); 68 | } 69 | appendToFile(sb.toString()); 70 | } 71 | } 72 | public void start() { 73 | Site site = new Site(); 74 | // site.setEncoding("UTF-8"); 75 | site.setHeader("Referer", "http://neihanshequ.com/"); 76 | Spider spider = Spider.builder(this).threadNum(1).site(site) 77 | .urls("http://neihanshequ.com/").build(); 78 | spider.run(); 79 | appendToFile(""); 80 | } 81 | public static void main(String[] args) { 82 | new NeihanshequCrawler().start(); 83 | } 84 | private synchronized void appendToFile(String content){ 85 | SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMdd-HH"); 86 | File f=new File("D:\\code\\test\\tweets\\"+sdf.format(new Date())+".neihanshequ.html"); 87 | if(!f.exists()){ 88 | try { 89 | f.createNewFile(); 90 | FileUtils.write(f, "","UTF-8"); 91 | } catch (IOException e) { 92 | e.printStackTrace(); 93 | } 94 | } 95 | FileWriter writer=null; 96 | try { 97 | writer=new FileWriter(f,true); 98 | writer.write(content); 99 | } catch (IOException e) { 100 | e.printStackTrace(); 101 | }finally{ 102 | IOUtils.closeQuietly(writer); 103 | } 104 | } 105 | 106 | } -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Response.java: 
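NeihanshequCrawler above pages through the joke endpoint by pulling max_time out of each JSON response with expressions such as $.data.max_time and $.data.data.length(). A hedged sketch of the same evaluation done directly with the com.jayway.jsonpath library declared in the parent pom; JsonPathParser presumably wraps equivalent calls, and the JSON literal and item paths below are invented for illustration.

```java
import java.util.List;

import com.jayway.jsonpath.JsonPath;

public class JsonPathDemo {
    public static void main(String[] args) {
        String json = "{\"data\":{\"max_time\":1495000000,"
                + "\"data\":[{\"group\":{\"text\":\"hi\"}},{\"group\":{\"text\":\"ho\"}}]}}";
        // Scalar read, as used for the paging cursor.
        Object maxTime = JsonPath.read(json, "$.data.max_time");
        // Array read; the crawler counts entries with $.data.data.length().
        List<Object> items = JsonPath.read(json, "$.data.data");
        // Nested read; this path shape belongs to the demo JSON, not to the real API.
        String firstText = JsonPath.read(json, "$.data.data[0].group.text");
        System.out.println(maxTime + " / " + items.size() + " / " + firstText);
    }
}
```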
-------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import com.github.xbynet.crawler.parser.JsonPathParser; 8 | import com.github.xbynet.crawler.parser.JsoupParser; 9 | import com.github.xbynet.crawler.parser.XpathParser; 10 | import com.github.xbynet.crawler.utils.BeanUtil; 11 | 12 | public class Response { 13 | private int code; 14 | private String contentType; 15 | private Map> headers; 16 | private Const.ResponseType respType; 17 | private String raw;//如果respType为Const.ResponseType.TEXT,则有值 18 | private byte[] bytes;//如果respType为Const.ResponseType.BIN,则有值 19 | private Request request; 20 | private List continueRequest; 21 | private Response parentResponse=null;//用于分块时 22 | 23 | public Response(){ 24 | 25 | } 26 | public Response(Response parent){ 27 | this.parentResponse=parent; 28 | } 29 | public JsoupParser html(){ 30 | return new JsoupParser(raw); 31 | } 32 | public JsoupParser xml(){ 33 | return new JsoupParser(raw); 34 | } 35 | public JsonPathParser json(){ 36 | //处理jsonp的情形 37 | if(!raw.startsWith("{")&&!raw.startsWith("[")){ 38 | raw=raw.substring(raw.indexOf("(")+1,raw.length()-1); 39 | } 40 | return new JsonPathParser(raw); 41 | } 42 | public XpathParser xpath(){ 43 | return new XpathParser(raw); 44 | } 45 | 46 | public String getRaw(){ 47 | return raw; 48 | } 49 | public Response setRaw(String raw) { 50 | this.raw = raw; 51 | return this; 52 | } 53 | public int getCode() { 54 | return code; 55 | } 56 | public Response setCode(int code) { 57 | this.code = code; 58 | return this; 59 | } 60 | public String getContentType() { 61 | return contentType; 62 | } 63 | public Response setContentType(String contentType) { 64 | this.contentType = contentType; 65 | return this; 66 | } 67 | public Map> getHeaders() { 68 | return headers; 69 | } 70 | public Response setHeaders(Map> headers) { 71 | this.headers = headers; 72 | return this; 73 | } 74 | public Const.ResponseType getRespType() { 75 | return respType; 76 | } 77 | public Response setRespType(Const.ResponseType respType) { 78 | this.respType = respType; 79 | return this; 80 | } 81 | public byte[] getBytes() { 82 | return bytes; 83 | } 84 | public Response setBytes(byte[] bytes) { 85 | this.bytes = bytes; 86 | return this; 87 | } 88 | public Request getRequest() { 89 | return request; 90 | } 91 | public Response setRequest(Request request) { 92 | this.request = request; 93 | return this; 94 | } 95 | 96 | public Response addRequest(String url,boolean copyParent){ 97 | if(continueRequest==null){ 98 | continueRequest=new ArrayList(); 99 | } 100 | Request req=new Request(); 101 | if(copyParent){ 102 | BeanUtil.copyProperties(request, req); 103 | } 104 | req.setUrl(url); 105 | continueRequest.add(req); 106 | return this; 107 | } 108 | public Response addRequest(Request req){ 109 | if(continueRequest==null){ 110 | continueRequest=new ArrayList(); 111 | } 112 | continueRequest.add(req); 113 | return this; 114 | } 115 | public List getContinueReqeusts(){ 116 | return continueRequest; 117 | } 118 | public Response addPartRequest(String url,boolean copyParent){ 119 | Request req=new Request(); 120 | if(copyParent){ 121 | //不支持分块嵌套分块 122 | if(parentResponse==null){ 123 | BeanUtil.copyProperties(request, req); 124 | }else{ 125 | BeanUtil.copyProperties(parentResponse.getRequest(),req); 126 | } 127 | } 128 | req.setUrl(url); 129 | req.setPartRequest(null); 130 | return this; 131 | } 
132 | public Response addPartRequest(Request req){ 133 | if(parentResponse==null){ 134 | request.getPartRequest().add(req); 135 | }else{ 136 | parentResponse.getRequest().getPartRequest().add(req); 137 | } 138 | return this; 139 | } 140 | public boolean isPartResponse(){ 141 | return parentResponse!=null; 142 | } 143 | public Response getParentResponse() { 144 | return parentResponse; 145 | } 146 | 147 | } 148 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Request.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.beans.Transient; 4 | import java.io.Serializable; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import org.apache.http.HttpEntity; 11 | import org.apache.http.client.protocol.HttpClientContext; 12 | 13 | import com.alibaba.fastjson.annotation.JSONField; 14 | 15 | public class Request implements Serializable{ 16 | private String url; 17 | private String encoding; 18 | private Const.HttpMethod method=Const.HttpMethod.GET; 19 | 20 | private int retrySleepTime=-1;//millis 21 | private int retryCount=-1;//millis 22 | 23 | private Map headers=new HashMap(); 24 | private Map params=new HashMap(); 25 | /**可以在添加请求时附加额外信息*/ 26 | private Map extras=new HashMap(); 27 | 28 | private transient HttpClientContext ctx; 29 | 30 | /** 31 | * support for json,xml or more,在post时,设置此选项会使params参数失效。 32 | */ 33 | private transient HttpEntity entity; 34 | 35 | private RequestAction action; 36 | 37 | /**支持存在分块请求的情形,(比如一篇文章需要翻多页抓取,歌手信息不分布在多个页面中)*/ 38 | private List partRequest=new ArrayList(); 39 | /**是否分块*/ 40 | private boolean supportPart=false; 41 | 42 | public Request(){ 43 | 44 | } 45 | public Request(String url){ 46 | this.url=url; 47 | } 48 | public Const.HttpMethod getMethod() { 49 | return method; 50 | } 51 | public Request setMethod(Const.HttpMethod method) { 52 | this.method = method; 53 | return this; 54 | } 55 | public Map getHeaders() { 56 | return headers; 57 | } 58 | public Request setHeader(String key,String value) { 59 | headers.put(key, value); 60 | return this; 61 | } 62 | public Map getParams() { 63 | return params; 64 | } 65 | public Request setParams(String key,String value) { 66 | params.put(key, value); 67 | return this; 68 | } 69 | public Map getExtras() { 70 | return extras; 71 | } 72 | public Request setExtras(Map extras) { 73 | this.extras=extras; 74 | return this; 75 | } 76 | public Request putExtra(String key,String value) { 77 | extras.put(key, value); 78 | return this; 79 | } 80 | 81 | public HttpClientContext getCtx() { 82 | return ctx; 83 | } 84 | public Request setCtx(HttpClientContext ctx) { 85 | this.ctx = ctx; 86 | return this; 87 | } 88 | 89 | public HttpEntity getEntity() { 90 | return entity; 91 | } 92 | public Request setEntity(HttpEntity entity) { 93 | this.entity = entity; 94 | return this; 95 | } 96 | public String getEncoding() { 97 | return encoding; 98 | } 99 | public Request setEncoding(String encoding) { 100 | this.encoding = encoding; 101 | return this; 102 | } 103 | 104 | public int getRetryCount() { 105 | return retryCount; 106 | } 107 | public Request setRetryCount(int retryCount) { 108 | this.retryCount = retryCount; 109 | return this; 110 | } 111 | public int getRetrySleepTime() { 112 | return retrySleepTime; 113 | } 114 | public Request setRetrySleepTime(int retrySleepTime) { 115 | 
this.retrySleepTime = retrySleepTime; 116 | return this; 117 | } 118 | public RequestAction getAction() { 119 | return action; 120 | } 121 | public Request setAction(RequestAction action) { 122 | this.action = action; 123 | return this; 124 | } 125 | public String getUrl() { 126 | return url; 127 | } 128 | public void setUrl(String url) { 129 | this.url = url; 130 | } 131 | 132 | public List getPartRequest() { 133 | return partRequest; 134 | } 135 | public Request setPartRequest(List list) { 136 | this.partRequest=list; 137 | return this; 138 | } 139 | public void addPartRequest(Request req) { 140 | this.partRequest.add(req); 141 | supportPart=true; 142 | } 143 | 144 | @Override 145 | public String toString() { 146 | return "Request [url=" + url + ", encoding=" + encoding + ", method=" 147 | + method + ", retrySleepTime=" + retrySleepTime 148 | + ", retryCount=" + retryCount + ", headers=" + headers 149 | + ", params=" + params + ", extras=" + extras + ", ctx=" + ctx 150 | + ", entity=" + entity + ", action=" + action + "]"; 151 | } 152 | public boolean isSupportPart() { 153 | return supportPart; 154 | } 155 | 156 | } 157 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/scheduler/RedisScheduler.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.scheduler; 2 | 3 | import java.io.UnsupportedEncodingException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.concurrent.ConcurrentHashMap; 8 | 9 | import org.apache.commons.codec.digest.DigestUtils; 10 | import org.apache.commons.lang3.SerializationUtils; 11 | import org.apache.http.HttpEntity; 12 | import org.apache.http.client.protocol.HttpClientContext; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | import redis.clients.jedis.Jedis; 17 | import redis.clients.jedis.JedisPool; 18 | import redis.clients.jedis.JedisPoolConfig; 19 | 20 | import com.alibaba.fastjson.JSON; 21 | import com.github.xbynet.crawler.Const; 22 | import com.github.xbynet.crawler.ISpider; 23 | import com.github.xbynet.crawler.Request; 24 | import com.github.xbynet.crawler.RequestAction; 25 | /** 26 | * Use Redis as url scheduler for distributed crawlers.
27 | * 28 | * @author code4crafter@gmail.com
29 | * @since 0.2.0 30 | */ 31 | public class RedisScheduler implements Scheduler, DuplicateRemover { 32 | private Logger log=LoggerFactory.getLogger(RedisScheduler.class); 33 | 34 | protected JedisPool pool; 35 | 36 | private static final String QUEUE_PREFIX = "queue_"; 37 | 38 | private static final String SET_PREFIX = "set_"; 39 | 40 | private static final String ITEM_PREFIX = "item_"; 41 | 42 | 43 | public RedisScheduler(String host) { 44 | this(new JedisPool(new JedisPoolConfig(), host)); 45 | } 46 | 47 | public RedisScheduler(JedisPool pool) { 48 | this.pool = pool; 49 | } 50 | 51 | @Override 52 | public void resetDuplicateCheck(ISpider spider) { 53 | Jedis jedis = pool.getResource(); 54 | try { 55 | jedis.del(getSetKey(spider)); 56 | } finally { 57 | jedis.close(); 58 | } 59 | } 60 | 61 | @Override 62 | public boolean isDuplicate(Request request, ISpider spider) { 63 | Jedis jedis = pool.getResource(); 64 | try { 65 | return jedis.sadd(getSetKey(spider), request.getUrl()) > 0; 66 | } finally { 67 | jedis.close(); 68 | } 69 | 70 | } 71 | 72 | @Override 73 | public void push(Request request, ISpider spider) { 74 | Jedis jedis = pool.getResource(); 75 | if (Const.HttpMethod.POST == request.getMethod() 76 | || !isDuplicate(request, spider)) { 77 | log.debug("push to queue {}", request.getUrl()); 78 | try { 79 | jedis.rpush(getQueueKey(spider), request.getUrl()); 80 | String field = DigestUtils.md5Hex(request.getUrl()); 81 | byte[] data=SerializationUtils.serialize(request); 82 | jedis.hset((ITEM_PREFIX + spider.getName()).getBytes(), field.getBytes(), data); 83 | } finally { 84 | jedis.close(); 85 | } 86 | } 87 | } 88 | 89 | @Override 90 | public synchronized Request poll(ISpider spider) { 91 | Jedis jedis = pool.getResource(); 92 | try { 93 | String url = jedis.lpop(getQueueKey(spider)); 94 | if (url == null) { 95 | return null; 96 | } 97 | String key = ITEM_PREFIX + spider.getName(); 98 | String field = DigestUtils.md5Hex(url); 99 | byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); 100 | Request request=SerializationUtils.deserialize(bytes); 101 | return request; 102 | } finally { 103 | jedis.close(); 104 | } 105 | } 106 | 107 | protected String getSetKey(ISpider spider) { 108 | return SET_PREFIX + spider.getName(); 109 | } 110 | 111 | protected String getQueueKey(ISpider spider) { 112 | return QUEUE_PREFIX + spider.getName(); 113 | } 114 | 115 | protected String getItemKey(ISpider spider) 116 | { 117 | return ITEM_PREFIX + spider.getName(); 118 | } 119 | 120 | @Override 121 | public int getLeftRequestsCount(ISpider spider) { 122 | Jedis jedis = pool.getResource(); 123 | try { 124 | Long size = jedis.llen(getQueueKey(spider)); 125 | return size.intValue(); 126 | } finally { 127 | jedis.close(); 128 | } 129 | } 130 | 131 | @Override 132 | public int getTotalRequestsCount(ISpider spider) { 133 | Jedis jedis = pool.getResource(); 134 | try { 135 | Long size = jedis.scard(getSetKey(spider)); 136 | return size.intValue(); 137 | } finally { 138 | jedis.close(); 139 | } 140 | } 141 | 142 | 143 | @Override 144 | public DuplicateRemover getDuplicateRemover() { 145 | return this; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/PhantomjsWebDriverPool.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Set; 5 | import 
java.util.concurrent.BlockingDeque; 6 | import java.util.concurrent.LinkedBlockingDeque; 7 | import java.util.concurrent.TimeUnit; 8 | import java.util.concurrent.atomic.AtomicBoolean; 9 | import java.util.concurrent.atomic.AtomicInteger; 10 | 11 | import org.openqa.selenium.WebDriver; 12 | import org.openqa.selenium.phantomjs.PhantomJSDriver; 13 | import org.openqa.selenium.phantomjs.PhantomJSDriverService; 14 | import org.openqa.selenium.remote.DesiredCapabilities; 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | /** 19 | * @author taojw 20 | */ 21 | public class PhantomjsWebDriverPool implements WebDriverPool { 22 | private Logger logger = LoggerFactory.getLogger(getClass()); 23 | 24 | private int CAPACITY = 5; 25 | private AtomicInteger refCount = new AtomicInteger(0); 26 | private static final String DRIVER_PHANTOMJS = "phantomjs"; 27 | 28 | /** 29 | * store webDrivers available 30 | */ 31 | private BlockingDeque innerQueue = new LinkedBlockingDeque( 32 | CAPACITY); 33 | 34 | private AtomicBoolean shutdowned = new AtomicBoolean(false); 35 | 36 | private String PHANTOMJS_PATH; 37 | private DesiredCapabilities caps = DesiredCapabilities.phantomjs(); 38 | 39 | public PhantomjsWebDriverPool(String phantomjsPath) { 40 | this(5, false, phantomjsPath); 41 | } 42 | 43 | /** 44 | * 45 | * @param poolsize 46 | * @param loadImg 47 | * 是否加载图片,默认不加载 48 | */ 49 | public PhantomjsWebDriverPool(int poolsize, boolean loadImg, 50 | String phantomjsPath) { 51 | this.CAPACITY = poolsize; 52 | innerQueue = new LinkedBlockingDeque(poolsize); 53 | PHANTOMJS_PATH = phantomjsPath; 54 | caps.setJavascriptEnabled(true); 55 | caps.setCapability("webStorageEnabled", true); 56 | caps.setCapability( 57 | PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, 58 | PHANTOMJS_PATH); 59 | // caps.setCapability("takesScreenshot", false); 60 | caps.setCapability( 61 | PhantomJSDriverService.PHANTOMJS_PAGE_CUSTOMHEADERS_PREFIX 62 | + "User-Agent", 63 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"); 64 | ArrayList cliArgsCap = new ArrayList(); 65 | // http://phantomjs.org/api/command-line.html 66 | cliArgsCap.add("--web-security=false"); 67 | cliArgsCap.add("--ssl-protocol=any"); 68 | cliArgsCap.add("--ignore-ssl-errors=true"); 69 | if (loadImg) { 70 | cliArgsCap.add("--load-images=true"); 71 | } else { 72 | cliArgsCap.add("--load-images=false"); 73 | } 74 | caps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, 75 | cliArgsCap); 76 | caps.setCapability( 77 | PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS, 78 | new String[] { "--logLevel=INFO" }); 79 | 80 | } 81 | 82 | public WebDriver get() throws InterruptedException { 83 | WebDriver poll = innerQueue.poll(); 84 | if (poll != null) { 85 | return poll; 86 | } 87 | if (refCount.get() < CAPACITY) { 88 | synchronized (innerQueue) { 89 | if (refCount.get() < CAPACITY) { 90 | 91 | WebDriver mDriver = new PhantomJSDriver(caps); 92 | // 尝试性解决:https://github.com/ariya/phantomjs/issues/11526问题 93 | mDriver.manage().timeouts() 94 | .pageLoadTimeout(60, TimeUnit.SECONDS); 95 | // mDriver.manage().window().setSize(new Dimension(1366, 96 | // 768)); 97 | innerQueue.add(mDriver); 98 | refCount.incrementAndGet(); 99 | } 100 | } 101 | } 102 | return innerQueue.take(); 103 | } 104 | 105 | public void returnToPool(WebDriver webDriver) { 106 | if (shutdowned.get()) { 107 | webDriver.quit(); 108 | webDriver = null; 109 | } else { 110 | Set handles = webDriver.getWindowHandles(); 
111 | if (handles.size() > 1) { 112 | int index = 0; 113 | for (String handle : handles) { 114 | if (index == 0) { 115 | index++; 116 | continue; 117 | } 118 | WindowUtil.changeWindowTo(webDriver, handle); 119 | webDriver.close(); 120 | index++; 121 | } 122 | } 123 | synchronized (shutdowned) { 124 | if(!shutdowned.get()){ 125 | innerQueue.add(webDriver); 126 | }else{ 127 | webDriver.quit(); 128 | webDriver = null; 129 | } 130 | } 131 | } 132 | } 133 | 134 | public void close(WebDriver webDriver) { 135 | refCount.decrementAndGet(); 136 | webDriver.quit(); 137 | webDriver = null; 138 | } 139 | 140 | public void shutdown() { 141 | synchronized (shutdowned) { 142 | shutdowned.set(true); 143 | } 144 | try { 145 | for (WebDriver driver : innerQueue) { 146 | close(driver); 147 | } 148 | innerQueue.clear(); 149 | refCount.set(0); 150 | } catch (Exception e) { 151 | logger.warn("webdriverpool关闭失败", e); 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /crawler-core/src/test/java/net/xby1993/crawler/GithubCrawler.java: -------------------------------------------------------------------------------- 1 | package net.xby1993.crawler; 2 | 3 | import java.nio.file.Paths; 4 | import java.util.List; 5 | import java.util.Map; 6 | import java.util.UUID; 7 | 8 | import org.apache.commons.io.IOUtils; 9 | import org.apache.http.client.methods.CloseableHttpResponse; 10 | import org.apache.http.client.methods.HttpUriRequest; 11 | import org.apache.http.client.protocol.HttpClientContext; 12 | import org.apache.http.impl.client.BasicCookieStore; 13 | import org.apache.http.impl.client.CloseableHttpClient; 14 | 15 | import com.github.xbynet.crawler.Const; 16 | import com.github.xbynet.crawler.Processor; 17 | import com.github.xbynet.crawler.Request; 18 | import com.github.xbynet.crawler.RequestAction; 19 | import com.github.xbynet.crawler.Response; 20 | import com.github.xbynet.crawler.Site; 21 | import com.github.xbynet.crawler.Spider; 22 | import com.github.xbynet.crawler.http.DefaultDownloader; 23 | import com.github.xbynet.crawler.http.FileDownloader; 24 | import com.github.xbynet.crawler.http.HttpClientFactory; 25 | import com.github.xbynet.crawler.parser.JsoupParser; 26 | import com.github.xbynet.crawler.scheduler.DefaultScheduler; 27 | 28 | public class GithubCrawler extends Processor { 29 | @Override 30 | public void process(Response resp) { 31 | String currentUrl = resp.getRequest().getUrl(); 32 | System.out.println("CurrentUrl:" + currentUrl); 33 | int respCode = resp.getCode(); 34 | System.out.println("ResponseCode:" + respCode); 35 | System.out.println("type:" + resp.getRespType().name()); 36 | String contentType = resp.getContentType(); 37 | System.out.println("ContentType:" + contentType); 38 | Map> headers = resp.getHeaders(); 39 | System.out.println("ResonseHeaders:"); 40 | for (String key : headers.keySet()) { 41 | List values=headers.get(key); 42 | for(String str:values){ 43 | System.out.println(key + ":" +str); 44 | } 45 | } 46 | JsoupParser parser = resp.html(); 47 | // suppport parted ,分块抓取是会有个parent response来关联所有分块response 48 | // System.out.println("isParted:"+resp.isPartResponse()); 49 | // Response parent=resp.getParentResponse(); 50 | // resp.addPartRequest(null); 51 | //Map extras=resp.getRequest().getExtras(); 52 | 53 | if (currentUrl.equals("https://github.com/xbynet")) { 54 | String avatar = parser.single("img.avatar", "src"); 55 | String dir = System.getProperty("java.io.tmpdir"); 56 | String savePath = Paths.get(dir, 
UUID.randomUUID().toString()) 57 | .toString(); 58 | boolean avatarDownloaded = download(avatar, savePath); 59 | System.out.println("avatar:" + avatar + ", saved:" + savePath); 60 | // System.out.println("avtar downloaded status:"+avatarDownloaded); 61 | String name = parser.single(".vcard-names > .vcard-fullname", 62 | "text"); 63 | System.out.println("name:" + name); 64 | List reponames = parser.list( 65 | ".pinned-repos-list .repo.js-repo", "text"); 66 | List repoUrls = parser.list( 67 | ".pinned-repo-item .d-block >a", "href"); 68 | System.out.println("reponame:url"); 69 | if (reponames != null) { 70 | for (int i = 0; i < reponames.size(); i++) { 71 | String tmpUrl="https://github.com"+repoUrls.get(i); 72 | System.out.println(reponames.get(i) + ":"+tmpUrl); 73 | Request req=new Request(tmpUrl).putExtra("name", reponames.get(i)); 74 | resp.addRequest(req); 75 | } 76 | } 77 | }else{ 78 | Map extras=resp.getRequest().getExtras(); 79 | String name=extras.get("name").toString(); 80 | System.out.println("repoName:"+name); 81 | String shortDesc=parser.single(".repository-meta-content","allText"); 82 | System.out.println("shortDesc:"+shortDesc); 83 | } 84 | } 85 | 86 | public void start() { 87 | Site site = new Site(); 88 | Spider spider = Spider.builder(this).threadNum(5).site(site) 89 | .urls("https://github.com/xbynet").build(); 90 | spider.run(); 91 | } 92 | 93 | public void startCompleteConfig() { 94 | String pcUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; 95 | String androidUA = "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"; 96 | 97 | Site site = new Site(); 98 | site.setEncoding("UTF-8").setHeader("Referer", "https://github.com/") 99 | .setRetry(3).setRetrySleep(3000).setSleep(50).setTimeout(30000) 100 | .setUa(pcUA); 101 | 102 | Request request = new Request("https://github.com/xbynet"); 103 | HttpClientContext ctx = new HttpClientContext(); 104 | BasicCookieStore cookieStore = new BasicCookieStore(); 105 | ctx.setCookieStore(cookieStore); 106 | request.setAction(new RequestAction() { 107 | @Override 108 | public void before(CloseableHttpClient client, HttpUriRequest req) { 109 | System.out.println("before-haha"); 110 | } 111 | 112 | @Override 113 | public void after(CloseableHttpClient client, 114 | CloseableHttpResponse resp) { 115 | System.out.println("after-haha"); 116 | } 117 | }).setCtx(ctx).setEncoding("GBK") 118 | .putExtra("somekey", "我是可以在response中使用的extras哦") 119 | .setHeader("User-Agent", pcUA).setMethod(Const.HttpMethod.GET) 120 | .setPartRequest(null).setEntity(null) 121 | .setParams("appkeyqqqqqq", "1213131232141").setRetryCount(5) 122 | .setRetrySleepTime(10000); 123 | 124 | Spider spider = Spider.builder(this).threadNum(5) 125 | .name("Spider-github-xbynet") 126 | .defaultDownloader(new DefaultDownloader()) 127 | .fileDownloader(new FileDownloader()) 128 | .httpClientFactory(new HttpClientFactory()).ipProvider(null) 129 | .listener(null).pool(null).scheduler(new DefaultScheduler()) 130 | .shutdownOnComplete(true).site(site).build(); 131 | spider.run(); 132 | } 133 | 134 | public static void main(String[] args) { 135 | new GithubCrawler().start(); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /crawler-server/src/main/java/com/github/xbynet/crawler/server/demo/GithubCrawler.java: 
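The GithubCrawler test above also demonstrates the extras round-trip: putExtra(...) attaches data to a follow-up Request, and the value is read back from getExtras() when that request's Response is processed. A stripped-down sketch of just that pattern, assuming process(Response) is the only method a Processor subclass must implement; the class name and URLs are illustrative.

```java
import com.github.xbynet.crawler.Processor;
import com.github.xbynet.crawler.Request;
import com.github.xbynet.crawler.Response;
import com.github.xbynet.crawler.Site;
import com.github.xbynet.crawler.Spider;

public class ExtrasDemo extends Processor {
    @Override
    public void process(Response resp) {
        if (resp.getRequest().getExtras().containsKey("label")) {
            // Second pass: the value attached below travelled with the queued request.
            System.out.println("label=" + resp.getRequest().getExtras().get("label"));
            return;
        }
        // First pass: queue a follow-up request carrying extra data.
        Request next = new Request("https://github.com/xbynet/crawler").putExtra("label", "repo page");
        resp.addRequest(next);
    }

    public static void main(String[] args) {
        Spider.builder(new ExtrasDemo()).threadNum(1).site(new Site())
                .urls("https://github.com/xbynet").build().run();
    }
}
```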
-------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.server.demo; 2 | 3 | import java.nio.file.Paths; 4 | import java.util.List; 5 | import java.util.Map; 6 | import java.util.UUID; 7 | 8 | import org.apache.http.client.methods.CloseableHttpResponse; 9 | import org.apache.http.client.methods.HttpUriRequest; 10 | import org.apache.http.client.protocol.HttpClientContext; 11 | import org.apache.http.impl.client.BasicCookieStore; 12 | import org.apache.http.impl.client.CloseableHttpClient; 13 | 14 | import com.github.xbynet.crawler.Const; 15 | import com.github.xbynet.crawler.Processor; 16 | import com.github.xbynet.crawler.Request; 17 | import com.github.xbynet.crawler.RequestAction; 18 | import com.github.xbynet.crawler.Response; 19 | import com.github.xbynet.crawler.Site; 20 | import com.github.xbynet.crawler.Spider; 21 | import com.github.xbynet.crawler.http.DefaultDownloader; 22 | import com.github.xbynet.crawler.http.FileDownloader; 23 | import com.github.xbynet.crawler.http.HttpClientFactory; 24 | import com.github.xbynet.crawler.parser.JsoupParser; 25 | import com.github.xbynet.crawler.scheduler.DefaultScheduler; 26 | 27 | public class GithubCrawler extends Processor { 28 | @Override 29 | public void process(Response resp) { 30 | String currentUrl = resp.getRequest().getUrl(); 31 | System.out.println("CurrentUrl:" + currentUrl); 32 | int respCode = resp.getCode(); 33 | System.out.println("ResponseCode:" + respCode); 34 | System.out.println("type:" + resp.getRespType().name()); 35 | String contentType = resp.getContentType(); 36 | System.out.println("ContentType:" + contentType); 37 | Map> headers = resp.getHeaders(); 38 | System.out.println("ResonseHeaders:"); 39 | for (String key : headers.keySet()) { 40 | List values=headers.get(key); 41 | for(String str:values){ 42 | System.out.println(key + ":" +str); 43 | } 44 | } 45 | JsoupParser parser = resp.html(); 46 | // suppport parted ,分块抓取是会有个parent response来关联所有分块response 47 | // System.out.println("isParted:"+resp.isPartResponse()); 48 | // Response parent=resp.getParentResponse(); 49 | // resp.addPartRequest(null); 50 | //Map extras=resp.getRequest().getExtras(); 51 | 52 | if (currentUrl.equals("https://github.com/xbynet")) { 53 | String avatar = parser.single("img.avatar", "src"); 54 | String dir = System.getProperty("java.io.tmpdir"); 55 | String savePath = Paths.get(dir, UUID.randomUUID().toString()) 56 | .toString(); 57 | boolean avatarDownloaded = download(avatar, savePath); 58 | System.out.println("avatar:" + avatar + ", saved:" + savePath); 59 | // System.out.println("avtar downloaded status:"+avatarDownloaded); 60 | String name = parser.single(".vcard-names > .vcard-fullname", 61 | "text"); 62 | System.out.println("name:" + name); 63 | List reponames = parser.list( 64 | ".pinned-repos-list .repo.js-repo", "text"); 65 | List repoUrls = parser.list( 66 | ".pinned-repo-item .d-block >a", "href"); 67 | System.out.println("reponame:url"); 68 | if (reponames != null) { 69 | for (int i = 0; i < reponames.size(); i++) { 70 | String tmpUrl="https://github.com"+repoUrls.get(i); 71 | System.out.println(reponames.get(i) + ":"+tmpUrl); 72 | Request req=new Request(tmpUrl).putExtra("name", reponames.get(i)); 73 | resp.addRequest(req); 74 | } 75 | } 76 | }else{ 77 | Map extras=resp.getRequest().getExtras(); 78 | String name=extras.get("name").toString(); 79 | System.out.println("repoName:"+name); 80 | String shortDesc=parser.single(".repository-meta-content","allText"); 81 | 
System.out.println("shortDesc:"+shortDesc); 82 | } 83 | } 84 | 85 | public Spider createSpider() { 86 | Site site = new Site(); 87 | Spider spider = Spider.builder(this).threadNum(5).site(site) 88 | .urls("https://github.com/xbynet").build(); 89 | // spider.run(); 90 | return spider; 91 | } 92 | 93 | public void startCompleteConfig() { 94 | String pcUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; 95 | String androidUA = "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"; 96 | 97 | Site site = new Site(); 98 | site.setEncoding("UTF-8").setHeader("Referer", "https://github.com/") 99 | .setRetry(3).setRetrySleep(3000).setSleep(50).setTimeout(30000) 100 | .setUa(pcUA); 101 | 102 | Request request = new Request("https://github.com/xbynet"); 103 | HttpClientContext ctx = new HttpClientContext(); 104 | BasicCookieStore cookieStore = new BasicCookieStore(); 105 | ctx.setCookieStore(cookieStore); 106 | request.setAction(new RequestAction() { 107 | @Override 108 | public void before(CloseableHttpClient client, HttpUriRequest req) { 109 | System.out.println("before-haha"); 110 | } 111 | 112 | @Override 113 | public void after(CloseableHttpClient client, 114 | CloseableHttpResponse resp) { 115 | System.out.println("after-haha"); 116 | } 117 | }).setCtx(ctx).setEncoding("GBK") 118 | .putExtra("somekey", "我是可以在response中使用的extras哦") 119 | .setHeader("User-Agent", pcUA).setMethod(Const.HttpMethod.GET) 120 | .setPartRequest(null).setEntity(null) 121 | .setParams("appkeyqqqqqq", "1213131232141").setRetryCount(5) 122 | .setRetrySleepTime(10000); 123 | 124 | Spider spider = Spider.builder(this).threadNum(5) 125 | .name("Spider-github-xbynet") 126 | .defaultDownloader(new DefaultDownloader()) 127 | .fileDownloader(new FileDownloader()) 128 | .httpClientFactory(new HttpClientFactory()).ipProvider(null) 129 | .listener(null).pool(null).scheduler(new DefaultScheduler()) 130 | .shutdownOnComplete(true).site(site).build(); 131 | spider.run(); 132 | } 133 | 134 | public static void main(String[] args) { 135 | // new GithubCrawler().start(); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawler 2 | A simple and flexible web crawler framework for java. 
3 | 4 | ## Features: 5 | 1、Code is easy to understand and customized (代码简单易懂,可定制性强) 6 | 2、Api is simple and easy to use 7 | 3、Support File download、Content part fetch.(支持文件下载、分块抓取)     8 | 4、Request And Response support much options、strong customizable.(请求和响应支持的内容和选项比较丰富、每个请求可定制性强) 9 | 5、Support do your own operation before or after network request in downloader(支持网络请求前后执行自定义操作)       10 | 6、Selenium+PhantomJS support   11 | 7、Redis support 12 | 13 | ## Future: 14 | 1、Complete the code comment and test(完善代码注释和完善测试代码)   15 | 16 | ## Install: 17 | 18 | The only module that must be added is crawler-core 19 | ```xml 20 | 21 | com.github.xbynet 22 | crawler-core 23 | 0.3.0 24 | 29 | com.github.xbynet 30 | crawler-selenium 31 | 0.3.0 32 | > headers = resp.getHeaders(); 57 | System.out.println("ResonseHeaders:"); 58 | for (String key : headers.keySet()) { 59 | List values=headers.get(key); 60 | for(String str:values){ 61 | System.out.println(key + ":" +str); 62 | } 63 | } 64 | JsoupParser parser = resp.html(); 65 | // suppport parted ,分块抓取是会有个parent response来关联所有分块response 66 | // System.out.println("isParted:"+resp.isPartResponse()); 67 | // Response parent=resp.getParentResponse(); 68 | // resp.addPartRequest(null); 69 | //Map extras=resp.getRequest().getExtras(); 70 | 71 | if (currentUrl.equals("https://github.com/xbynet")) { 72 | String avatar = parser.single("img.avatar", "src"); 73 | String dir = System.getProperty("java.io.tmpdir"); 74 | String savePath = Paths.get(dir, UUID.randomUUID().toString()) 75 | .toString(); 76 | boolean avatarDownloaded = download(avatar, savePath); 77 | System.out.println("avatar:" + avatar + ", saved:" + savePath); 78 | // System.out.println("avtar downloaded status:"+avatarDownloaded); 79 | String name = parser.single(".vcard-names > .vcard-fullname", 80 | "text"); 81 | System.out.println("name:" + name); 82 | List reponames = parser.list( 83 | ".pinned-repos-list .repo.js-repo", "text"); 84 | List repoUrls = parser.list( 85 | ".pinned-repo-item .d-block >a", "href"); 86 | System.out.println("reponame:url"); 87 | if (reponames != null) { 88 | for (int i = 0; i < reponames.size(); i++) { 89 | String tmpUrl="https://github.com"+repoUrls.get(i); 90 | System.out.println(reponames.get(i) + ":"+tmpUrl); 91 | Request req=new Request(tmpUrl).putExtra("name", reponames.get(i)); 92 | resp.addRequest(req); 93 | } 94 | } 95 | }else{ 96 | Map extras=resp.getRequest().getExtras(); 97 | String name=extras.get("name").toString(); 98 | System.out.println("repoName:"+name); 99 | String shortDesc=parser.single(".repository-meta-content","allText"); 100 | System.out.println("shortDesc:"+shortDesc); 101 | } 102 | } 103 | 104 | public void start() { 105 | Site site = new Site(); 106 | Spider spider = Spider.builder(this).threadNum(5).site(site) 107 | .urls("https://github.com/xbynet").build(); 108 | spider.run(); 109 | } 110 | 111 | public static void main(String[] args) { 112 | new GithubCrawler().start(); 113 | } 114 | 115 | 116 | public void startCompleteConfig() { 117 | String pcUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; 118 | String androidUA = "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"; 119 | 120 | Site site = new Site(); 121 | site.setEncoding("UTF-8").setHeader("Referer", "https://github.com/") 122 | .setRetry(3).setRetrySleep(3000).setSleep(50).setTimeout(30000) 123 | .setUa(pcUA); 124 | 125 | 
Request request = new Request("https://github.com/xbynet"); 126 | HttpClientContext ctx = new HttpClientContext(); 127 | BasicCookieStore cookieStore = new BasicCookieStore(); 128 | ctx.setCookieStore(cookieStore); 129 | request.setAction(new RequestAction() { 130 | @Override 131 | public void before(CloseableHttpClient client, HttpUriRequest req) { 132 | System.out.println("before-haha"); 133 | } 134 | 135 | @Override 136 | public void after(CloseableHttpClient client, 137 | CloseableHttpResponse resp) { 138 | System.out.println("after-haha"); 139 | } 140 | }).setCtx(ctx).setEncoding("UTF-8") 141 | .putExtra("somekey", "I can use in the response by your own") 142 | .setHeader("User-Agent", pcUA).setMethod(Const.HttpMethod.GET) 143 | .setPartRequest(null).setEntity(null) 144 | .setParams("appkeyqqqqqq", "1213131232141").setRetryCount(5) 145 | .setRetrySleepTime(10000); 146 | 147 | Spider spider = Spider.builder(this).threadNum(5) 148 | .name("Spider-github-xbynet") 149 | .defaultDownloader(new DefaultDownloader()) 150 | .fileDownloader(new FileDownloader()) 151 | .httpClientFactory(new HttpClientFactory()).ipProvider(null) 152 | .listener(null).pool(null).scheduler(new DefaultScheduler()) 153 | .shutdownOnComplete(true).site(site).build(); 154 | spider.run(); 155 | } 156 | 157 | 158 | } 159 | 160 | ``` 161 | ## Examples: 162 | 163 | - Github(github个人项目信息) 164 | - OSChinaTweets(开源中国动弹) 165 | - Qiushibaike(醜事百科) 166 | - Neihanshequ(内涵段子) 167 | - ZihuRecommend(知乎推荐) 168 | 169 | **More Examples:** Please see [here](https://github.com/xbynet/crawler/tree/master/crawler-core/src/test/java/net/xby1993/crawler) 170 | 171 | ## Thanks: 172 | [webmagic](https://github.com/code4craft/webmagic):本项目借鉴了webmagic多处代码,设计上也作了较多参考,非常感谢。 173 | [xsoup](https://github.com/code4craft/xsoup):本项目使用xsoup作为底层xpath处理器   174 | [JsonPath](https://github.com/json-path/JsonPath):本项目使用JsonPath作为底层jsonpath处理器 175 | [Jsoup](https://jsoup.org/) 本项目使用Jsoup作为底层HTML/XML处理器 176 | [HttpClient](http://hc.apache.org/) 本项目使用HttpClient作为底层网络请求工具 177 | -------------------------------------------------------------------------------- /crawler-selenium/src/main/java/com/github/xbynet/crawler/selenium/WindowUtil.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.selenium; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.util.Set; 7 | import java.util.concurrent.TimeUnit; 8 | 9 | import org.apache.commons.io.FileUtils; 10 | import org.apache.commons.io.IOUtils; 11 | import org.openqa.selenium.By; 12 | import org.openqa.selenium.Cookie; 13 | import org.openqa.selenium.Dimension; 14 | import org.openqa.selenium.JavascriptExecutor; 15 | import org.openqa.selenium.NoSuchElementException; 16 | import org.openqa.selenium.OutputType; 17 | import org.openqa.selenium.TakesScreenshot; 18 | import org.openqa.selenium.WebDriver; 19 | import org.openqa.selenium.WebElement; 20 | import org.openqa.selenium.interactions.Actions; 21 | import org.slf4j.Logger; 22 | import org.slf4j.LoggerFactory; 23 | 24 | 25 | /** 26 | * @author taojw 27 | * 28 | */ 29 | public class WindowUtil { 30 | private static final Logger log=LoggerFactory.getLogger(WindowUtil.class); 31 | /** 32 | * 窗口最大化 33 | * @param driver 34 | */ 35 | public static void maximize(WebDriver driver){ 36 | WebDriver.Options manage = driver.manage(); 37 | // manage.window().maximize(); 38 | manage.window().setSize(new Dimension(1920,1080)); 39 | 
driver.navigate().refresh(); 40 | } 41 | /** 42 | * 滚动窗口。 43 | * @param driver 44 | * @param height 45 | */ 46 | public static void scroll(WebDriver driver,int height){ 47 | ((JavascriptExecutor)driver).executeScript("window.scrollTo(0,"+height+" );"); 48 | } 49 | /** 50 | * 重新调整窗口大小,以适应页面,需要耗费一定时间。建议等待合理的时间。 51 | * @param driver 52 | */ 53 | public static void loadAll(WebDriver driver){ 54 | Dimension od=driver.manage().window().getSize(); 55 | int width=driver.manage().window().getSize().width; 56 | //尝试性解决:https://github.com/ariya/phantomjs/issues/11526问题 57 | driver.manage().timeouts().pageLoadTimeout(60, TimeUnit.SECONDS); 58 | long height=(Long)((JavascriptExecutor)driver).executeScript("return document.body.scrollHeight;"); 59 | driver.manage().window().setSize(new Dimension(width, (int)height)); 60 | driver.navigate().refresh(); 61 | } 62 | public static void refresh(WebDriver driver){ 63 | driver.navigate().refresh(); 64 | } 65 | public static void taskScreenShot(WebDriver driver,File saveFile){ 66 | if(saveFile.exists()){ 67 | saveFile.delete(); 68 | } 69 | byte[] src=((TakesScreenshot)driver).getScreenshotAs(OutputType.BYTES);//.FILE);linux下非root用户,java创建临时文件存在问题 70 | log.info("截图文件字节长度"+src.length); 71 | try { 72 | FileUtils.writeByteArrayToFile(saveFile, src); 73 | } catch (IOException e) { 74 | e.printStackTrace(); 75 | log.error("截图写入失败",e); 76 | } 77 | } 78 | public static void changeWindow(WebDriver driver){ 79 | // 获取当前页面句柄 80 | String handle = driver.getWindowHandle(); 81 | // 获取所有页面的句柄,并循环判断不是当前的句柄,就做选取switchTo() 82 | for (String handles : driver.getWindowHandles()) { 83 | if (handles.equals(handle)) 84 | continue; 85 | driver.switchTo().window(handles); 86 | } 87 | } 88 | public static void changeWindowTo(WebDriver driver,String handle){ 89 | for (String tmp : driver.getWindowHandles()) { 90 | if (tmp.equals(handle)){ 91 | driver.switchTo().window(handle); 92 | break; 93 | } 94 | } 95 | } 96 | 97 | /** 98 | * 打开一个新tab页,返回该tab页的windowhandle 99 | * @param driver 100 | * @param url 101 | * @return 102 | */ 103 | public static String openNewTab(WebDriver driver,String url){ 104 | Set strSet1=driver.getWindowHandles(); 105 | ((JavascriptExecutor)driver).executeScript("window.open('"+url+"','_blank');"); 106 | sleep(1000); 107 | Set strSet2=driver.getWindowHandles(); 108 | for(String tmp:strSet2){ 109 | if(!strSet1.contains(tmp)){ 110 | return tmp; 111 | } 112 | } 113 | return null; 114 | } 115 | public static void sleep(long millis){ 116 | try { 117 | Thread.sleep(millis); 118 | } catch (InterruptedException e) { 119 | e.printStackTrace(); 120 | } 121 | } 122 | /** 123 | * 操作关闭模态窗口 124 | * @param driver 125 | * @param type 如Id,ClassName 126 | * @param sel 选择器 127 | */ 128 | public static void clickModal(WebDriver driver,String type,String sel){ 129 | String js="document.getElementsBy"+type+"('"+sel+"')[0].click();"; 130 | ((JavascriptExecutor)driver).executeScript(js); 131 | } 132 | 133 | /** 134 | * 判断一个元素是否存在 135 | * @param driver 136 | * @param by 137 | * @return 138 | */ 139 | public static boolean checkElementExists(WebDriver driver,By by){ 140 | try{ 141 | driver.findElement(by); 142 | return true; 143 | }catch(NoSuchElementException e){ 144 | return false; 145 | } 146 | } 147 | /** 148 | * 点击一个元素 149 | * @param driver 150 | * @param by 151 | */ 152 | public static void clickElement(WebDriver driver,By by){ 153 | WebElement tmp=driver.findElement(by); 154 | Actions actions=new Actions(driver); 155 | actions.moveToElement(tmp).click().perform(); 156 | } 157 | public 
static void clickElement(WebDriver driver,WebElement tmp){ 158 | Actions actions=new Actions(driver); 159 | actions.moveToElement(tmp).click().perform(); 160 | } 161 | public static Object execJs(WebDriver driver,String js){ 162 | return ((JavascriptExecutor)driver).executeScript(js); 163 | } 164 | public static void clickByJsCssSelector(WebDriver driver,String cssSelector){ 165 | String js="document.querySelector('"+cssSelector+"').click();"; 166 | ((JavascriptExecutor)driver).executeScript(js); 167 | } 168 | 169 | public static Set<Cookie> getCookies(WebDriver driver){ 170 | return driver.manage().getCookies(); 171 | } 172 | public static void setCookies(WebDriver driver,Set<Cookie> cookies){ 173 | if(cookies==null){ 174 | return; 175 | } 176 | // PhantomJS has a bug when setting cookies through the WebDriver API, so they are injected via JavaScript instead. 177 | StringBuilder sb=new StringBuilder(); 178 | for(Cookie cookie:cookies){ 179 | String js="document.cookie=\""+cookie.getName()+"="+cookie.getValue()+";path="+cookie.getPath()+";domain="+cookie.getDomain()+"\";"; 180 | sb.append(js); 181 | } 182 | ((JavascriptExecutor)driver).executeScript(sb.toString()); 183 | } 184 | 185 | public static String getHttpCookieString(Set<Cookie> cookies){ 186 | if(cookies==null){ 187 | return ""; 188 | } 189 | String httpCookie=""; 190 | int index=0; 191 | for(Cookie c:cookies){ 192 | index++; 193 | if(index==cookies.size()){ 194 | httpCookie+=c.getName()+"="+c.getValue(); 195 | }else{ 196 | httpCookie+=c.getName()+"="+c.getValue()+"; "; 197 | } 198 | } 199 | return httpCookie; 200 | } 201 | 202 | /** 203 | * Reads a CSS attribute of an element; the most common use is checking an element's display state. 204 | * @param driver 205 | * @param cssSelector 206 | * @param attr 207 | * @return 208 | */ 209 | public static Object getCssAttr(WebDriver driver,String cssSelector,String attr){ 210 | InputStream ins=WindowUtil.class.getResourceAsStream("getCssAttr.js"); 211 | String externalJS=""; 212 | try { 213 | externalJS = IOUtils.toString(ins,"UTF-8"); 214 | } catch (IOException e) { 215 | e.printStackTrace(); 216 | } 217 | IOUtils.closeQuietly(ins); 218 | Object res = ((JavascriptExecutor) driver).executeScript(externalJS,cssSelector,attr); 219 | return res; 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.github.xbynet 5 | crawler-parent 6 | 0.3.0 7 | pom 8 | crawler-parent 9 | 10 | A simple and flexible web crawler framework for java.
11 | 12 | https://github.com/xbynet/crawler 13 | 14 | UTF-8 15 | UTF-8 16 | 1.6 17 | 18 | 19 | 20 | xbynet 21 | JiaWei Tao 22 | xbynet@outlook.com 23 | 24 | 25 | 26 | scm:git:git@github.com:xbynet/crawler.git 27 | scm:git:git@github.com:xbynet/crawler.git 28 | git@github.com:xbynet/crawler.git 29 | v${project.version} 30 | 31 | 32 | 33 | MIT License 34 | https://mit-license.org/ 35 | 36 | 37 | 38 | crawler-core 39 | crawler-selenium 40 | crawler-server 41 | 42 | 43 | 44 | 45 | aliyun 46 | 47 | true 48 | 49 | Public Repositories 50 | http://maven.aliyun.com/nexus/content/groups/public/ 51 | 52 | 53 | 54 | 55 | aliyun-plugin 56 | Public Repositories 57 | http://maven.aliyun.com/nexus/content/groups/public/ 58 | 59 | 60 | 61 | 62 | 63 | 64 | junit 65 | junit 66 | 4.12 67 | test 68 | 69 | 70 | cglib 71 | cglib 72 | 73 | 3.2.4 74 | 75 | 76 | commons-io 77 | commons-io 78 | 2.5 79 | 80 | 81 | org.apache.commons 82 | commons-lang3 83 | 3.5 84 | 85 | 86 | org.apache.httpcomponents 87 | httpclient 88 | 4.5.3 89 | 90 | 91 | com.alibaba 92 | fastjson 93 | 1.2.28 94 | 95 | 96 | 97 | ch.qos.logback 98 | logback-classic 99 | 1.2.1 100 | 101 | 102 | ch.qos.logback 103 | logback-core 104 | 1.2.1 105 | 106 | 107 | org.slf4j 108 | slf4j-api 109 | 1.7.22 110 | 111 | 112 | org.slf4j 113 | jcl-over-slf4j 114 | 1.7.22 115 | 116 | 117 | org.jsoup 118 | jsoup 119 | 1.10.2 120 | 121 | 122 | com.jayway.jsonpath 123 | json-path 124 | 2.2.0 125 | 126 | 127 | us.codecraft 128 | xsoup 129 | 0.3.1 130 | 131 | 132 | 133 | 134 | 135 | org.springframework.boot 136 | spring-boot-dependencies 137 | 1.5.4.RELEASE 138 | pom 139 | import 140 | 141 | 142 | org.seleniumhq.selenium 143 | selenium-java 144 | 2.53.1 145 | 146 | 147 | htmlunit-driver 148 | org.seleniumhq.selenium 149 | 150 | 151 | 152 | 153 | com.codeborne 154 | phantomjsdriver 155 | 1.3.0 156 | 157 | 158 | redis.clients 159 | jedis 160 | 2.9.0 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | org.apache.maven.plugins 169 | maven-compiler-plugin 170 | 3.1 171 | 172 | 1.6 173 | 1.6 174 | UTF-8 175 | 176 | 177 | 178 | org.apache.maven.plugins 179 | maven-resources-plugin 180 | 2.6 181 | 182 | UTF-8 183 | 184 | 185 | 186 | org.apache.maven.plugins 187 | maven-jar-plugin 188 | 190 | 191 | 192 | 193 | org.apache.maven.plugins 194 | maven-javadoc-plugin 195 | 2.10.4 196 | 197 | UTF-8 198 | crawler-0.3.0 199 | zh_CN 200 | 201 | 202 | 203 | aggregate 204 | 205 | aggregate 206 | 207 | site 208 | 209 | 210 | attach-javadocs 211 | 212 | jar 213 | 214 | 215 | 216 | 217 | 218 | org.apache.maven.plugins 219 | maven-release-plugin 220 | 2.4.1 221 | 222 | v@{project.version} 223 | true 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | release 232 | 233 | 234 | 235 | 236 | org.apache.maven.plugins 237 | maven-source-plugin 238 | 2.2.1 239 | 240 | 241 | package 242 | 243 | jar-no-fork 244 | 245 | 246 | 247 | 248 | 249 | 250 | org.apache.maven.plugins 251 | maven-javadoc-plugin 252 | 2.9.1 253 | 254 | 255 | package 256 | 257 | jar 258 | 259 | 260 | 261 | 262 | 263 | 264 | org.apache.maven.plugins 265 | maven-gpg-plugin 266 | 1.6 267 | 268 | 269 | verify 270 | 271 | sign 272 | 273 | 274 | 275 | 276 | 277 | org.sonatype.plugins 278 | nexus-staging-maven-plugin 279 | 1.6 280 | true 281 | 282 | osscenter 283 | https://oss.sonatype.org/ 284 | true 285 | 286 | 287 | 288 | 289 | 290 | 291 | osscenter 292 | https://oss.sonatype.org/content/repositories/snapshots/ 293 | 294 | 295 | osscenter 296 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 297 | 298 | 299 | 300 | 301 | 
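<!--
  Usage sketch: assuming the artifactId matches the module directory name crawler-core, a consumer
  project would depend on the core module under the groupId and version declared above roughly like this:
  <dependency>
    <groupId>com.github.xbynet</groupId>
    <artifactId>crawler-core</artifactId>
    <version>0.3.0</version>
  </dependency>
-->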
-------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/http/AbsDownloader.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler.http; 2 | 3 | import java.io.IOException; 4 | import java.io.UnsupportedEncodingException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.http.HttpHost; 10 | import org.apache.http.NameValuePair; 11 | import org.apache.http.client.ClientProtocolException; 12 | import org.apache.http.client.config.CookieSpecs; 13 | import org.apache.http.client.config.RequestConfig; 14 | import org.apache.http.client.entity.UrlEncodedFormEntity; 15 | import org.apache.http.client.methods.CloseableHttpResponse; 16 | import org.apache.http.client.methods.HttpUriRequest; 17 | import org.apache.http.client.methods.RequestBuilder; 18 | import org.apache.http.client.protocol.HttpClientContext; 19 | import org.apache.http.impl.client.CloseableHttpClient; 20 | import org.apache.http.message.BasicNameValuePair; 21 | import org.apache.http.util.EntityUtils; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | import com.github.xbynet.crawler.IpProxyProvider; 26 | import com.github.xbynet.crawler.Request; 27 | import com.github.xbynet.crawler.RequestAction; 28 | import com.github.xbynet.crawler.Response; 29 | import com.github.xbynet.crawler.Site; 30 | import com.github.xbynet.crawler.Spider; 31 | import com.github.xbynet.crawler.SpiderListener; 32 | import com.github.xbynet.crawler.Const.HttpMethod; 33 | import com.github.xbynet.crawler.utils.CrawlerUtils; 34 | 35 | public abstract class AbsDownloader implements Downloader{ 36 | private Logger log=LoggerFactory.getLogger(AbsDownloader.class); 37 | 38 | private CloseableHttpClient client; 39 | private Spider spider; 40 | 41 | public AbsDownloader(){ 42 | 43 | } 44 | public void init(){ 45 | HttpClientFactory clientFactory=spider.getHttpClientFactory(); 46 | if(clientFactory==null){ 47 | clientFactory=new HttpClientFactory(); 48 | } 49 | this.client=clientFactory.getClient(); 50 | } 51 | protected void doDownload(Request request,Object... 
extras){ 52 | String url=request.getUrl(); 53 | Site site=getSpider().getSite(); 54 | IpProxyProvider ipProxyProvider=getSpider().getIpProvider(); 55 | HttpHost proxy=null; 56 | if(ipProxyProvider!=null){ 57 | proxy=ipProxyProvider.getIp(); 58 | } 59 | 60 | log.debug(getSpider().getName()+",开始请求"+url); 61 | HttpUriRequest httpUriRequest=generateHttpRequest(site, request, proxy); 62 | 63 | Response response=new Response(); 64 | boolean state=cycleRequest(httpUriRequest,request,site,response,extras); 65 | 66 | if(!state){ 67 | log.error("no content crawled for "+request.getUrl()); 68 | notifyListener(false,request,null); 69 | return; 70 | } 71 | addContinueRequest(response); 72 | notifyListener(true,request,null); 73 | //循环遍历所有分块请求 74 | List orderReqList=request.getPartRequest(); 75 | while(orderReqList!=null && orderReqList.size()>0){ 76 | Request req=orderReqList.remove(0); 77 | spider.getScheduler().getDuplicateRemover().isDuplicate(req, spider); 78 | Response resp=new Response(response); 79 | state=cycleRequest(generateHttpRequest(site, req, proxy), req, site, resp, extras); 80 | if(!state){ 81 | log.error("no content crawled for "+req.getUrl()); 82 | notifyListener(false, req, null); 83 | }else{ 84 | notifyListener(true,req,null); 85 | } 86 | addContinueRequest(resp); 87 | } 88 | } 89 | protected void addContinueRequest(Response response){ 90 | List reqlist=response.getContinueReqeusts(); 91 | if(reqlist!=null){ 92 | for(Request req:reqlist){ 93 | spider.getScheduler().push(req, spider); 94 | } 95 | } 96 | } 97 | protected boolean cycleRequest(HttpUriRequest httpUriRequest,Request request,Site site,Response response,Object... extras){ 98 | boolean state=false; 99 | try { 100 | state = doRequest(httpUriRequest, request,site,response,extras); 101 | } catch (Exception e) { 102 | log.error("",e); 103 | } 104 | int retryCount=request.getRetryCount()>=0?request.getRetryCount():site.getRetry(); 105 | int retrySleepTimes=request.getRetrySleepTime()>=0?request.getRetrySleepTime():site.getRetrySleep(); 106 | int retryIndex=1; 107 | while(!state && retryIndex headerEntry : site.getHeaders().entrySet()) { 179 | requestBuilder.setHeader(headerEntry.getKey(), headerEntry.getValue()); 180 | } 181 | } 182 | RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); 183 | if (site != null) { 184 | requestConfigBuilder.setConnectionRequestTimeout(site.getTimeout()) 185 | .setSocketTimeout(site.getTimeout()) 186 | .setConnectTimeout(site.getTimeout()) 187 | .setCookieSpec(CookieSpecs.STANDARD); 188 | } 189 | 190 | if (proxy != null) { 191 | requestConfigBuilder.setProxy(proxy); 192 | } 193 | requestBuilder.setConfig(requestConfigBuilder.build()); 194 | HttpUriRequest httpUriRequest = requestBuilder.build(); 195 | if (request.getHeaders() != null && !request.getHeaders().isEmpty()) { 196 | for (Map.Entry header : request.getHeaders().entrySet()) { 197 | httpUriRequest.setHeader(header.getKey(), header.getValue()); 198 | } 199 | } 200 | return httpUriRequest; 201 | } 202 | private RequestBuilder selectRequestMethod(Request request) { 203 | HttpMethod method = request.getMethod(); 204 | if (method == null || method==HttpMethod.GET) { 205 | return addFormParams(RequestBuilder.get(),request); 206 | } else if (method==HttpMethod.POST) { 207 | return addFormParams(RequestBuilder.post(),request); 208 | } else if (method==HttpMethod.HEAD) { 209 | return addFormParams(RequestBuilder.head(),request); 210 | } 211 | throw new IllegalArgumentException("Illegal HTTP Method " + method); 212 | } 213 | 214 | 
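/** Attaches the request body: an explicit HttpEntity is used as-is for POST requests; otherwise the params map, if any, is URL-encoded into a UTF-8 form entity. */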
private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) { 215 | if (request.getEntity() != null && "POST".equalsIgnoreCase(requestBuilder.getMethod())) { 216 | requestBuilder.setEntity(request.getEntity()); 217 | }else if(request.getParams()!=null){ 218 | List<NameValuePair> nameValuePairs=new ArrayList<NameValuePair>(); 219 | for(String key:request.getParams().keySet()){ 220 | BasicNameValuePair pair=new BasicNameValuePair(key, request.getParams().get(key)); 221 | nameValuePairs.add(pair); 222 | } 223 | try { 224 | requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairs, "UTF-8")); 225 | } catch (UnsupportedEncodingException e) { 226 | log.error("",e); 227 | } 228 | } 229 | return requestBuilder; 230 | } 231 | 232 | public Spider getSpider() { 233 | return spider; 234 | } 235 | 236 | public void setSpider(Spider spider) { 237 | this.spider = spider; 238 | } 239 | 240 | @Override 241 | public void close() throws IOException { 242 | spider=null; 243 | client.close(); 244 | client=null; 245 | } 246 | 247 | @Override 248 | public void download(Request request) { 249 | throw new RuntimeException("not supported!"); 250 | } 251 | 252 | } 253 | -------------------------------------------------------------------------------- /crawler-core/src/main/java/com/github/xbynet/crawler/Spider.java: -------------------------------------------------------------------------------- 1 | package com.github.xbynet.crawler; 2 | 3 | import java.io.Closeable; 4 | import java.io.IOException; 5 | import java.util.Date; 6 | import java.util.UUID; 7 | import java.util.concurrent.TimeUnit; 8 | import java.util.concurrent.atomic.AtomicLong; 9 | import java.util.concurrent.locks.Condition; 10 | import java.util.concurrent.locks.ReentrantLock; 11 | 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import com.github.xbynet.crawler.http.DefaultDownloader; 16 | import com.github.xbynet.crawler.http.Downloader; 17 | import com.github.xbynet.crawler.http.FileDownloader; 18 | import com.github.xbynet.crawler.http.HttpClientFactory; 19 | import com.github.xbynet.crawler.scheduler.DefaultScheduler; 20 | import com.github.xbynet.crawler.scheduler.Scheduler; 21 | import com.github.xbynet.crawler.utils.CountableThreadPool; 22 | import com.github.xbynet.crawler.utils.CrawlerUtils; 23 | 24 | public class Spider implements ISpider, Runnable { 25 | private static final Logger log=LoggerFactory.getLogger(Spider.class); 26 | 27 | private String name; 28 | private Site site; 29 | private Scheduler scheduler = new DefaultScheduler(); 30 | private IpProxyProvider ipProvider; 31 | private HttpClientFactory httpClientFactory = new HttpClientFactory(); 32 | private FileDownloader fileDownloader = null; 33 | private Downloader defaultDownloader=null; 34 | private Processor processor; 35 | private SpiderListener spiderListener; 36 | /** Whether to release all resources and terminate once the crawl is complete. */ 37 | private boolean shutdownOnComplete = true; 38 | /** Idle wait time in milliseconds; once exceeded, the spider shuts down automatically. */ 39 | private int idleWaitTime=1*60*1000; 40 | private Date startTime; 41 | private Date endTime; 42 | private AtomicLong processUrlCount=new AtomicLong(0L); 43 | 44 | private ReentrantLock newUrlLock = new ReentrantLock(); 45 | 46 | private Condition newUrlCondition = newUrlLock.newCondition(); 47 | 48 | public enum Status { 49 | NotRun, Running, Stopped, Destroyed 50 | } 51 | 52 | private Status state = Status.NotRun; 53 | private int threadNum = 1; 54 | 55 | private CountableThreadPool pool; 56 | 57 | private Spider() { 58 | this.name = "Spider-" + UUID.randomUUID().toString(); 59
| this.fileDownloader = new FileDownloader(); 60 | this.fileDownloader.setSpider(this); 61 | this.fileDownloader.init(); 62 | this.defaultDownloader=new DefaultDownloader(); 63 | this.defaultDownloader.setSpider(this); 64 | this.defaultDownloader.init(); 65 | } 66 | 67 | 68 | public static class Builder{ 69 | private Spider spider; 70 | private Builder(Spider spider1,Processor p){ 71 | this.spider=spider1; 72 | p.setSpider(spider); 73 | p.setFileDownloader(spider.fileDownloader); 74 | this.spider.processor=p; 75 | } 76 | 77 | public Spider build(){ 78 | return spider; 79 | } 80 | 81 | public Builder urls(String... urls){ 82 | for(String url:urls){ 83 | Request req=new Request(url); 84 | spider.scheduler.push(req, spider); 85 | } 86 | return this; 87 | } 88 | public Builder requests(Request... requestlist){ 89 | for(Request req:requestlist){ 90 | spider.scheduler.push(req, spider); 91 | } 92 | return this; 93 | } 94 | public Builder site(Site site) { 95 | spider.site = site; 96 | return this; 97 | } 98 | public Builder scheduler(Scheduler scheduler) { 99 | Scheduler old=spider.scheduler; 100 | spider.scheduler = scheduler; 101 | Request req=null; 102 | while((req=old.poll(spider))!=null){ 103 | spider.scheduler.push(req, spider); 104 | } 105 | return this; 106 | } 107 | public Builder name(String name) { 108 | spider.name = name; 109 | return this; 110 | } 111 | public Builder ipProvider(IpProxyProvider ipProvider) { 112 | spider.ipProvider = ipProvider; 113 | return this; 114 | } 115 | public Builder httpClientFactory(HttpClientFactory httpClientFactory) { 116 | spider.httpClientFactory = httpClientFactory; 117 | return this; 118 | } 119 | public Builder fileDownloader(FileDownloader fileDownloader1) { 120 | fileDownloader1.setSpider(spider); 121 | fileDownloader1.init(); 122 | spider.fileDownloader=fileDownloader1; 123 | return this; 124 | } 125 | public Builder listener(SpiderListener spiderListener) { 126 | spider.spiderListener = spiderListener; 127 | return this; 128 | } 129 | public Builder threadNum(int threadNum) { 130 | spider.threadNum = threadNum; 131 | return this; 132 | } 133 | public Builder pool(CountableThreadPool pool) { 134 | spider.pool = pool; 135 | return this; 136 | } 137 | public Builder shutdownOnComplete(boolean shutdownOnComplete) { 138 | spider.shutdownOnComplete = shutdownOnComplete; 139 | return this; 140 | } 141 | 142 | public Builder defaultDownloader(Downloader downloader) { 143 | downloader.setSpider(spider); 144 | downloader.init(); 145 | spider.defaultDownloader=downloader; 146 | return this; 147 | } 148 | 149 | } 150 | public static Builder builder(Processor p) { 151 | return new Builder(new Spider(),p); 152 | } 153 | 154 | public String getName() { 155 | return this.name; 156 | } 157 | 158 | 159 | public Site getSite() { 160 | return site; 161 | } 162 | 163 | 164 | public Scheduler getScheduler() { 165 | return scheduler; 166 | } 167 | 168 | 169 | public IpProxyProvider getIpProvider() { 170 | return ipProvider; 171 | } 172 | 173 | public HttpClientFactory getHttpClientFactory() { 174 | return httpClientFactory; 175 | } 176 | 177 | 178 | public FileDownloader getFileDownloader() { 179 | return fileDownloader; 180 | } 181 | 182 | 183 | public Processor getProcessor() { 184 | return processor; 185 | } 186 | 187 | public SpiderListener getSpiderListener() { 188 | return spiderListener; 189 | } 190 | 191 | public int getThreadNum() { 192 | return threadNum; 193 | } 194 | 195 | public void run() { 196 | setStatus(Status.Running); 197 | init(); 198 | 
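// Main crawl loop: poll the scheduler for the next Request; when it runs dry and no worker threads are alive, wait up to idleWaitTime, poll once more and exit if shutdownOnComplete is set; otherwise submit the Request to the thread pool and signal waiting threads after it has been processed.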
log.debug("Spider "+getName()+" start!"); 199 | System.out.println("--------------------------------------------------------------"); 200 | System.out.println("### 不要问我为什么,你要记住,在你最落寞的时候,有个人对你说过,你可以的!###"); 201 | System.out.println("### 为什么要写爬虫呢?因为我们爬的是寂寞;因为泡妹子需要笑话;因为找工作需要筛选职位;因为老板要求;也许因为要装x才是正解 ###"); 202 | System.out.println("--------------------------------------------------------------"); 203 | while (!Thread.currentThread().isInterrupted() && state==Status.Running) { 204 | Request request = scheduler.poll(this); 205 | if (request == null) { 206 | if (pool.getThreadAlive() == 0) { 207 | CrawlerUtils.sleep(idleWaitTime); 208 | request = scheduler.poll(this); 209 | if(request==null && shutdownOnComplete){ 210 | break; 211 | } 212 | } 213 | // wait until new url added 214 | waitNewUrl(); 215 | } else { 216 | final Request tmpReq=request; 217 | pool.execute(new Runnable() { 218 | @Override 219 | public void run() { 220 | try { 221 | defaultDownloader.download(tmpReq); 222 | } catch (Exception e) { 223 | log.error("process request " + tmpReq + " error", e); 224 | } finally { 225 | processUrlCount.incrementAndGet(); 226 | signalNewUrl(); 227 | } 228 | } 229 | }); 230 | } 231 | } 232 | setStatus(Status.Stopped); 233 | if(shutdownOnComplete){ 234 | shutdown(); 235 | } 236 | 237 | } 238 | private void waitNewUrl() { 239 | newUrlLock.lock(); 240 | try { 241 | //double check 242 | if (pool.getThreadAlive() == 0 && shutdownOnComplete) { 243 | return; 244 | } 245 | newUrlCondition.await(idleWaitTime, TimeUnit.MILLISECONDS); 246 | } catch (InterruptedException e) { 247 | log.warn("waitNewUrl - interrupted, error {}", e); 248 | } finally { 249 | newUrlLock.unlock(); 250 | } 251 | } 252 | 253 | private void signalNewUrl() { 254 | try { 255 | newUrlLock.lock(); 256 | newUrlCondition.signalAll(); 257 | } finally { 258 | newUrlLock.unlock(); 259 | } 260 | } 261 | public void runAsync() { 262 | Thread thread = new Thread(this); 263 | thread.setDaemon(false); 264 | thread.start(); 265 | } 266 | 267 | public void stop() { 268 | setStatus(Status.Stopped); 269 | } 270 | 271 | public synchronized void shutdown() { 272 | if(state==Status.Destroyed || state==Status.NotRun){ 273 | throw new IllegalStateException("Spider has never start or already destroyed"); 274 | } 275 | setStatus(Status.Destroyed); 276 | endTime=new Date(); 277 | if(pool!=null){ 278 | pool.shutdown(); 279 | try { 280 | pool.awaitTermination(idleWaitTime<60000?60000:idleWaitTime, TimeUnit.MILLISECONDS); 281 | } catch (InterruptedException e) { 282 | log.warn("thread pool termination interrupted",e); 283 | } 284 | } 285 | closeQuietly(defaultDownloader); 286 | closeQuietly(fileDownloader); 287 | closeQuietly(ipProvider); 288 | closeQuietly(ipProvider); 289 | 290 | } 291 | private void closeQuietly(Closeable clo){ 292 | if(clo!=null){ 293 | try { 294 | clo.close(); 295 | } catch (IOException e) { 296 | log.error("", e); 297 | } 298 | } 299 | } 300 | 301 | protected synchronized void init() { 302 | if (pool == null) { 303 | if (state != Status.Destroyed) { 304 | pool = new CountableThreadPool(threadNum); 305 | } else { 306 | throw new IllegalStateException("current spider is destroyed!"); 307 | } 308 | } 309 | startTime=new Date(); 310 | } 311 | 312 | public CountableThreadPool getPool() { 313 | return pool; 314 | } 315 | 316 | 317 | 318 | public boolean isShutdownOnComplete() { 319 | return shutdownOnComplete; 320 | } 321 | 322 | public Status getState() { 323 | return state; 324 | } 325 | 326 | private synchronized void setStatus(Status s) { 327 
| state = s; 328 | } 329 | 330 | public boolean isRunning() { 331 | return state == Status.Running; 332 | } 333 | 334 | public boolean isStopped() { 335 | return state == Status.Stopped; 336 | } 337 | 338 | public boolean isDestroyed() { 339 | return state == Status.Destroyed; 340 | } 341 | 342 | public Date getStartTime() { 343 | return startTime; 344 | } 345 | 346 | private void setStartTime(Date startTime) { 347 | this.startTime = startTime; 348 | } 349 | 350 | public Date getEndTime() { 351 | return endTime; 352 | } 353 | 354 | private void setEndTime(Date endTime) { 355 | this.endTime = endTime; 356 | } 357 | 358 | public Downloader getDefaultDownloader() { 359 | return defaultDownloader; 360 | } 361 | 362 | public AtomicLong getProcessUrlCount() { 363 | return processUrlCount; 364 | } 365 | /** 366 | * 是否处于空闲状态 367 | */ 368 | public boolean isIdle(){ 369 | return pool.getThreadAlive() == 0; 370 | } 371 | } 372 | -------------------------------------------------------------------------------- /crawler-server/src/main/webapp/js/bootstrap.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap v3.3.4 (http://getbootstrap.com) 3 | * Copyright 2011-2015 Twitter, Inc. 4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | if("undefined"==typeof jQuery)throw new Error("Bootstrap's JavaScript requires jQuery");+function(a){"use strict";var b=a.fn.jquery.split(" ")[0].split(".");if(b[0]<2&&b[1]<9||1==b[0]&&9==b[1]&&b[2]<1)throw new Error("Bootstrap's JavaScript requires jQuery version 1.9.1 or higher")}(jQuery),+function(a){"use strict";function b(){var a=document.createElement("bootstrap"),b={WebkitTransition:"webkitTransitionEnd",MozTransition:"transitionend",OTransition:"oTransitionEnd otransitionend",transition:"transitionend"};for(var c in b)if(void 0!==a.style[c])return{end:b[c]};return!1}a.fn.emulateTransitionEnd=function(b){var c=!1,d=this;a(this).one("bsTransitionEnd",function(){c=!0});var e=function(){c||a(d).trigger(a.support.transition.end)};return setTimeout(e,b),this},a(function(){a.support.transition=b(),a.support.transition&&(a.event.special.bsTransitionEnd={bindType:a.support.transition.end,delegateType:a.support.transition.end,handle:function(b){return a(b.target).is(this)?b.handleObj.handler.apply(this,arguments):void 0}})})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var c=a(this),e=c.data("bs.alert");e||c.data("bs.alert",e=new d(this)),"string"==typeof b&&e[b].call(c)})}var c='[data-dismiss="alert"]',d=function(b){a(b).on("click",c,this.close)};d.VERSION="3.3.4",d.TRANSITION_DURATION=150,d.prototype.close=function(b){function c(){g.detach().trigger("closed.bs.alert").remove()}var e=a(this),f=e.attr("data-target");f||(f=e.attr("href"),f=f&&f.replace(/.*(?=#[^\s]*$)/,""));var g=a(f);b&&b.preventDefault(),g.length||(g=e.closest(".alert")),g.trigger(b=a.Event("close.bs.alert")),b.isDefaultPrevented()||(g.removeClass("in"),a.support.transition&&g.hasClass("fade")?g.one("bsTransitionEnd",c).emulateTransitionEnd(d.TRANSITION_DURATION):c())};var e=a.fn.alert;a.fn.alert=b,a.fn.alert.Constructor=d,a.fn.alert.noConflict=function(){return a.fn.alert=e,this},a(document).on("click.bs.alert.data-api",c,d.prototype.close)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.button"),f="object"==typeof b&&b;e||d.data("bs.button",e=new c(this,f)),"toggle"==b?e.toggle():b&&e.setState(b)})}var 
c=function(b,d){this.$element=a(b),this.options=a.extend({},c.DEFAULTS,d),this.isLoading=!1};c.VERSION="3.3.4",c.DEFAULTS={loadingText:"loading..."},c.prototype.setState=function(b){var c="disabled",d=this.$element,e=d.is("input")?"val":"html",f=d.data();b+="Text",null==f.resetText&&d.data("resetText",d[e]()),setTimeout(a.proxy(function(){d[e](null==f[b]?this.options[b]:f[b]),"loadingText"==b?(this.isLoading=!0,d.addClass(c).attr(c,c)):this.isLoading&&(this.isLoading=!1,d.removeClass(c).removeAttr(c))},this),0)},c.prototype.toggle=function(){var a=!0,b=this.$element.closest('[data-toggle="buttons"]');if(b.length){var c=this.$element.find("input");"radio"==c.prop("type")&&(c.prop("checked")&&this.$element.hasClass("active")?a=!1:b.find(".active").removeClass("active")),a&&c.prop("checked",!this.$element.hasClass("active")).trigger("change")}else this.$element.attr("aria-pressed",!this.$element.hasClass("active"));a&&this.$element.toggleClass("active")};var d=a.fn.button;a.fn.button=b,a.fn.button.Constructor=c,a.fn.button.noConflict=function(){return a.fn.button=d,this},a(document).on("click.bs.button.data-api",'[data-toggle^="button"]',function(c){var d=a(c.target);d.hasClass("btn")||(d=d.closest(".btn")),b.call(d,"toggle"),c.preventDefault()}).on("focus.bs.button.data-api blur.bs.button.data-api",'[data-toggle^="button"]',function(b){a(b.target).closest(".btn").toggleClass("focus",/^focus(in)?$/.test(b.type))})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.carousel"),f=a.extend({},c.DEFAULTS,d.data(),"object"==typeof b&&b),g="string"==typeof b?b:f.slide;e||d.data("bs.carousel",e=new c(this,f)),"number"==typeof b?e.to(b):g?e[g]():f.interval&&e.pause().cycle()})}var c=function(b,c){this.$element=a(b),this.$indicators=this.$element.find(".carousel-indicators"),this.options=c,this.paused=null,this.sliding=null,this.interval=null,this.$active=null,this.$items=null,this.options.keyboard&&this.$element.on("keydown.bs.carousel",a.proxy(this.keydown,this)),"hover"==this.options.pause&&!("ontouchstart"in document.documentElement)&&this.$element.on("mouseenter.bs.carousel",a.proxy(this.pause,this)).on("mouseleave.bs.carousel",a.proxy(this.cycle,this))};c.VERSION="3.3.4",c.TRANSITION_DURATION=600,c.DEFAULTS={interval:5e3,pause:"hover",wrap:!0,keyboard:!0},c.prototype.keydown=function(a){if(!/input|textarea/i.test(a.target.tagName)){switch(a.which){case 37:this.prev();break;case 39:this.next();break;default:return}a.preventDefault()}},c.prototype.cycle=function(b){return b||(this.paused=!1),this.interval&&clearInterval(this.interval),this.options.interval&&!this.paused&&(this.interval=setInterval(a.proxy(this.next,this),this.options.interval)),this},c.prototype.getItemIndex=function(a){return this.$items=a.parent().children(".item"),this.$items.index(a||this.$active)},c.prototype.getItemForDirection=function(a,b){var c=this.getItemIndex(b),d="prev"==a&&0===c||"next"==a&&c==this.$items.length-1;if(d&&!this.options.wrap)return b;var e="prev"==a?-1:1,f=(c+e)%this.$items.length;return this.$items.eq(f)},c.prototype.to=function(a){var b=this,c=this.getItemIndex(this.$active=this.$element.find(".item.active"));return a>this.$items.length-1||0>a?void 0:this.sliding?this.$element.one("slid.bs.carousel",function(){b.to(a)}):c==a?this.pause().cycle():this.slide(a>c?"next":"prev",this.$items.eq(a))},c.prototype.pause=function(b){return b||(this.paused=!0),this.$element.find(".next, 
.prev").length&&a.support.transition&&(this.$element.trigger(a.support.transition.end),this.cycle(!0)),this.interval=clearInterval(this.interval),this},c.prototype.next=function(){return this.sliding?void 0:this.slide("next")},c.prototype.prev=function(){return this.sliding?void 0:this.slide("prev")},c.prototype.slide=function(b,d){var e=this.$element.find(".item.active"),f=d||this.getItemForDirection(b,e),g=this.interval,h="next"==b?"left":"right",i=this;if(f.hasClass("active"))return this.sliding=!1;var j=f[0],k=a.Event("slide.bs.carousel",{relatedTarget:j,direction:h});if(this.$element.trigger(k),!k.isDefaultPrevented()){if(this.sliding=!0,g&&this.pause(),this.$indicators.length){this.$indicators.find(".active").removeClass("active");var l=a(this.$indicators.children()[this.getItemIndex(f)]);l&&l.addClass("active")}var m=a.Event("slid.bs.carousel",{relatedTarget:j,direction:h});return a.support.transition&&this.$element.hasClass("slide")?(f.addClass(b),f[0].offsetWidth,e.addClass(h),f.addClass(h),e.one("bsTransitionEnd",function(){f.removeClass([b,h].join(" ")).addClass("active"),e.removeClass(["active",h].join(" ")),i.sliding=!1,setTimeout(function(){i.$element.trigger(m)},0)}).emulateTransitionEnd(c.TRANSITION_DURATION)):(e.removeClass("active"),f.addClass("active"),this.sliding=!1,this.$element.trigger(m)),g&&this.cycle(),this}};var d=a.fn.carousel;a.fn.carousel=b,a.fn.carousel.Constructor=c,a.fn.carousel.noConflict=function(){return a.fn.carousel=d,this};var e=function(c){var d,e=a(this),f=a(e.attr("data-target")||(d=e.attr("href"))&&d.replace(/.*(?=#[^\s]+$)/,""));if(f.hasClass("carousel")){var g=a.extend({},f.data(),e.data()),h=e.attr("data-slide-to");h&&(g.interval=!1),b.call(f,g),h&&f.data("bs.carousel").to(h),c.preventDefault()}};a(document).on("click.bs.carousel.data-api","[data-slide]",e).on("click.bs.carousel.data-api","[data-slide-to]",e),a(window).on("load",function(){a('[data-ride="carousel"]').each(function(){var c=a(this);b.call(c,c.data())})})}(jQuery),+function(a){"use strict";function b(b){var c,d=b.attr("data-target")||(c=b.attr("href"))&&c.replace(/.*(?=#[^\s]+$)/,"");return a(d)}function c(b){return this.each(function(){var c=a(this),e=c.data("bs.collapse"),f=a.extend({},d.DEFAULTS,c.data(),"object"==typeof b&&b);!e&&f.toggle&&/show|hide/.test(b)&&(f.toggle=!1),e||c.data("bs.collapse",e=new d(this,f)),"string"==typeof b&&e[b]()})}var d=function(b,c){this.$element=a(b),this.options=a.extend({},d.DEFAULTS,c),this.$trigger=a('[data-toggle="collapse"][href="#'+b.id+'"],[data-toggle="collapse"][data-target="#'+b.id+'"]'),this.transitioning=null,this.options.parent?this.$parent=this.getParent():this.addAriaAndCollapsedClass(this.$element,this.$trigger),this.options.toggle&&this.toggle()};d.VERSION="3.3.4",d.TRANSITION_DURATION=350,d.DEFAULTS={toggle:!0},d.prototype.dimension=function(){var a=this.$element.hasClass("width");return a?"width":"height"},d.prototype.show=function(){if(!this.transitioning&&!this.$element.hasClass("in")){var b,e=this.$parent&&this.$parent.children(".panel").children(".in, .collapsing");if(!(e&&e.length&&(b=e.data("bs.collapse"),b&&b.transitioning))){var f=a.Event("show.bs.collapse");if(this.$element.trigger(f),!f.isDefaultPrevented()){e&&e.length&&(c.call(e,"hide"),b||e.data("bs.collapse",null));var g=this.dimension();this.$element.removeClass("collapse").addClass("collapsing")[g](0).attr("aria-expanded",!0),this.$trigger.removeClass("collapsed").attr("aria-expanded",!0),this.transitioning=1;var 
h=function(){this.$element.removeClass("collapsing").addClass("collapse in")[g](""),this.transitioning=0,this.$element.trigger("shown.bs.collapse")};if(!a.support.transition)return h.call(this);var i=a.camelCase(["scroll",g].join("-"));this.$element.one("bsTransitionEnd",a.proxy(h,this)).emulateTransitionEnd(d.TRANSITION_DURATION)[g](this.$element[0][i])}}}},d.prototype.hide=function(){if(!this.transitioning&&this.$element.hasClass("in")){var b=a.Event("hide.bs.collapse");if(this.$element.trigger(b),!b.isDefaultPrevented()){var c=this.dimension();this.$element[c](this.$element[c]())[0].offsetHeight,this.$element.addClass("collapsing").removeClass("collapse in").attr("aria-expanded",!1),this.$trigger.addClass("collapsed").attr("aria-expanded",!1),this.transitioning=1;var e=function(){this.transitioning=0,this.$element.removeClass("collapsing").addClass("collapse").trigger("hidden.bs.collapse")};return a.support.transition?void this.$element[c](0).one("bsTransitionEnd",a.proxy(e,this)).emulateTransitionEnd(d.TRANSITION_DURATION):e.call(this)}}},d.prototype.toggle=function(){this[this.$element.hasClass("in")?"hide":"show"]()},d.prototype.getParent=function(){return a(this.options.parent).find('[data-toggle="collapse"][data-parent="'+this.options.parent+'"]').each(a.proxy(function(c,d){var e=a(d);this.addAriaAndCollapsedClass(b(e),e)},this)).end()},d.prototype.addAriaAndCollapsedClass=function(a,b){var c=a.hasClass("in");a.attr("aria-expanded",c),b.toggleClass("collapsed",!c).attr("aria-expanded",c)};var e=a.fn.collapse;a.fn.collapse=c,a.fn.collapse.Constructor=d,a.fn.collapse.noConflict=function(){return a.fn.collapse=e,this},a(document).on("click.bs.collapse.data-api",'[data-toggle="collapse"]',function(d){var e=a(this);e.attr("data-target")||d.preventDefault();var f=b(e),g=f.data("bs.collapse"),h=g?"toggle":e.data();c.call(f,h)})}(jQuery),+function(a){"use strict";function b(b){b&&3===b.which||(a(e).remove(),a(f).each(function(){var d=a(this),e=c(d),f={relatedTarget:this};e.hasClass("open")&&(e.trigger(b=a.Event("hide.bs.dropdown",f)),b.isDefaultPrevented()||(d.attr("aria-expanded","false"),e.removeClass("open").trigger("hidden.bs.dropdown",f)))}))}function c(b){var c=b.attr("data-target");c||(c=b.attr("href"),c=c&&/#[A-Za-z]/.test(c)&&c.replace(/.*(?=#[^\s]*$)/,""));var d=c&&a(c);return d&&d.length?d:b.parent()}function d(b){return this.each(function(){var c=a(this),d=c.data("bs.dropdown");d||c.data("bs.dropdown",d=new g(this)),"string"==typeof b&&d[b].call(c)})}var e=".dropdown-backdrop",f='[data-toggle="dropdown"]',g=function(b){a(b).on("click.bs.dropdown",this.toggle)};g.VERSION="3.3.4",g.prototype.toggle=function(d){var e=a(this);if(!e.is(".disabled, :disabled")){var f=c(e),g=f.hasClass("open");if(b(),!g){"ontouchstart"in document.documentElement&&!f.closest(".navbar-nav").length&&a('',trigger:"hover focus",title:"",delay:0,html:!1,container:!1,viewport:{selector:"body",padding:0}},c.prototype.init=function(b,c,d){if(this.enabled=!0,this.type=b,this.$element=a(c),this.options=this.getOptions(d),this.$viewport=this.options.viewport&&a(this.options.viewport.selector||this.options.viewport),this.$element[0]instanceof document.constructor&&!this.options.selector)throw new Error("`selector` option must be specified when initializing "+this.type+" on the window.document object!");for(var e=this.options.trigger.split(" "),f=e.length;f--;){var g=e[f];if("click"==g)this.$element.on("click."+this.type,this.options.selector,a.proxy(this.toggle,this));else if("manual"!=g){var 
h="hover"==g?"mouseenter":"focusin",i="hover"==g?"mouseleave":"focusout";this.$element.on(h+"."+this.type,this.options.selector,a.proxy(this.enter,this)),this.$element.on(i+"."+this.type,this.options.selector,a.proxy(this.leave,this))}}this.options.selector?this._options=a.extend({},this.options,{trigger:"manual",selector:""}):this.fixTitle()},c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.getOptions=function(b){return b=a.extend({},this.getDefaults(),this.$element.data(),b),b.delay&&"number"==typeof b.delay&&(b.delay={show:b.delay,hide:b.delay}),b},c.prototype.getDelegateOptions=function(){var b={},c=this.getDefaults();return this._options&&a.each(this._options,function(a,d){c[a]!=d&&(b[a]=d)}),b},c.prototype.enter=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);return c&&c.$tip&&c.$tip.is(":visible")?void(c.hoverState="in"):(c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),clearTimeout(c.timeout),c.hoverState="in",c.options.delay&&c.options.delay.show?void(c.timeout=setTimeout(function(){"in"==c.hoverState&&c.show()},c.options.delay.show)):c.show())},c.prototype.leave=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);return c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),clearTimeout(c.timeout),c.hoverState="out",c.options.delay&&c.options.delay.hide?void(c.timeout=setTimeout(function(){"out"==c.hoverState&&c.hide()},c.options.delay.hide)):c.hide()},c.prototype.show=function(){var b=a.Event("show.bs."+this.type);if(this.hasContent()&&this.enabled){this.$element.trigger(b);var d=a.contains(this.$element[0].ownerDocument.documentElement,this.$element[0]);if(b.isDefaultPrevented()||!d)return;var e=this,f=this.tip(),g=this.getUID(this.type);this.setContent(),f.attr("id",g),this.$element.attr("aria-describedby",g),this.options.animation&&f.addClass("fade");var h="function"==typeof this.options.placement?this.options.placement.call(this,f[0],this.$element[0]):this.options.placement,i=/\s?auto?\s?/i,j=i.test(h);j&&(h=h.replace(i,"")||"top"),f.detach().css({top:0,left:0,display:"block"}).addClass(h).data("bs."+this.type,this),this.options.container?f.appendTo(this.options.container):f.insertAfter(this.$element);var k=this.getPosition(),l=f[0].offsetWidth,m=f[0].offsetHeight;if(j){var n=h,o=this.options.container?a(this.options.container):this.$element.parent(),p=this.getPosition(o);h="bottom"==h&&k.bottom+m>p.bottom?"top":"top"==h&&k.top-mp.width?"left":"left"==h&&k.left-lg.top+g.height&&(e.top=g.top+g.height-i)}else{var j=b.left-f,k=b.left+f+c;jg.width&&(e.left=g.left+g.width-k)}return e},c.prototype.getTitle=function(){var a,b=this.$element,c=this.options;return a=b.attr("data-original-title")||("function"==typeof c.title?c.title.call(b[0]):c.title)},c.prototype.getUID=function(a){do a+=~~(1e6*Math.random());while(document.getElementById(a));return a},c.prototype.tip=function(){return this.$tip=this.$tip||a(this.options.template)},c.prototype.arrow=function(){return this.$arrow=this.$arrow||this.tip().find(".tooltip-arrow")},c.prototype.enable=function(){this.enabled=!0},c.prototype.disable=function(){this.enabled=!1},c.prototype.toggleEnabled=function(){this.enabled=!this.enabled},c.prototype.toggle=function(b){var c=this;b&&(c=a(b.currentTarget).data("bs."+this.type),c||(c=new 
this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c))),c.tip().hasClass("in")?c.leave(c):c.enter(c)},c.prototype.destroy=function(){var a=this;clearTimeout(this.timeout),this.hide(function(){a.$element.off("."+a.type).removeData("bs."+a.type)})};var d=a.fn.tooltip;a.fn.tooltip=b,a.fn.tooltip.Constructor=c,a.fn.tooltip.noConflict=function(){return a.fn.tooltip=d,this}}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.popover"),f="object"==typeof b&&b;(e||!/destroy|hide/.test(b))&&(e||d.data("bs.popover",e=new c(this,f)),"string"==typeof b&&e[b]())})}var c=function(a,b){this.init("popover",a,b)};if(!a.fn.tooltip)throw new Error("Popover requires tooltip.js");c.VERSION="3.3.4",c.DEFAULTS=a.extend({},a.fn.tooltip.Constructor.DEFAULTS,{placement:"right",trigger:"click",content:"",template:''}),c.prototype=a.extend({},a.fn.tooltip.Constructor.prototype),c.prototype.constructor=c,c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.setContent=function(){var a=this.tip(),b=this.getTitle(),c=this.getContent();a.find(".popover-title")[this.options.html?"html":"text"](b),a.find(".popover-content").children().detach().end()[this.options.html?"string"==typeof c?"html":"append":"text"](c),a.removeClass("fade top bottom left right in"),a.find(".popover-title").html()||a.find(".popover-title").hide()},c.prototype.hasContent=function(){return this.getTitle()||this.getContent()},c.prototype.getContent=function(){var a=this.$element,b=this.options;return a.attr("data-content")||("function"==typeof b.content?b.content.call(a[0]):b.content)},c.prototype.arrow=function(){return this.$arrow=this.$arrow||this.tip().find(".arrow")};var d=a.fn.popover;a.fn.popover=b,a.fn.popover.Constructor=c,a.fn.popover.noConflict=function(){return a.fn.popover=d,this}}(jQuery),+function(a){"use strict";function b(c,d){this.$body=a(document.body),this.$scrollElement=a(a(c).is(document.body)?window:c),this.options=a.extend({},b.DEFAULTS,d),this.selector=(this.options.target||"")+" .nav li > a",this.offsets=[],this.targets=[],this.activeTarget=null,this.scrollHeight=0,this.$scrollElement.on("scroll.bs.scrollspy",a.proxy(this.process,this)),this.refresh(),this.process()}function c(c){return this.each(function(){var d=a(this),e=d.data("bs.scrollspy"),f="object"==typeof c&&c;e||d.data("bs.scrollspy",e=new b(this,f)),"string"==typeof c&&e[c]()})}b.VERSION="3.3.4",b.DEFAULTS={offset:10},b.prototype.getScrollHeight=function(){return this.$scrollElement[0].scrollHeight||Math.max(this.$body[0].scrollHeight,document.documentElement.scrollHeight)},b.prototype.refresh=function(){var b=this,c="offset",d=0;this.offsets=[],this.targets=[],this.scrollHeight=this.getScrollHeight(),a.isWindow(this.$scrollElement[0])||(c="position",d=this.$scrollElement.scrollTop()),this.$body.find(this.selector).map(function(){var b=a(this),e=b.data("target")||b.attr("href"),f=/^#./.test(e)&&a(e);return f&&f.length&&f.is(":visible")&&[[f[c]().top+d,e]]||null}).sort(function(a,b){return a[0]-b[0]}).each(function(){b.offsets.push(this[0]),b.targets.push(this[1])})},b.prototype.process=function(){var a,b=this.$scrollElement.scrollTop()+this.options.offset,c=this.getScrollHeight(),d=this.options.offset+c-this.$scrollElement.height(),e=this.offsets,f=this.targets,g=this.activeTarget;if(this.scrollHeight!=c&&this.refresh(),b>=d)return g!=(a=f[f.length-1])&&this.activate(a);if(g&&b=e[a]&&(void 0===e[a+1]||b .dropdown-menu > 
.active").removeClass("active").end().find('[data-toggle="tab"]').attr("aria-expanded",!1),b.addClass("active").find('[data-toggle="tab"]').attr("aria-expanded",!0),h?(b[0].offsetWidth,b.addClass("in")):b.removeClass("fade"),b.parent(".dropdown-menu").length&&b.closest("li.dropdown").addClass("active").end().find('[data-toggle="tab"]').attr("aria-expanded",!0),e&&e()}var g=d.find("> .active"),h=e&&a.support.transition&&(g.length&&g.hasClass("fade")||!!d.find("> .fade").length);g.length&&h?g.one("bsTransitionEnd",f).emulateTransitionEnd(c.TRANSITION_DURATION):f(),g.removeClass("in")};var d=a.fn.tab;a.fn.tab=b,a.fn.tab.Constructor=c,a.fn.tab.noConflict=function(){return a.fn.tab=d,this};var e=function(c){c.preventDefault(),b.call(a(this),"show")};a(document).on("click.bs.tab.data-api",'[data-toggle="tab"]',e).on("click.bs.tab.data-api",'[data-toggle="pill"]',e)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.affix"),f="object"==typeof b&&b;e||d.data("bs.affix",e=new c(this,f)),"string"==typeof b&&e[b]()})}var c=function(b,d){this.options=a.extend({},c.DEFAULTS,d),this.$target=a(this.options.target).on("scroll.bs.affix.data-api",a.proxy(this.checkPosition,this)).on("click.bs.affix.data-api",a.proxy(this.checkPositionWithEventLoop,this)),this.$element=a(b),this.affixed=null,this.unpin=null,this.pinnedOffset=null,this.checkPosition()};c.VERSION="3.3.4",c.RESET="affix affix-top affix-bottom",c.DEFAULTS={offset:0,target:window},c.prototype.getState=function(a,b,c,d){var e=this.$target.scrollTop(),f=this.$element.offset(),g=this.$target.height();if(null!=c&&"top"==this.affixed)return c>e?"top":!1;if("bottom"==this.affixed)return null!=c?e+this.unpin<=f.top?!1:"bottom":a-d>=e+g?!1:"bottom";var h=null==this.affixed,i=h?e:f.top,j=h?g:b;return null!=c&&c>=e?"top":null!=d&&i+j>=a-d?"bottom":!1},c.prototype.getPinnedOffset=function(){if(this.pinnedOffset)return this.pinnedOffset;this.$element.removeClass(c.RESET).addClass("affix");var a=this.$target.scrollTop(),b=this.$element.offset();return this.pinnedOffset=b.top-a},c.prototype.checkPositionWithEventLoop=function(){setTimeout(a.proxy(this.checkPosition,this),1)},c.prototype.checkPosition=function(){if(this.$element.is(":visible")){var b=this.$element.height(),d=this.options.offset,e=d.top,f=d.bottom,g=a(document.body).height();"object"!=typeof d&&(f=e=d),"function"==typeof e&&(e=d.top(this.$element)),"function"==typeof f&&(f=d.bottom(this.$element));var h=this.getState(g,b,e,f);if(this.affixed!=h){null!=this.unpin&&this.$element.css("top","");var i="affix"+(h?"-"+h:""),j=a.Event(i+".bs.affix");if(this.$element.trigger(j),j.isDefaultPrevented())return;this.affixed=h,this.unpin="bottom"==h?this.getPinnedOffset():null,this.$element.removeClass(c.RESET).addClass(i).trigger(i.replace("affix","affixed")+".bs.affix")}"bottom"==h&&this.$element.offset({top:g-b-f})}};var d=a.fn.affix;a.fn.affix=b,a.fn.affix.Constructor=c,a.fn.affix.noConflict=function(){return a.fn.affix=d,this},a(window).on("load",function(){a('[data-spy="affix"]').each(function(){var c=a(this),d=c.data();d.offset=d.offset||{},null!=d.offsetBottom&&(d.offset.bottom=d.offsetBottom),null!=d.offsetTop&&(d.offset.top=d.offsetTop),b.call(c,d)})})}(jQuery); --------------------------------------------------------------------------------