├── README.md ├── crawler-common ├── crawler-common.iml ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── website │ │ └── crawler │ │ └── App.java │ └── test │ └── java │ └── com │ └── website │ └── crawler │ └── AppTest.java ├── crawler-framework.iml ├── crawler-message ├── crawler-message.iml ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── website │ │ └── crawler │ │ └── App.java │ └── test │ └── java │ └── com │ └── website │ └── crawler │ └── AppTest.java ├── crawler-monitor ├── crawler-monitor.iml ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── website │ │ └── crawler │ │ └── App.java │ └── test │ └── java │ └── com │ └── website │ └── crawler │ └── AppTest.java ├── crawler-storage ├── crawler-storage.iml ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── website │ │ └── crawler │ │ └── App.java │ └── test │ └── java │ └── com │ └── website │ └── crawler │ └── AppTest.java ├── crawler-task ├── crawler-task.iml └── pom.xml ├── crawler-web ├── crawler-web.iml ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── website │ │ └── crawler │ │ └── App.java │ └── test │ └── java │ └── com │ └── website │ └── crawler │ └── AppTest.java ├── crawler-webdriver ├── crawler-webdriver.iml ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── website │ │ └── crawler │ │ └── App.java │ └── test │ └── java │ └── com │ └── website │ └── crawler │ └── AppTest.java ├── pom.xml └── src ├── main └── java │ └── com │ └── website │ └── crawler │ └── App.java └── test └── java └── com └── website └── crawler └── AppTest.java /README.md: -------------------------------------------------------------------------------- 1 | ### 基于webdriver分布式爬虫 2 | #### 涉及到系统的功能如下: 3 | 1. crawler-web 服务代理用户交互输入的数据,如:用户名,密码,验证码,短信密码等 4 | 2. crawler-webdriver 基于webdriver的定向爬取信息,如商品信息,新闻信息,图片信息等 5 | 3. crawler-storage 爬虫信息的存储服务,可以支持hbase存储,redis存储,文件系统存储等,支持横向无限扩容 6 | 4. crawler-task 分布式任务解析器,提供了以jsoup的方式解析爬取网页信息 7 | 5. 
crawler-message 分布式消息系统,用来接收爬取结果信息和解析任务结果,处理异步消息的消费和订阅 8 | 6. crawler-common 通用基础服务,对外提供如ip地址定位,phone定位,proxy ip随机选取等操作 9 | 7. crawler-monitor 监控爬虫网络从用户交互输入信息,后台抓取网页,网页解析,网页非结构化存储和结构化存储,消息同步等各个环节的监控,给出一个完整从爬取到交付数据团队使用中间链路的行为监控,性能监控,异常监控,数据监控,成功率等质量指标 10 | -------------------------------------------------------------------------------- /crawler-common/crawler-common.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /crawler-common/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | framework 5 | com.website.crawler 6 | 1.0-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | crawler-common 11 | jar 12 | 13 | crawler-common 14 | http://maven.apache.org 15 | 16 | 17 | UTF-8 18 | 19 | 20 | 21 | 22 | junit 23 | junit 24 | 3.8.1 25 | test 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /crawler-common/src/main/java/com/website/crawler/App.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" ); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /crawler-common/src/test/java/com/website/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /crawler-framework.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /crawler-message/crawler-message.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /crawler-message/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | framework 5 | com.website.crawler 6 | 1.0-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | crawler-message 11 | jar 12 | 13 | crawler-message 14 | http://maven.apache.org 15 | 16 | 17 | UTF-8 18 | 19 | 20 | 21 | 22 | junit 23 | junit 24 | 3.8.1 25 | test 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /crawler-message/src/main/java/com/website/crawler/App.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" 
); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /crawler-message/src/test/java/com/website/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /crawler-monitor/crawler-monitor.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /crawler-monitor/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | framework 5 | com.website.crawler 6 | 1.0-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | crawler-monitor 11 | jar 12 | 13 | crawler-monitor 14 | http://maven.apache.org 15 | 16 | 17 | UTF-8 18 | 19 | 20 | 21 | 22 | junit 23 | junit 24 | 3.8.1 25 | test 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /crawler-monitor/src/main/java/com/website/crawler/App.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | /** 4 | * Hello world! 
5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" ); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /crawler-monitor/src/test/java/com/website/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /crawler-storage/crawler-storage.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /crawler-storage/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | framework 5 | com.website.crawler 6 | 1.0-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | crawler-storage 11 | jar 12 | 13 | crawler-storage 14 | http://maven.apache.org 15 | 16 | 17 | UTF-8 18 | 19 | 20 | 21 | 22 | junit 23 | junit 24 | 3.8.1 25 | test 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /crawler-storage/src/main/java/com/website/crawler/App.java: 
-------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" ); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /crawler-storage/src/test/java/com/website/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /crawler-task/crawler-task.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /crawler-task/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | framework 7 | com.website.crawler 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | crawler-task 13 | 14 | 15 | -------------------------------------------------------------------------------- /crawler-web/crawler-web.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /crawler-web/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | framework 5 | com.website.crawler 6 | 1.0-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | crawler-web 11 | jar 12 | 13 | crawler-web 14 | http://maven.apache.org 15 | 16 | 17 | UTF-8 18 | 19 | 20 | 21 | 22 | junit 23 | junit 24 | 3.8.1 25 | test 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /crawler-web/src/main/java/com/website/crawler/App.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" ); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /crawler-web/src/test/java/com/website/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /crawler-webdriver/crawler-webdriver.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /crawler-webdriver/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | framework 5 | com.website.crawler 6 | 1.0-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | crawler-webdriver 11 | jar 12 | 13 | crawler-webdriver 14 | http://maven.apache.org 15 | 16 | 17 | UTF-8 18 | 19 | 20 | 21 | 22 | junit 23 | junit 24 | 3.8.1 25 | test 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /crawler-webdriver/src/main/java/com/website/crawler/App.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" 
); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /crawler-webdriver/src/test/java/com/website/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.website.crawler 6 | framework 7 | 1.0-SNAPSHOT 8 | 9 | crawler-web 10 | crawler-webdriver 11 | crawler-task 12 | crawler-monitor 13 | crawler-message 14 | crawler-storage 15 | crawler-common 16 | 17 | pom 18 | 19 | common 20 | http://maven.apache.org 21 | 22 | 23 | UTF-8 24 | 25 | 26 | 27 | 28 | junit 29 | junit 30 | 3.8.1 31 | test 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/com/website/crawler/App.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" 
); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/test/java/com/website/crawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.website.crawler; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | --------------------------------------------------------------------------------