├── README.md
├── crawler-common
├── crawler-common.iml
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── website
│ │ └── crawler
│ │ └── App.java
│ └── test
│ └── java
│ └── com
│ └── website
│ └── crawler
│ └── AppTest.java
├── crawler-framework.iml
├── crawler-message
├── crawler-message.iml
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── website
│ │ └── crawler
│ │ └── App.java
│ └── test
│ └── java
│ └── com
│ └── website
│ └── crawler
│ └── AppTest.java
├── crawler-monitor
├── crawler-monitor.iml
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── website
│ │ └── crawler
│ │ └── App.java
│ └── test
│ └── java
│ └── com
│ └── website
│ └── crawler
│ └── AppTest.java
├── crawler-storage
├── crawler-storage.iml
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── website
│ │ └── crawler
│ │ └── App.java
│ └── test
│ └── java
│ └── com
│ └── website
│ └── crawler
│ └── AppTest.java
├── crawler-task
├── crawler-task.iml
└── pom.xml
├── crawler-web
├── crawler-web.iml
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── website
│ │ └── crawler
│ │ └── App.java
│ └── test
│ └── java
│ └── com
│ └── website
│ └── crawler
│ └── AppTest.java
├── crawler-webdriver
├── crawler-webdriver.iml
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── website
│ │ └── crawler
│ │ └── App.java
│ └── test
│ └── java
│ └── com
│ └── website
│ └── crawler
│ └── AppTest.java
├── pom.xml
└── src
├── main
└── java
│ └── com
│ └── website
│ └── crawler
│ └── App.java
└── test
└── java
└── com
└── website
└── crawler
└── AppTest.java
/README.md:
--------------------------------------------------------------------------------
1 | ### 基于webdriver分布式爬虫
2 | #### 涉及到系统的功能如下:
3 | 1. crawler-web 服务代理用户交互输入的数据,如:用户名,密码,验证码,短信密码等
4 | 2. crawler-webdriver 基于webdriver的定向爬取信息,如商品信息,新闻信息,图片信息等
5 | 3. crawler-storage 爬虫信息的存储服务,可以支持hbase存储,redis存储,文件系统存储等,支持横向无限扩容
6 | 4. crawler-task 分布式任务解析器,提供了以jsoup的方式解析爬取的网页信息
7 | 5. crawler-message 分布式消息系统,用来接收爬取结果信息和解析任务结果,并处理异步消息的消费和订阅
8 | 6. crawler-common 通用基础服务,对外提供如ip地址定位,phone定位,proxy ip随机选取等操作
9 | 7. crawler-monitor 监控爬虫网络从用户交互输入信息,后台抓取网页,网页解析,网页非结构化存储和结构化存储,消息同步等各个环节的监控,给出一个完整从爬取到交付数据团队使用中间链路的行为监控,性能监控,异常监控,数据监控,成功率等质量指标
10 |
--------------------------------------------------------------------------------
/crawler-common/crawler-common.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/crawler-common/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | framework
5 | com.website.crawler
6 | 1.0-SNAPSHOT
7 |
8 | 4.0.0
9 |
10 | crawler-common
11 | jar
12 |
13 | crawler-common
14 | http://maven.apache.org
15 |
16 |
17 | UTF-8
18 |
19 |
20 |
21 |
22 | junit
23 | junit
24 | 3.8.1
25 | test
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/crawler-common/src/main/java/com/website/crawler/App.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | /**
4 | * Minimal entry point that prints "Hello World!" to standard output.
5 | * The class has no state and no other methods.
6 | */
7 | public class App
8 | {
9 | public static void main( String[] args )
10 | {
11 | System.out.println( "Hello World!" );
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/crawler-common/src/test/java/com/website/crawler/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Skeleton JUnit test case; it does not exercise App yet.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Creates the test case with the given name.
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return a {@link TestSuite} constructed from {@code AppTest.class}
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Trivial test: asserts {@code true}, so it checks no application behavior.
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/crawler-framework.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/crawler-message/crawler-message.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/crawler-message/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | framework
5 | com.website.crawler
6 | 1.0-SNAPSHOT
7 |
8 | 4.0.0
9 |
10 | crawler-message
11 | jar
12 |
13 | crawler-message
14 | http://maven.apache.org
15 |
16 |
17 | UTF-8
18 |
19 |
20 |
21 |
22 | junit
23 | junit
24 | 3.8.1
25 | test
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/crawler-message/src/main/java/com/website/crawler/App.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | /**
4 | * Minimal entry point that prints "Hello World!" to standard output.
5 | * The class has no state and no other methods.
6 | */
7 | public class App
8 | {
9 | public static void main( String[] args )
10 | {
11 | System.out.println( "Hello World!" );
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/crawler-message/src/test/java/com/website/crawler/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Skeleton JUnit test case; it does not exercise App yet.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Creates the test case with the given name.
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return a {@link TestSuite} constructed from {@code AppTest.class}
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Trivial test: asserts {@code true}, so it checks no application behavior.
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/crawler-monitor/crawler-monitor.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/crawler-monitor/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | framework
5 | com.website.crawler
6 | 1.0-SNAPSHOT
7 |
8 | 4.0.0
9 |
10 | crawler-monitor
11 | jar
12 |
13 | crawler-monitor
14 | http://maven.apache.org
15 |
16 |
17 | UTF-8
18 |
19 |
20 |
21 |
22 | junit
23 | junit
24 | 3.8.1
25 | test
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/crawler-monitor/src/main/java/com/website/crawler/App.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | /**
4 | * Minimal entry point that prints "Hello World!" to standard output.
5 | * The class has no state and no other methods.
6 | */
7 | public class App
8 | {
9 | public static void main( String[] args )
10 | {
11 | System.out.println( "Hello World!" );
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/crawler-monitor/src/test/java/com/website/crawler/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Skeleton JUnit test case; it does not exercise App yet.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Creates the test case with the given name.
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return a {@link TestSuite} constructed from {@code AppTest.class}
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Trivial test: asserts {@code true}, so it checks no application behavior.
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/crawler-storage/crawler-storage.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/crawler-storage/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | framework
5 | com.website.crawler
6 | 1.0-SNAPSHOT
7 |
8 | 4.0.0
9 |
10 | crawler-storage
11 | jar
12 |
13 | crawler-storage
14 | http://maven.apache.org
15 |
16 |
17 | UTF-8
18 |
19 |
20 |
21 |
22 | junit
23 | junit
24 | 3.8.1
25 | test
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/crawler-storage/src/main/java/com/website/crawler/App.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | /**
4 | * Minimal entry point that prints "Hello World!" to standard output.
5 | * The class has no state and no other methods.
6 | */
7 | public class App
8 | {
9 | public static void main( String[] args )
10 | {
11 | System.out.println( "Hello World!" );
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/crawler-storage/src/test/java/com/website/crawler/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Skeleton JUnit test case; it does not exercise App yet.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Creates the test case with the given name.
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return a {@link TestSuite} constructed from {@code AppTest.class}
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Trivial test: asserts {@code true}, so it checks no application behavior.
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/crawler-task/crawler-task.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/crawler-task/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | framework
7 | com.website.crawler
8 | 1.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | crawler-task
13 |
14 |
15 |
--------------------------------------------------------------------------------
/crawler-web/crawler-web.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/crawler-web/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | framework
5 | com.website.crawler
6 | 1.0-SNAPSHOT
7 |
8 | 4.0.0
9 |
10 | crawler-web
11 | jar
12 |
13 | crawler-web
14 | http://maven.apache.org
15 |
16 |
17 | UTF-8
18 |
19 |
20 |
21 |
22 | junit
23 | junit
24 | 3.8.1
25 | test
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/crawler-web/src/main/java/com/website/crawler/App.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | /**
4 | * Minimal entry point that prints "Hello World!" to standard output.
5 | * The class has no state and no other methods.
6 | */
7 | public class App
8 | {
9 | public static void main( String[] args )
10 | {
11 | System.out.println( "Hello World!" );
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/crawler-web/src/test/java/com/website/crawler/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Skeleton JUnit test case; it does not exercise App yet.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Creates the test case with the given name.
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return a {@link TestSuite} constructed from {@code AppTest.class}
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Trivial test: asserts {@code true}, so it checks no application behavior.
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/crawler-webdriver/crawler-webdriver.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/crawler-webdriver/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | framework
5 | com.website.crawler
6 | 1.0-SNAPSHOT
7 |
8 | 4.0.0
9 |
10 | crawler-webdriver
11 | jar
12 |
13 | crawler-webdriver
14 | http://maven.apache.org
15 |
16 |
17 | UTF-8
18 |
19 |
20 |
21 |
22 | junit
23 | junit
24 | 3.8.1
25 | test
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/crawler-webdriver/src/main/java/com/website/crawler/App.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | /**
4 | * Minimal entry point that prints "Hello World!" to standard output.
5 | * The class has no state and no other methods.
6 | */
7 | public class App
8 | {
9 | public static void main( String[] args )
10 | {
11 | System.out.println( "Hello World!" );
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/crawler-webdriver/src/test/java/com/website/crawler/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Skeleton JUnit test case; it does not exercise App yet.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Creates the test case with the given name.
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return a {@link TestSuite} constructed from {@code AppTest.class}
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Trivial test: asserts {@code true}, so it checks no application behavior.
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.website.crawler
6 | framework
7 | 1.0-SNAPSHOT
8 |
9 | crawler-web
10 | crawler-webdriver
11 | crawler-task
12 | crawler-monitor
13 | crawler-message
14 | crawler-storage
15 | crawler-common
16 |
17 | pom
18 |
19 | common
20 | http://maven.apache.org
21 |
22 |
23 | UTF-8
24 |
25 |
26 |
27 |
28 | junit
29 | junit
30 | 3.8.1
31 | test
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/src/main/java/com/website/crawler/App.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | /**
4 | * Minimal entry point that prints "Hello World!" to standard output.
5 | * The class has no state and no other methods.
6 | */
7 | public class App
8 | {
9 | public static void main( String[] args )
10 | {
11 | System.out.println( "Hello World!" );
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/src/test/java/com/website/crawler/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.website.crawler;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Skeleton JUnit test case; it does not exercise App yet.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Creates the test case with the given name.
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return a {@link TestSuite} constructed from {@code AppTest.class}
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Trivial test: asserts {@code true}, so it checks no application behavior.
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------