├── .classpath
├── .project
├── .settings
└── org.eclipse.jdt.core.prefs
├── README.md
├── bin
├── jar
│ ├── commons-codec-1.6.jar
│ ├── commons-logging-1.1.3.jar
│ ├── dom4j-1.6.1.jar
│ ├── fastjson-1.2.2.jar
│ ├── fluent-hc-4.3.1.jar
│ ├── httpclient-4.3.1.jar
│ ├── httpclient-cache-4.3.1.jar
│ ├── httpcore-4.3.jar
│ ├── httpmime-4.3.1.jar
│ ├── json-jena-1.0.jar
│ ├── jsoup-1.10.2.jar
│ └── mysql-connector-java-5.1.18.jar
└── main
│ ├── java
│ └── SohuSpider
│ │ ├── bean
│ │ └── NewsBean.class
│ │ ├── count
│ │ └── Counter.class
│ │ ├── filter
│ │ ├── BloomFilter$SimpleHash.class
│ │ ├── BloomFilter.class
│ │ └── Test.class
│ │ ├── main.class
│ │ ├── miniSpider
│ │ └── IpSpider.class
│ │ ├── service
│ │ ├── SpiderService$1.class
│ │ ├── SpiderService$2.class
│ │ ├── SpiderService$3$1.class
│ │ ├── SpiderService$3.class
│ │ └── SpiderService.class
│ │ └── util
│ │ ├── DBStatement.class
│ │ ├── HttpUtils.class
│ │ ├── JSoupUtils.class
│ │ ├── JsonUtils.class
│ │ └── XmlUtils.class
│ └── resources
│ ├── entry-config.xml
│ └── proxyip.txt
├── bits.ser
├── src
├── jar
│ ├── commons-codec-1.6.jar
│ ├── commons-logging-1.1.3.jar
│ ├── dom4j-1.6.1.jar
│ ├── fastjson-1.2.2.jar
│ ├── fluent-hc-4.3.1.jar
│ ├── httpclient-4.3.1.jar
│ ├── httpclient-cache-4.3.1.jar
│ ├── httpcore-4.3.jar
│ ├── httpmime-4.3.1.jar
│ ├── json-jena-1.0.jar
│ ├── jsoup-1.10.2.jar
│ └── mysql-connector-java-5.1.18.jar
└── main
│ ├── java
│ └── SohuSpider
│ │ ├── bean
│ │ └── NewsBean.java
│ │ ├── count
│ │ └── Counter.java
│ │ ├── filter
│ │ ├── BloomFilter.java
│ │ └── Test.java
│ │ ├── main.java
│ │ ├── miniSpider
│ │ └── IpSpider.java
│ │ ├── service
│ │ └── SpiderService.java
│ │ └── util
│ │ ├── DBStatement.java
│ │ ├── HttpUtils.java
│ │ ├── JSoupUtils.java
│ │ ├── JsonUtils.java
│ │ └── XmlUtils.java
│ └── resources
│ ├── entry-config.xml
│ └── proxyip.txt
└── urlQueue.ser
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | 搜狐爬虫(JAVA)
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.5
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.5
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 搜狐新闻爬虫(Java版)
2 | =================
3 |
4 | 2017.5.2
5 | ------------
6 |
7 | 采用知乎上某位大牛的框架进行改写
8 | 没有使用任何其他框架
9 | 可以实现海量数据新闻去重,多线程
10 | 序列化url队列,暂停之后依然可以去重
11 | 本地测试已爬取40w+新闻
12 |
13 | 工程中的一些结构说明:
14 | SohuSpider
15 | --main.java 主程序入口函数
16 | SohuSpider.count 数据库条目数量查询,单独main函数
17 | SohuSpider.filter bloomFilter算法实现
18 | SohuSpider.miniSpider ip代理采集爬虫
19 | SohuSpider.service 爬虫主体部分
20 | SohuSpider.util 一些json解析,请求网页等工具类
21 |
--------------------------------------------------------------------------------
/bin/jar/commons-codec-1.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/commons-codec-1.6.jar
--------------------------------------------------------------------------------
/bin/jar/commons-logging-1.1.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/commons-logging-1.1.3.jar
--------------------------------------------------------------------------------
/bin/jar/dom4j-1.6.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/dom4j-1.6.1.jar
--------------------------------------------------------------------------------
/bin/jar/fastjson-1.2.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/fastjson-1.2.2.jar
--------------------------------------------------------------------------------
/bin/jar/fluent-hc-4.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/fluent-hc-4.3.1.jar
--------------------------------------------------------------------------------
/bin/jar/httpclient-4.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/httpclient-4.3.1.jar
--------------------------------------------------------------------------------
/bin/jar/httpclient-cache-4.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/httpclient-cache-4.3.1.jar
--------------------------------------------------------------------------------
/bin/jar/httpcore-4.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/httpcore-4.3.jar
--------------------------------------------------------------------------------
/bin/jar/httpmime-4.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/httpmime-4.3.1.jar
--------------------------------------------------------------------------------
/bin/jar/json-jena-1.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/json-jena-1.0.jar
--------------------------------------------------------------------------------
/bin/jar/jsoup-1.10.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/jsoup-1.10.2.jar
--------------------------------------------------------------------------------
/bin/jar/mysql-connector-java-5.1.18.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/mysql-connector-java-5.1.18.jar
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/bean/NewsBean.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/bean/NewsBean.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/count/Counter.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/count/Counter.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/filter/BloomFilter$SimpleHash.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/filter/BloomFilter$SimpleHash.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/filter/BloomFilter.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/filter/BloomFilter.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/filter/Test.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/filter/Test.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/main.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/main.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/miniSpider/IpSpider.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/miniSpider/IpSpider.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/service/SpiderService$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService$1.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/service/SpiderService$2.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService$2.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/service/SpiderService$3$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService$3$1.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/service/SpiderService$3.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService$3.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/service/SpiderService.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/util/DBStatement.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/DBStatement.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/util/HttpUtils.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/HttpUtils.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/util/JSoupUtils.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/JSoupUtils.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/util/JsonUtils.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/JsonUtils.class
--------------------------------------------------------------------------------
/bin/main/java/SohuSpider/util/XmlUtils.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/XmlUtils.class
--------------------------------------------------------------------------------
/bin/main/resources/entry-config.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | http://m.sohu.com/c/32/
5 | http://m.sohu.com/c/57/
6 | http://m.sohu.com/c/53/
7 | http://m.sohu.com/cl/2686/
8 | http://m.sohu.com/c/2714/
9 | http://m.sohu.com/c/15/
10 | http://m.sohu.com/c/40/
11 | http://m.sohu.com/c/284/
12 | http://m.sohu.com/c/290/
13 | http://m.sohu.com/c/399/
14 | http://m.sohu.com/c/546/
15 | http://m.sohu.com/c/19/
16 | http://m.sohu.com/c/46/
17 | http://m.sohu.com/c/301/
18 | http://m.sohu.com/c/295/
19 | http://m.sohu.com/c/315/
20 | http://m.sohu.com/c/24/
21 | http://m.sohu.com/c/27/
22 | http://m.sohu.com/c/28/
23 | http://m.sohu.com/c/31/
24 | http://m.sohu.com/c/26/
25 | http://m.sohu.com/c/208/
26 | http://m.sohu.com/c/79/
27 | http://m.sohu.com/c/81/
28 | http://m.sohu.com/c/1918/
29 | http://m.sohu.com/c/1944/
30 | http://m.sohu.com/cl/2026/
31 | http://m.sohu.com/c/3445/
32 | http://m.sohu.com/car/model/index?_once_=000105_carmodel
33 | http://m.sohu.com/cl/33/
34 | http://m.sohu.com/c/22/
35 | http://m.sohu.com/c/103/
36 | http://m.sohu.com/cl/50/
37 | http://m.sohu.com/cl/49/
38 | http://m.sohu.com/cl/29/
39 | http://m.sohu.com/cl/34/
40 | http://m.sohu.com/cl/409/
41 | http://m.sohu.com/cl/51/
42 | http://m.sohu.com/cl/134/
43 | http://m.sohu.com/c/16430/?v=3
44 | http://m.sohu.com/c/101/?v=3
45 | http://m.sohu.com/c/61/?v=3
46 | http://m.sohu.com/c/74/
47 | http://m.sohu.com/c/267/?v=3
48 | http://m.sohu.com/cl/483/
49 | http://m.sohu.com/cl/5124/
50 | http://m.sohu.com/cl/5123/
51 | http://m.sohu.com/cl/470/
52 | http://m.sohu.com/cl/69/
53 | http://m.sohu.com/cl/182/
54 | http://m.sohu.com/cl/199/
55 | http://m.sohu.com/cl/70/
56 | http://m.sohu.com/cl/187/
57 | http://m.sohu.com/c/527/
58 | http://m.sohu.com/cl/483/
59 | http://m.sohu.com/cl/188/
60 | http://m.sohu.com/cl/189/
61 | http://m.sohu.com/cl/195/
62 | http://m.sohu.com/cl/310/
63 | http://m.sohu.com/cl/309/
64 | http://m.sohu.com/c/3124/
65 | http://m.sohu.com/c/3367/
66 | http://m.sohu.com/cl/313/
67 | http://m.sohu.com/cr/2543/
68 | http://m.sohu.com/cr/2560/
69 | http://m.sohu.com/cr/2561/
70 | http://m.sohu.com/cr/2562/
71 | http://m.sohu.com/cr/2563/
72 |
73 |
--------------------------------------------------------------------------------
/bin/main/resources/proxyip.txt:
--------------------------------------------------------------------------------
1 | 112.82.201.237:13864
2 | 175.155.24.2:808
3 | 110.6.75.164:46772
4 | 110.73.28.168:8123
5 | 121.204.165.166:8118
6 | 183.153.2.115:808
7 | 183.78.183.156:82
8 | 183.32.88.97:808
9 | 110.73.14.124:8123
10 | 110.72.33.106:8123
11 | 221.216.94.77:808
12 | 124.88.67.19:80
13 | 61.191.173.31:808
14 | 202.121.96.33:8086
15 | 110.73.7.65:8123
16 | 110.73.0.38:8123
17 | 222.85.50.127:808
18 | 183.32.88.206:808
19 |
--------------------------------------------------------------------------------
/bits.ser:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bits.ser
--------------------------------------------------------------------------------
/src/jar/commons-codec-1.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/commons-codec-1.6.jar
--------------------------------------------------------------------------------
/src/jar/commons-logging-1.1.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/commons-logging-1.1.3.jar
--------------------------------------------------------------------------------
/src/jar/dom4j-1.6.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/dom4j-1.6.1.jar
--------------------------------------------------------------------------------
/src/jar/fastjson-1.2.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/fastjson-1.2.2.jar
--------------------------------------------------------------------------------
/src/jar/fluent-hc-4.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/fluent-hc-4.3.1.jar
--------------------------------------------------------------------------------
/src/jar/httpclient-4.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/httpclient-4.3.1.jar
--------------------------------------------------------------------------------
/src/jar/httpclient-cache-4.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/httpclient-cache-4.3.1.jar
--------------------------------------------------------------------------------
/src/jar/httpcore-4.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/httpcore-4.3.jar
--------------------------------------------------------------------------------
/src/jar/httpmime-4.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/httpmime-4.3.1.jar
--------------------------------------------------------------------------------
/src/jar/json-jena-1.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/json-jena-1.0.jar
--------------------------------------------------------------------------------
/src/jar/jsoup-1.10.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/jsoup-1.10.2.jar
--------------------------------------------------------------------------------
/src/jar/mysql-connector-java-5.1.18.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/mysql-connector-java-5.1.18.jar
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/bean/NewsBean.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.bean;
2 |
/**
 * Plain data holder for one crawled news article.
 *
 * All properties are Strings and remain null until the matching setter is called.
 */
public class NewsBean {
    String url;        // address of the news page
    String category;   // news category
    String sourceFrom; // news source
    String title;      // headline
    String content;    // article body
    String date;       // publication time
    String editor;     // author of the article

    /** @return the address of the news page */
    public String getUrl() {
        return this.url;
    }

    /** @param url the address of the news page */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the news category */
    public String getCategory() {
        return this.category;
    }

    /** @param category the news category */
    public void setCategory(String category) {
        this.category = category;
    }

    /** @return the news source */
    public String getSourceFrom() {
        return this.sourceFrom;
    }

    /** @param sourceFrom the news source */
    public void setSourceFrom(String sourceFrom) {
        this.sourceFrom = sourceFrom;
    }

    /** @return the headline */
    public String getTitle() {
        return this.title;
    }

    /** @param title the headline */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article body */
    public String getContent() {
        return this.content;
    }

    /** @param content the article body */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the publication time */
    public String getDate() {
        return this.date;
    }

    /** @param date the publication time */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the author of the article */
    public String getEditor() {
        return this.editor;
    }

    /** @param editor the author of the article */
    public void setEditor(String editor) {
        this.editor = editor;
    }

    /**
     * Renders every property on its own line, in the order
     * title, url, date, category, sourceFrom, editor, content.
     * Unset properties are rendered as the text "null".
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("NewsBean:{ \n");
        text.append(" title:").append(title).append("\n");
        text.append(" url:").append(url).append("\n");
        text.append(" date:").append(date).append("\n");
        text.append(" category:").append(category).append("\n");
        text.append(" sourceFrom:").append(sourceFrom).append("\n");
        text.append(" editor:").append(editor).append("\n");
        text.append(" content:").append(content).append("\n");
        text.append(" }");
        return text.toString();
    }
}
90 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/count/Counter.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.count;
2 |
3 | import java.sql.Connection;
4 | import java.sql.ResultSet;
5 | import java.sql.SQLException;
6 | import java.sql.Statement;
7 |
8 | import main.java.SohuSpider.util.DBStatement;
9 |
10 | /*
11 | * 查询数据库中新闻条目的数量
12 | */
13 |
14 | public class Counter {
15 | //数据库连接
16 | static Connection con = DBStatement.getCon();
17 |
18 | static Statement stmt = DBStatement.getInstance();
19 |
20 | static String sqlCount = "select count(*) from news_info";
21 | static void monitor(){
22 | while (true) {
23 | try {
24 | ResultSet rs = stmt.executeQuery(sqlCount);
25 |
26 | /**
27 | * 一定要先将结果集指针移动到第一行
28 | */
29 | rs.next();
30 | System.out.println(rs.getInt(1));
31 | } catch (SQLException e) {
32 | e.printStackTrace();
33 | }
34 | try {
35 | Thread.sleep(5000); //每隔5s查询一次
36 | } catch (InterruptedException e) {
37 | e.printStackTrace();
38 | }
39 | }
40 |
41 | }
42 |
43 | public static void main(String[] args){
44 | monitor();
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/filter/BloomFilter.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.filter;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.FileOutputStream;
6 | import java.io.ObjectInputStream;
7 | import java.io.ObjectOutputStream;
8 | import java.io.Serializable;
9 | import java.util.BitSet;
10 | import java.util.concurrent.BlockingQueue;
11 |
12 |
13 | /*
14 | * Bloom Filter算法 高效去重策略
15 | */
16 |
17 | public class BloomFilter implements Serializable{
18 |
19 | /* BitSet初始分配空间大小 2^24 */
20 | private static final int DEFAULT_SIZE = 1 << 25;
21 |
22 | /* 不同哈希函数种子,一般应取质数 */
23 | private static final int[] seeds = new int[]{5,7,11,13,31,37,61};
24 |
25 | private BitSet bits = null;
26 |
27 | /* 哈希函数对象 */
28 | private SimpleHash[] func = new SimpleHash[seeds.length];
29 |
30 | public BloomFilter(){
31 | for(int i = 0; i < seeds.length; i++){
32 | func[i] = new SimpleHash(DEFAULT_SIZE,seeds[i]);
33 | }
34 |
35 | File filterSer = new File("bits.ser");
36 | if (filterSer.exists()) {
37 | try{
38 | //对象反序列化
39 | ObjectInputStream ois = new ObjectInputStream(new FileInputStream(filterSer));
40 | bits = (BitSet) ois.readObject();
41 | ois.close();
42 | } catch (Exception e) {
43 | e.printStackTrace();
44 | }
45 | }else{
46 | bits = new BitSet(DEFAULT_SIZE);
47 | }
48 |
49 | }
50 |
51 | //将字符串映射到bits中
52 | public synchronized/*同步锁标记*/ void add(String value){
53 | for(SimpleHash f : func){
54 | bits.set(f.hash(value),true);
55 | }
56 | }
57 |
58 | public BitSet getBitset(){
59 | return bits;
60 | }
61 |
62 | //判断字符串是否已存在于bits集合中
63 | public boolean contains(String value){
64 | if(value == null)
65 | return false;
66 |
67 | boolean ret = true;
68 | for(SimpleHash f : func){
69 | ret = ret && bits.get(f.hash(value)); //当前仅当所有哈希函数计算出的标志位都为1的时候确定字符串一定在集合中
70 | }
71 |
72 | return ret;
73 | }
74 |
75 | /*哈希函数类*/
76 | public static class SimpleHash {
77 | private int cap;
78 | private int seed;
79 |
80 | public SimpleHash(int cap, int seed){
81 | this.cap = cap;
82 | this.seed = seed;
83 | }
84 |
85 | //hash函数,采用简单的加权和hash
86 | public int hash(String value){
87 | int result = 0 ;
88 | int len = value.length();
89 | for(int i = 0; i < len; i++){
90 | result = seed * result + value.charAt(i);
91 | }
92 | return (cap - 1) & result;
93 | }
94 | }
95 |
96 | }
97 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/filter/Test.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.filter;
2 |
3 | public class Test {
4 | private BloomFilter filter = new BloomFilter();
5 |
6 | private String[] URLs = {
7 | "www.baidu.com",
8 | "www.sohu.com",
9 | "www.sina.com",
10 | "www.google.com",
11 | "www.facebook.com",
12 | "www.wangyi.com",
13 | "www.sina.com",
14 | "www.163.com",
15 | "www.baidu.com"
16 | };
17 |
18 | public void testBloomFilter(){
19 | for(String url : URLs){
20 | if(filter.contains(url)){
21 | System.err.println('"' + url + '"' + " already exists in bits!");
22 | }
23 | else{
24 | filter.add(url);
25 | System.out.println("add " + '"' + url + '"' + " into bits. ");
26 | }
27 | }
28 | }
29 |
30 | public static void main(String[] args){
31 | Test test = new Test();
32 | test.testBloomFilter();
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/main.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider;
2 |
3 | import main.java.SohuSpider.service.SpiderService;
4 |
5 | public class main{
6 | /**
7 | * 搜狐爬虫入口
8 | *
9 | * @param
10 | * @throws InterruptedException
11 | *
12 | */
13 | public static void main(String[] args) throws InterruptedException{
14 | SpiderService spider = new SpiderService();
15 | spider.start();
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/miniSpider/IpSpider.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.miniSpider;
2 |
3 | import static main.java.SohuSpider.util.JSoupUtils.getDocument;
4 |
5 | import java.io.BufferedReader;
6 | import java.io.BufferedWriter;
7 | import java.io.FileInputStream;
8 | import java.io.FileWriter;
9 | import java.io.IOException;
10 | import java.io.InputStreamReader;
11 |
12 | import org.jsoup.Jsoup;
13 | import org.jsoup.nodes.Document;
14 | import org.jsoup.nodes.Element;
15 | import org.jsoup.select.Elements;
16 |
17 | /*
18 | * 获取可用代理ip
19 | */
20 | public class IpSpider {
21 |
22 | //代理ip网址
23 | static String proxyHost = "http://www.xicidaili.com/nn/1";
24 |
25 | static String userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36";
26 |
27 | //测试Ip代理的网址
28 | static String testUrl = "http://ip.chinaz.com/getip.aspx";
29 |
30 | static void getProxyIp() {
31 | try {
32 | //得到项目根目录
33 | String rootPath = System.getProperty("user.dir");
34 | //System.out.println(rootPath);
35 | BufferedWriter proxyIpWriter = new BufferedWriter(new FileWriter(rootPath + "/src/main/resources/proxyip.txt"));
36 | Document doc = getDocument(proxyHost);
37 | Elements ips = doc.select("#ip_list tr");
38 | //System.out.println(ips.size());
39 |
40 | for (Element e : ips) {
41 | Elements ip = e.select("td");
42 | if (ip.size() > 2){
43 | String ipAddr = ip.get(1).text();
44 | int port = Integer.parseInt(ip.get(2).text());
45 | if (testIp(ipAddr,port)) {
46 | System.out.println(ipAddr + ":" + port + " 可用");
47 | proxyIpWriter.write(ipAddr + ":" + port);
48 | proxyIpWriter.newLine();
49 | }
50 | }
51 | }
52 | proxyIpWriter.flush();
53 | proxyIpWriter.close();
54 | } catch (IOException e1) {
55 | e1.printStackTrace();
56 | }
57 | }
58 |
59 | static boolean testIp(String ip, int port) {
60 | try{
61 |
62 | //如果3s内没有响应,则该ip不可用
63 | Document doc = Jsoup.connect(testUrl)
64 | .userAgent(userAgent)
65 | .proxy(ip, port)
66 | .timeout(3000)
67 | .get();
68 | return true;
69 | } catch(Exception e) {
70 | System.out.println("访问超时");
71 | return false;
72 | }
73 | }
74 |
75 | public static void main(String[] args) {
76 | /*
77 | //添加程序监听结束
78 | Runtime.getRuntime().addShutdownHook(new Thread(new Runnable(){
79 | public void run() {
80 | System.out.println("程序结束了!");
81 |
82 | }
83 |
84 | }));
85 | */
86 | getProxyIp();
87 | System.out.println("成功获取可用代理Ip!");
88 | }
89 |
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/service/SpiderService.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.service;
2 |
3 | import java.util.List;
4 | import java.util.ArrayList;
5 | import java.io.File;
6 | import java.io.FileInputStream;
7 | import java.io.FileNotFoundException;
8 | import java.io.FileOutputStream;
9 | import java.io.IOException;
10 | import java.io.ObjectInputStream;
11 | import java.io.ObjectOutputStream;
12 | import java.io.Serializable;
13 | import java.sql.Connection;
14 | import java.sql.PreparedStatement;
15 | import java.sql.ResultSet;
16 | import java.sql.SQLException;
17 | import java.sql.Statement;
18 | import java.util.concurrent.BlockingQueue;
19 | import java.util.concurrent.Executor;
20 | import java.util.concurrent.Executors;
21 | import java.util.concurrent.LinkedBlockingQueue;
22 | import java.util.concurrent.ThreadPoolExecutor;
23 | import java.util.regex.Matcher;
24 | import java.util.regex.Pattern;
25 | import java.util.Date;
26 |
27 | import org.json.JSONException;
28 | import org.jsoup.nodes.Document;
29 | import org.jsoup.nodes.Element;
30 | import org.jsoup.select.Elements;
31 |
32 | import main.java.SohuSpider.bean.NewsBean;
33 | import main.java.SohuSpider.filter.BloomFilter;
34 | import main.java.SohuSpider.util.DBStatement;
35 | import static main.java.SohuSpider.util.XmlUtils.getAllChannels;
36 | import static main.java.SohuSpider.util.JSoupUtils.getDocument;
37 | import static main.java.SohuSpider.util.JsonUtils.parseRestContent;
38 | import static main.java.SohuSpider.util.XmlUtils.writeEntryUrls;
39 | import static main.java.SohuSpider.util.XmlUtils.loadEntryUrls;
40 |
41 | public class SpiderService implements Serializable {
42 |
43 | //使用BloomFilter算法去重
44 | static BloomFilter filter = new BloomFilter();
45 |
46 | //url阻塞队列
47 | BlockingQueue urlQueue = null;
48 |
49 | //数据库连接
50 | static Connection con = DBStatement.getCon();
51 |
52 | static Statement stmt = DBStatement.getInstance();
53 |
54 | static PreparedStatement ps = null;
55 |
56 | //线程池
57 | static Executor executor = Executors.newFixedThreadPool(20);
58 |
59 | static String urlHost = "http://m.sohu.com";
60 |
61 | //导航页面url
62 | static String urlNavigation = "https://m.sohu.com/c/395/?_once_=000025_zhitongche_daohang_v3";
63 |
64 | //爬取深度
65 | static int DEFAULT_DEPTH = 10;
66 |
67 | static int DEFAULT_THREAD_NUM = 10;
68 |
69 | public void start() throws InterruptedException{
70 |
71 | File urlsSer = new File("urlQueue.ser");
72 | if (urlsSer.exists()){
73 |
74 | try{
75 | //对象反序列化
76 | ObjectInputStream ois = new ObjectInputStream(new FileInputStream(urlsSer));
77 | urlQueue = (BlockingQueue) ois.readObject();
78 |
79 | ois.close();
80 | } catch (Exception e) {
81 | e.printStackTrace();
82 | }
83 | }
84 | else{
85 | //创建阻塞队列
86 | urlQueue = new LinkedBlockingQueue();
87 |
88 | //获取入口Url
89 | List urlChannels = genEntryChannel(urlNavigation);
90 |
91 | for (String url : urlChannels) {
92 | urlQueue.add(url);
93 | System.out.println(url);
94 | }
95 | }
96 |
97 |
98 | //添加程序监听结束,程序结束时候应序列化两个重要对象--urlQueue和filter
99 | Runtime.getRuntime().addShutdownHook(new Thread(new Runnable(){
100 |
101 | public void run() {
102 | System.out.println(urlQueue.isEmpty());
103 | try{
104 | if (urlQueue.isEmpty() == false) {
105 | //序列化urlQueue
106 | ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream("urlQueue.ser"));
107 | os.writeObject(urlQueue);
108 | os.close();
109 |
110 | }
111 |
112 | //序列化bits
113 | ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream("bits.ser"));
114 | os.writeObject(filter.getBitset());
115 | os.close();
116 | } catch(Exception e) {
117 | e.printStackTrace();
118 | }
119 |
120 | }
121 | }));
122 |
123 | for(int i = 0; i < DEFAULT_THREAD_NUM; i++){
124 | Thread a = new Thread(new Runnable() {
125 |
126 | public void run() {
127 | while (true) {
128 | String url = getAUrl();
129 | if (!filter.contains(url)) {
130 | filter.add(url);
131 | System.out.println(Thread.currentThread().getName()+"正在爬取url:" + url);
132 | if (url != null) {
133 | crawler(url);
134 | }
135 | }else {
136 | System.out.println("此url存在,不爬了." + url);
137 | }
138 | }
139 |
140 | }
141 |
142 | });
143 | executor.execute(a);
144 | }
145 |
146 | //线程池监视线程
147 | new Thread(new Runnable(){
148 | public void run() {
149 | while(true) {
150 | try{
151 | if (((ThreadPoolExecutor)executor).getActiveCount() < 10) {
152 | Thread a = new Thread(new Runnable() {
153 | public void run() {
154 | while (true) {
155 | String url = getAUrl();
156 | if (!filter.contains(url)) {
157 | filter.add(url);
158 | System.out.println(Thread.currentThread().getName()+"正在爬取url:" + url);
159 | if (url != null) {
160 | crawler(url);
161 | }
162 | }else {
163 | System.out.println("此url存在, 不爬了." + url);
164 | }
165 | }
166 | }
167 | });
168 | executor.execute(a);
169 | if (urlQueue.size() == 0) {
170 | System.out.println("队列为0了!!!!!!!");
171 | }
172 | }
173 | Thread.sleep(3000);
174 | } catch (InterruptedException e) {
175 | e.printStackTrace();
176 | }
177 | }
178 |
179 | }
180 |
181 | }).start();
182 |
183 | }
184 |
185 | /* 从导航页解析入口新闻url */
186 | public static List genEntryChannel (String startUrl) {
187 |
188 | List urlArray = new ArrayList();
189 | //小说类别的url不需要,其url特征是含有单词read
190 | String pattern = "^/c.*";
191 |
192 | Document doc = getDocument(startUrl);
193 | Elements Urls = doc.select("a.h3Sub");
194 | for (Element url : Urls) {
195 | String link = url.attr("href");
196 | if (Pattern.matches(pattern, link) == true) {
197 | urlArray.add(urlHost + link);
198 | }
199 | }
200 |
201 | writeEntryUrls(urlArray);
202 | return urlArray;
203 | }
204 |
205 |
206 | /* 爬取新闻网页 */
207 | public void crawler(String url) {
208 |
209 | Document doc = getDocument(url); //返回的Document对象一定是正确的
210 |
211 | String pattern = ".*/n/[0-9]+/.*";
212 | //System.out.println(Pattern.matches(pattern, url));
213 | if (Pattern.matches(pattern, url)){
214 |
215 | String title = "";
216 | String category = null;
217 | String sourceFrom = null;
218 | String date = null;
219 | String content = "";
220 | String editor = null;
221 |
222 | NewsBean news = new NewsBean();
223 |
224 | news.setUrl(url);
225 |
226 | try{
227 | /**
228 | * 新闻标题格式 题目-类别-手机搜狐
229 | * 但是有些题目中本身就含有 "-"
230 | */
231 | String[] temp = doc.title().trim().split("-");
232 | category = temp[temp.length - 2].substring(0, 2);
233 | for (int i = 0; i < temp.length - 2; i++){
234 | title += temp[i];
235 | }
236 | } catch (ArrayIndexOutOfBoundsException e) {
237 | //e.printStackTrace();
238 | return ;
239 | }
240 |
241 | news.setCategory(category);
242 | news.setTitle(title);
243 |
244 | Elements articleInfo = doc.body().select("div.article-info");
245 | if ( articleInfo.isEmpty() == false) {
246 | try{
247 | String[] temp = articleInfo.first().text().split(" ");
248 | sourceFrom = temp[0];
249 | date = temp[1];
250 | } catch (ArrayIndexOutOfBoundsException e) {
251 | e.printStackTrace();
252 | return ;
253 | }
254 | }
255 | news.setSourceFrom(sourceFrom);
256 | news.setDate(date);
257 |
258 | Elements paras = doc.body().select("article p");
259 | if ( paras.isEmpty() == false) {
260 | for (Element e : paras) {
261 | content += e.text();
262 | content += "\n";
263 | }
264 | }
265 |
266 | news.setContent(content);
267 |
268 | if (content.length() > 8000) {
269 | return ;
270 | }
271 |
272 |
273 | Elements divEditor = doc.body().select("div.editor");
274 | if (divEditor.isEmpty() == false) {
275 | editor = divEditor.first().text();
276 | }
277 | news.setEditor(editor);
278 |
279 | //打印用户信息
280 | System.out.println("爬取成功:" + news);
281 |
282 | String sql = "insert into news_info " +
283 | "(title,url,cate,date,srcFrom,content,editor) " +
284 | "values (?,?,?,?,?,?,?)";
285 | try {
286 | ps = con.prepareStatement(sql, Statement.SUCCESS_NO_INFO);
287 | ps.setString(1, news.getTitle());
288 | ps.setString(2, news.getUrl());
289 | ps.setString(3, news.getCategory());
290 | ps.setString(4, news.getDate());
291 | ps.setString(5, news.getSourceFrom());
292 | ps.setString(6, news.getContent());
293 | ps.setString(7, news.getEditor());
294 | //存储news
295 | ps.executeUpdate();
296 | }catch (Exception e){
297 | e.printStackTrace();
298 | }
299 | }
300 |
301 | //新闻正文url的特征 https://m.sohu.com/n/488483157/
302 | Elements urlCandidates = doc.body().select("a[href~=(.*/n/[0-9]+/)|(.*/c.*)]");
303 | for (Element e : urlCandidates){
304 | url = urlHost + e.attr("href");
305 | try {
306 | urlQueue.put(url);
307 | } catch (InterruptedException e1) {
308 |
309 | e1.printStackTrace();
310 | }
311 | }
312 |
313 | }
314 |
315 |
316 | public String getAUrl() {
317 | String tmpAUrl;
318 | try {
319 | tmpAUrl= urlQueue.take();
320 | return tmpAUrl;
321 | } catch (InterruptedException e) {
322 | e.printStackTrace();
323 | }
324 | return null;
325 | }
326 |
327 |
328 | }
329 |
330 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/util/DBStatement.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.util;
2 |
3 | import java.sql.Connection;
4 | import java.sql.DriverManager;
5 | import java.sql.ResultSet;
6 | import java.sql.SQLException;
7 | import java.sql.Statement;
8 |
9 |
/**
 * Holds the single JDBC connection and statement used by the spider.
 *
 * The connection is opened once, when the class is initialised. If that
 * fails, the failure is reported on stderr and both accessors return null.
 */
public class DBStatement {
    static Statement stmt;
    static Connection con;

    private DBStatement() {
        try {
            /*
             * The four JDBC settings: driver class, URL, user name, password.
             * URL, user and password can be overridden with the system
             * properties sohu.db.url, sohu.db.user and sohu.db.password; the
             * defaults are the values that used to be hard-coded here.
             * NOTE(review): a password in source control is a risk; prefer
             * supplying it through the property.
             */
            Class.forName("com.mysql.jdbc.Driver"); // load (register) the driver class
            String mySqlUrl = System.getProperty("sohu.db.url", "jdbc:mysql://localhost:3306/sohu");
            String username = System.getProperty("sohu.db.user", "root");
            String password = System.getProperty("sohu.db.password", "196214");

            // Open the connection.
            con = DriverManager.getConnection(mySqlUrl, username, password);

            // The Statement is what sends SQL to the database;
            // its executeUpdate(String sql) returns the affected row count.
            stmt = con.createStatement();
        } catch (Exception e) {
            // Previously swallowed silently, which left callers with an
            // unexplained null connection.
            System.err.println("DBStatement: could not initialise the database connection");
            e.printStackTrace();
        }
    }

    // Created once at class initialisation; its constructor fills the static fields above.
    private static final DBStatement dbStatement = new DBStatement();

    /**
     * Static factory method.
     *
     * @return the shared Statement, or null if initialisation failed
     */
    public synchronized static Statement getInstance() {
        return stmt;
    }

    /**
     * Static factory method.
     *
     * @return the shared Connection, or null if initialisation failed
     */
    public synchronized static Connection getCon() {
        return con;
    }

    /*
    public static void main(String[] args){

        String sql = "select count(*) from news_info";
        try {
            ResultSet rs = stmt.executeQuery(sql);
            // the cursor must be moved to the first row before reading
            rs.next();
            System.out.println(rs.getString(1));
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
    */

}
70 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/util/HttpUtils.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileInputStream;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import java.util.Random;
11 |
/**
 * HTTP helper that points the JVM-wide HTTP proxy settings at a randomly
 * chosen entry of src/main/resources/proxyip.txt.
 */
public class HttpUtils {

    static String rootPath = System.getProperty("user.dir");

    /**
     * Picks a random "host:port" entry from the proxy list and stores it in
     * the http.proxyHost / http.proxyPort system properties.
     *
     * When the list cannot be read or holds no well-formed entry, a message
     * is printed to stderr and the current proxy settings are left untouched.
     * The previous version retried recursively in that situation, which never
     * terminated normally.
     */
    public static void setProxyIp() {
        List<String> ipList = loadProxyList();
        if (ipList.isEmpty()) {
            System.err.println("No usable proxy entry found; proxy settings left unchanged");
            return;
        }

        String ipport = ipList.get(new Random().nextInt(ipList.size()));
        int separator = ipport.lastIndexOf(":");
        String proxyIp = ipport.substring(0, separator);
        String proxyPort = ipport.substring(separator + 1);

        System.setProperty("http.maxRedirects", "50");
        System.getProperties().setProperty("proxySet", "true");
        System.getProperties().setProperty("http.proxyHost", proxyIp);
        System.getProperties().setProperty("http.proxyPort", proxyPort);

        System.out.println("设置代理ip为: " + proxyIp + "端口号为: " + proxyPort);
    }

    /**
     * Reads the proxy list, keeping only lines of the form "host:port".
     * Blank or malformed lines are skipped so a random pick can never fail.
     *
     * @return the usable entries; empty when the file is missing or unreadable
     */
    private static List<String> loadProxyList() {
        List<String> entries = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(rootPath + "/src/main/resources/proxyip.txt"));
            String line;
            while ((line = reader.readLine()) != null) {
                String candidate = line.trim();
                int separator = candidate.lastIndexOf(':');
                if (separator > 0 && separator < candidate.length() - 1) {
                    entries.add(candidate);
                }
            }
        } catch (IOException e) {
            System.err.println("Could not read proxy list: " + e.getMessage());
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if close() itself fails
                }
            }
        }
        return entries;
    }
}
47 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/util/JSoupUtils.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.util;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.jsoup.nodes.Element;
6 | import org.jsoup.select.Elements;
7 |
8 | public class JSoupUtils {
9 |
10 | static String userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36";
11 | /**
12 | * 通过地址得到document对象
13 | */
14 |
15 | public static Document getDocument(String url){
16 | try{
17 |
18 | Document document = Jsoup.connect(url)
19 | .userAgent(userAgent)
20 | .timeout(3000)
21 | .get();
22 | if(document == null || document.toString().trim().equals("")){ // 表示Ip被拦截或其他情况
23 | System.out.println("出现ip被拦截或者其他情况");
24 | HttpUtils.setProxyIp(); //重新设置代理ip
25 | getDocument(url);
26 | }
27 | return document;
28 | }catch(Exception e){ //链接超时等其他情况
29 | System.out.println("出现链接超时等其他情况");
30 | HttpUtils.setProxyIp(); // 换代理ip
31 | getDocument(url);
32 | }
33 | return getDocument(url);
34 |
35 | }
36 |
37 | /*
38 | public static void main(String[] args){
39 | String url = "https://m.sohu.com/n/557070587/";
40 | Document doc = getDocument(url);
41 | Elements paras = doc.body().select("article p");
42 | if(paras.isEmpty() == false ){
43 | for (Element p : paras) {
44 | System.out.println(p.text());
45 | }
46 | }
47 |
48 | //System.out.println(doc);
49 | }
50 | */
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/util/JsonUtils.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.util;
2 |
3 | import org.json.JSONException;
4 | import org.json.JSONObject;
5 | import org.json.JSONTokener;
6 |
7 | public class JsonUtils {
8 | public static String parseRestContent (String json) throws JSONException {
9 | JSONTokener jsonTok = new JSONTokener(json);
10 | JSONObject jsonObj = new JSONObject(jsonTok);
11 |
12 | String restContent = jsonObj.getString("rest_content");
13 | return restContent;
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/src/main/java/SohuSpider/util/XmlUtils.java:
--------------------------------------------------------------------------------
1 | package main.java.SohuSpider.util;
2 |
3 | import java.io.File;
4 | import java.io.FileOutputStream;
5 | import java.text.SimpleDateFormat;
6 | import java.util.Date;
7 | import java.util.Iterator;
8 | import java.util.List;
9 | import java.util.ArrayList;
10 |
11 | import org.dom4j.Document;
12 | import org.dom4j.DocumentException;
13 | import org.dom4j.Element;
14 | import org.dom4j.io.OutputFormat;
15 | import org.dom4j.io.SAXReader;
16 | import org.dom4j.io.XMLWriter;
17 |
18 |
19 | public class XmlUtils {
20 |
21 | //项目根目录
22 | static String rootPath = System.getProperty("user.dir");
23 |
24 | public static List getAllChannels() {
25 | return loadEntryUrls();
26 | }
27 |
28 | public static void writeEntryUrls(List urlArray){
29 | SAXReader sr = new SAXReader();
30 | try{
31 | Document doc = sr.read(new File(rootPath +"/src/main/resources/entry-config.xml"));
32 |
33 | Element root = doc.getRootElement();
34 |
35 | if (root.elements().isEmpty() == false) {
36 | return ;
37 | }
38 |
39 | for (String url : urlArray) {
40 | root.addElement("url").addText(url);
41 | }
42 |
43 | FileOutputStream out =new FileOutputStream(new File(rootPath + "/src/main/resources/entry-config.xml"));
44 | // 指定文本的写出的格式:
45 | OutputFormat format=OutputFormat.createPrettyPrint(); //漂亮格式:有空格换行
46 | format.setEncoding("UTF-8");
47 | //创建写出对象
48 | XMLWriter writer=new XMLWriter(out,format);
49 | //写出Document对象
50 | writer.write(doc);
51 | //关闭流
52 | writer.close();
53 | }catch(Exception e){
54 | e.printStackTrace();
55 | }
56 | }
57 |
58 | public static List loadEntryUrls(){
59 |
60 | List urlArray = new ArrayList();
61 |
62 | //读入xml文件中的用户信息
63 | SAXReader sr = new SAXReader();
64 | try{
65 | Document doc = sr.read(new File(rootPath + "/src/main/resources/entry-config.xml"));
66 | Element root = doc.getRootElement();
67 |
68 | //System.out.println(root.getText());
69 | //查找所有url结点
70 | List urls = root.selectNodes("//url");
71 | for (Iterator it = urls.iterator(); it.hasNext();) {
72 | String url = ((Element)it.next()).getTextTrim();
73 | System.out.println(url);
74 | urlArray.add(url);
75 | }
76 |
77 | }catch(DocumentException e){
78 | e.printStackTrace();
79 | }
80 |
81 | return urlArray;
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/src/main/resources/entry-config.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | http://m.sohu.com/c/32/
5 | http://m.sohu.com/c/57/
6 | http://m.sohu.com/c/53/
7 | http://m.sohu.com/cl/2686/
8 | http://m.sohu.com/c/2714/
9 | http://m.sohu.com/c/15/
10 | http://m.sohu.com/c/40/
11 | http://m.sohu.com/c/284/
12 | http://m.sohu.com/c/290/
13 | http://m.sohu.com/c/399/
14 | http://m.sohu.com/c/546/
15 | http://m.sohu.com/c/19/
16 | http://m.sohu.com/c/46/
17 | http://m.sohu.com/c/301/
18 | http://m.sohu.com/c/295/
19 | http://m.sohu.com/c/315/
20 | http://m.sohu.com/c/24/
21 | http://m.sohu.com/c/27/
22 | http://m.sohu.com/c/28/
23 | http://m.sohu.com/c/31/
24 | http://m.sohu.com/c/26/
25 | http://m.sohu.com/c/208/
26 | http://m.sohu.com/c/79/
27 | http://m.sohu.com/c/81/
28 | http://m.sohu.com/c/1918/
29 | http://m.sohu.com/c/1944/
30 | http://m.sohu.com/cl/2026/
31 | http://m.sohu.com/c/3445/
32 | http://m.sohu.com/car/model/index?_once_=000105_carmodel
33 | http://m.sohu.com/cl/33/
34 | http://m.sohu.com/c/22/
35 | http://m.sohu.com/c/103/
36 | http://m.sohu.com/cl/50/
37 | http://m.sohu.com/cl/49/
38 | http://m.sohu.com/cl/29/
39 | http://m.sohu.com/cl/34/
40 | http://m.sohu.com/cl/409/
41 | http://m.sohu.com/cl/51/
42 | http://m.sohu.com/cl/134/
43 | http://m.sohu.com/c/16430/?v=3
44 | http://m.sohu.com/c/101/?v=3
45 | http://m.sohu.com/c/61/?v=3
46 | http://m.sohu.com/c/74/
47 | http://m.sohu.com/c/267/?v=3
48 | http://m.sohu.com/cl/483/
49 | http://m.sohu.com/cl/5124/
50 | http://m.sohu.com/cl/5123/
51 | http://m.sohu.com/cl/470/
52 | http://m.sohu.com/cl/69/
53 | http://m.sohu.com/cl/182/
54 | http://m.sohu.com/cl/199/
55 | http://m.sohu.com/cl/70/
56 | http://m.sohu.com/cl/187/
57 | http://m.sohu.com/c/527/
58 | http://m.sohu.com/cl/483/
59 | http://m.sohu.com/cl/188/
60 | http://m.sohu.com/cl/189/
61 | http://m.sohu.com/cl/195/
62 | http://m.sohu.com/cl/310/
63 | http://m.sohu.com/cl/309/
64 | http://m.sohu.com/c/3124/
65 | http://m.sohu.com/c/3367/
66 | http://m.sohu.com/cl/313/
67 | http://m.sohu.com/cr/2543/
68 | http://m.sohu.com/cr/2560/
69 | http://m.sohu.com/cr/2561/
70 | http://m.sohu.com/cr/2562/
71 | http://m.sohu.com/cr/2563/
72 |
73 |
--------------------------------------------------------------------------------
/src/main/resources/proxyip.txt:
--------------------------------------------------------------------------------
1 | 112.82.201.237:13864
2 | 175.155.24.2:808
3 | 110.6.75.164:46772
4 | 110.73.28.168:8123
5 | 121.204.165.166:8118
6 | 183.153.2.115:808
7 | 183.78.183.156:82
8 | 183.32.88.97:808
9 | 110.73.14.124:8123
10 | 110.72.33.106:8123
11 | 221.216.94.77:808
12 | 124.88.67.19:80
13 | 61.191.173.31:808
14 | 202.121.96.33:8086
15 | 110.73.7.65:8123
16 | 110.73.0.38:8123
17 | 222.85.50.127:808
18 | 183.32.88.206:808
19 |
--------------------------------------------------------------------------------
/urlQueue.ser:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/urlQueue.ser
--------------------------------------------------------------------------------