├── .classpath ├── .project ├── .settings └── org.eclipse.jdt.core.prefs ├── README.md ├── bin ├── jar │ ├── commons-codec-1.6.jar │ ├── commons-logging-1.1.3.jar │ ├── dom4j-1.6.1.jar │ ├── fastjson-1.2.2.jar │ ├── fluent-hc-4.3.1.jar │ ├── httpclient-4.3.1.jar │ ├── httpclient-cache-4.3.1.jar │ ├── httpcore-4.3.jar │ ├── httpmime-4.3.1.jar │ ├── json-jena-1.0.jar │ ├── jsoup-1.10.2.jar │ └── mysql-connector-java-5.1.18.jar └── main │ ├── java │ └── SohuSpider │ │ ├── bean │ │ └── NewsBean.class │ │ ├── count │ │ └── Counter.class │ │ ├── filter │ │ ├── BloomFilter$SimpleHash.class │ │ ├── BloomFilter.class │ │ └── Test.class │ │ ├── main.class │ │ ├── miniSpider │ │ └── IpSpider.class │ │ ├── service │ │ ├── SpiderService$1.class │ │ ├── SpiderService$2.class │ │ ├── SpiderService$3$1.class │ │ ├── SpiderService$3.class │ │ └── SpiderService.class │ │ └── util │ │ ├── DBStatement.class │ │ ├── HttpUtils.class │ │ ├── JSoupUtils.class │ │ ├── JsonUtils.class │ │ └── XmlUtils.class │ └── resources │ ├── entry-config.xml │ └── proxyip.txt ├── bits.ser ├── src ├── jar │ ├── commons-codec-1.6.jar │ ├── commons-logging-1.1.3.jar │ ├── dom4j-1.6.1.jar │ ├── fastjson-1.2.2.jar │ ├── fluent-hc-4.3.1.jar │ ├── httpclient-4.3.1.jar │ ├── httpclient-cache-4.3.1.jar │ ├── httpcore-4.3.jar │ ├── httpmime-4.3.1.jar │ ├── json-jena-1.0.jar │ ├── jsoup-1.10.2.jar │ └── mysql-connector-java-5.1.18.jar └── main │ ├── java │ └── SohuSpider │ │ ├── bean │ │ └── NewsBean.java │ │ ├── count │ │ └── Counter.java │ │ ├── filter │ │ ├── BloomFilter.java │ │ └── Test.java │ │ ├── main.java │ │ ├── miniSpider │ │ └── IpSpider.java │ │ ├── service │ │ └── SpiderService.java │ │ └── util │ │ ├── DBStatement.java │ │ ├── HttpUtils.java │ │ ├── JSoupUtils.java │ │ ├── JsonUtils.java │ │ └── XmlUtils.java │ └── resources │ ├── entry-config.xml │ └── proxyip.txt └── urlQueue.ser /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | 搜狐爬虫(JAVA) 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.5 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.5 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 搜狐新闻爬虫(Java版) 2 | ================= 3 | 4 | 2017.5.2 5 | ------------ 6 | 7 | 采用知乎上某位大牛的框架进行改写
8 | 没有使用任何其他框架
9 | 可以实现海量数据新闻去重,多线程
10 | 序列化url队列,暂停之后依然可以去重
11 | 本地测试已爬取40w+新闻
12 | 13 | 工程中的一些结构说明:
14 | SohuSpider
15 | --main.java 主程序入口函数
16 | SohuSpider.count 数据库条目数量查询,单独main函数
17 | SohuSpider.filter bloomFilter算法实现
18 | SohuSpider.miniSpider ip代理采集爬虫
19 | SohuSpider.service 爬虫主体部分
20 | SohuSpider.util 一些json解析,请求网页等工具类
21 | -------------------------------------------------------------------------------- /bin/jar/commons-codec-1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/commons-codec-1.6.jar -------------------------------------------------------------------------------- /bin/jar/commons-logging-1.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/commons-logging-1.1.3.jar -------------------------------------------------------------------------------- /bin/jar/dom4j-1.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/dom4j-1.6.1.jar -------------------------------------------------------------------------------- /bin/jar/fastjson-1.2.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/fastjson-1.2.2.jar -------------------------------------------------------------------------------- /bin/jar/fluent-hc-4.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/fluent-hc-4.3.1.jar -------------------------------------------------------------------------------- /bin/jar/httpclient-4.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/httpclient-4.3.1.jar 
-------------------------------------------------------------------------------- /bin/jar/httpclient-cache-4.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/httpclient-cache-4.3.1.jar -------------------------------------------------------------------------------- /bin/jar/httpcore-4.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/httpcore-4.3.jar -------------------------------------------------------------------------------- /bin/jar/httpmime-4.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/httpmime-4.3.1.jar -------------------------------------------------------------------------------- /bin/jar/json-jena-1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/json-jena-1.0.jar -------------------------------------------------------------------------------- /bin/jar/jsoup-1.10.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/jsoup-1.10.2.jar -------------------------------------------------------------------------------- /bin/jar/mysql-connector-java-5.1.18.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/jar/mysql-connector-java-5.1.18.jar 
-------------------------------------------------------------------------------- /bin/main/java/SohuSpider/bean/NewsBean.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/bean/NewsBean.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/count/Counter.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/count/Counter.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/filter/BloomFilter$SimpleHash.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/filter/BloomFilter$SimpleHash.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/filter/BloomFilter.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/filter/BloomFilter.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/filter/Test.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/filter/Test.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/main.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/main.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/miniSpider/IpSpider.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/miniSpider/IpSpider.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/service/SpiderService$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService$1.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/service/SpiderService$2.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService$2.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/service/SpiderService$3$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService$3$1.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/service/SpiderService$3.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService$3.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/service/SpiderService.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/service/SpiderService.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/util/DBStatement.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/DBStatement.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/util/HttpUtils.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/HttpUtils.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/util/JSoupUtils.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/JSoupUtils.class -------------------------------------------------------------------------------- /bin/main/java/SohuSpider/util/JsonUtils.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/JsonUtils.class 
-------------------------------------------------------------------------------- /bin/main/java/SohuSpider/util/XmlUtils.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bin/main/java/SohuSpider/util/XmlUtils.class -------------------------------------------------------------------------------- /bin/main/resources/entry-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | http://m.sohu.com/c/32/ 5 | http://m.sohu.com/c/57/ 6 | http://m.sohu.com/c/53/ 7 | http://m.sohu.com/cl/2686/ 8 | http://m.sohu.com/c/2714/ 9 | http://m.sohu.com/c/15/ 10 | http://m.sohu.com/c/40/ 11 | http://m.sohu.com/c/284/ 12 | http://m.sohu.com/c/290/ 13 | http://m.sohu.com/c/399/ 14 | http://m.sohu.com/c/546/ 15 | http://m.sohu.com/c/19/ 16 | http://m.sohu.com/c/46/ 17 | http://m.sohu.com/c/301/ 18 | http://m.sohu.com/c/295/ 19 | http://m.sohu.com/c/315/ 20 | http://m.sohu.com/c/24/ 21 | http://m.sohu.com/c/27/ 22 | http://m.sohu.com/c/28/ 23 | http://m.sohu.com/c/31/ 24 | http://m.sohu.com/c/26/ 25 | http://m.sohu.com/c/208/ 26 | http://m.sohu.com/c/79/ 27 | http://m.sohu.com/c/81/ 28 | http://m.sohu.com/c/1918/ 29 | http://m.sohu.com/c/1944/ 30 | http://m.sohu.com/cl/2026/ 31 | http://m.sohu.com/c/3445/ 32 | http://m.sohu.com/car/model/index?_once_=000105_carmodel 33 | http://m.sohu.com/cl/33/ 34 | http://m.sohu.com/c/22/ 35 | http://m.sohu.com/c/103/ 36 | http://m.sohu.com/cl/50/ 37 | http://m.sohu.com/cl/49/ 38 | http://m.sohu.com/cl/29/ 39 | http://m.sohu.com/cl/34/ 40 | http://m.sohu.com/cl/409/ 41 | http://m.sohu.com/cl/51/ 42 | http://m.sohu.com/cl/134/ 43 | http://m.sohu.com/c/16430/?v=3 44 | http://m.sohu.com/c/101/?v=3 45 | http://m.sohu.com/c/61/?v=3 46 | http://m.sohu.com/c/74/ 47 | http://m.sohu.com/c/267/?v=3 48 | http://m.sohu.com/cl/483/ 49 | http://m.sohu.com/cl/5124/ 50 | 
http://m.sohu.com/cl/5123/ 51 | http://m.sohu.com/cl/470/ 52 | http://m.sohu.com/cl/69/ 53 | http://m.sohu.com/cl/182/ 54 | http://m.sohu.com/cl/199/ 55 | http://m.sohu.com/cl/70/ 56 | http://m.sohu.com/cl/187/ 57 | http://m.sohu.com/c/527/ 58 | http://m.sohu.com/cl/483/ 59 | http://m.sohu.com/cl/188/ 60 | http://m.sohu.com/cl/189/ 61 | http://m.sohu.com/cl/195/ 62 | http://m.sohu.com/cl/310/ 63 | http://m.sohu.com/cl/309/ 64 | http://m.sohu.com/c/3124/ 65 | http://m.sohu.com/c/3367/ 66 | http://m.sohu.com/cl/313/ 67 | http://m.sohu.com/cr/2543/ 68 | http://m.sohu.com/cr/2560/ 69 | http://m.sohu.com/cr/2561/ 70 | http://m.sohu.com/cr/2562/ 71 | http://m.sohu.com/cr/2563/ 72 | 73 | -------------------------------------------------------------------------------- /bin/main/resources/proxyip.txt: -------------------------------------------------------------------------------- 1 | 112.82.201.237:13864 2 | 175.155.24.2:808 3 | 110.6.75.164:46772 4 | 110.73.28.168:8123 5 | 121.204.165.166:8118 6 | 183.153.2.115:808 7 | 183.78.183.156:82 8 | 183.32.88.97:808 9 | 110.73.14.124:8123 10 | 110.72.33.106:8123 11 | 221.216.94.77:808 12 | 124.88.67.19:80 13 | 61.191.173.31:808 14 | 202.121.96.33:8086 15 | 110.73.7.65:8123 16 | 110.73.0.38:8123 17 | 222.85.50.127:808 18 | 183.32.88.206:808 19 | -------------------------------------------------------------------------------- /bits.ser: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/bits.ser -------------------------------------------------------------------------------- /src/jar/commons-codec-1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/commons-codec-1.6.jar -------------------------------------------------------------------------------- 
/src/jar/commons-logging-1.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/commons-logging-1.1.3.jar -------------------------------------------------------------------------------- /src/jar/dom4j-1.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/dom4j-1.6.1.jar -------------------------------------------------------------------------------- /src/jar/fastjson-1.2.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/fastjson-1.2.2.jar -------------------------------------------------------------------------------- /src/jar/fluent-hc-4.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/fluent-hc-4.3.1.jar -------------------------------------------------------------------------------- /src/jar/httpclient-4.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/httpclient-4.3.1.jar -------------------------------------------------------------------------------- /src/jar/httpclient-cache-4.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/httpclient-cache-4.3.1.jar -------------------------------------------------------------------------------- /src/jar/httpcore-4.3.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/httpcore-4.3.jar -------------------------------------------------------------------------------- /src/jar/httpmime-4.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/httpmime-4.3.1.jar -------------------------------------------------------------------------------- /src/jar/json-jena-1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/json-jena-1.0.jar -------------------------------------------------------------------------------- /src/jar/jsoup-1.10.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/jsoup-1.10.2.jar -------------------------------------------------------------------------------- /src/jar/mysql-connector-java-5.1.18.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/src/jar/mysql-connector-java-5.1.18.jar -------------------------------------------------------------------------------- /src/main/java/SohuSpider/bean/NewsBean.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.bean; 2 | 3 | public class NewsBean { 4 | String url; //新闻url 5 | 6 | String category; //新闻类别 7 | 8 | String sourceFrom; //新闻源 9 | 10 | String title; //新闻标题 11 | 12 | String content; //新闻内容 13 | 14 | String date; //发布时间 15 | 16 | String editor; //新闻作者 17 | 18 | 
public String getUrl(){ 19 | return url; 20 | } 21 | 22 | public String getCategory(){ 23 | return category; 24 | } 25 | 26 | public String getSourceFrom(){ 27 | return sourceFrom; 28 | } 29 | 30 | public String getTitle(){ 31 | return title; 32 | } 33 | 34 | public String getContent(){ 35 | return content; 36 | } 37 | 38 | public String getDate(){ 39 | return date; 40 | } 41 | 42 | public String getEditor(){ 43 | return editor; 44 | } 45 | 46 | public void setUrl(String url){ 47 | this.url = url; 48 | } 49 | 50 | public void setCategory(String category){ 51 | this.category = category; 52 | } 53 | 54 | public void setSourceFrom(String sourceFrom){ 55 | this.sourceFrom = sourceFrom; 56 | } 57 | 58 | public void setTitle(String title){ 59 | this.title = title; 60 | } 61 | 62 | public void setContent(String content){ 63 | this.content = content; 64 | } 65 | 66 | public void setDate(String date){ 67 | this.date = date; 68 | } 69 | 70 | public void setEditor(String editor){ 71 | this.editor = editor; 72 | } 73 | 74 | @Override 75 | public String toString(){ 76 | return "NewsBean:{ \n" + 77 | " title:" + title + "\n" + 78 | " url:" + url + "\n" + 79 | " date:" + date + "\n" + 80 | " category:" + category + "\n" + 81 | " sourceFrom:" + sourceFrom + "\n" + 82 | " editor:" + editor + "\n" + 83 | " content:" + content + "\n" + 84 | " }" 85 | ; 86 | } 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/count/Counter.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.count; 2 | 3 | import java.sql.Connection; 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | import java.sql.Statement; 7 | 8 | import main.java.SohuSpider.util.DBStatement; 9 | 10 | /* 11 | * 查询数据库中新闻条目的数量 12 | */ 13 | 14 | public class Counter { 15 | //数据库连接 16 | static Connection con = DBStatement.getCon(); 17 | 18 | static Statement stmt = 
DBStatement.getInstance(); 19 | 20 | static String sqlCount = "select count(*) from news_info"; 21 | static void monitor(){ 22 | while (true) { 23 | try { 24 | ResultSet rs = stmt.executeQuery(sqlCount); 25 | 26 | /** 27 | * 一定要先将结果集指针移动到第一行 28 | */ 29 | rs.next(); 30 | System.out.println(rs.getInt(1)); 31 | } catch (SQLException e) { 32 | e.printStackTrace(); 33 | } 34 | try { 35 | Thread.sleep(5000); //每隔5s查询一次 36 | } catch (InterruptedException e) { 37 | e.printStackTrace(); 38 | } 39 | } 40 | 41 | } 42 | 43 | public static void main(String[] args){ 44 | monitor(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/filter/BloomFilter.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.filter; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileOutputStream; 6 | import java.io.ObjectInputStream; 7 | import java.io.ObjectOutputStream; 8 | import java.io.Serializable; 9 | import java.util.BitSet; 10 | import java.util.concurrent.BlockingQueue; 11 | 12 | 13 | /* 14 | * Bloom filter used for string de-duplication. The bit set is restored from the file "bits.ser" in the working directory when that file exists; otherwise the filter starts empty. 15 | */ 16 | 17 | public class BloomFilter implements Serializable{ 18 | 19 | /* Number of bits in the filter: 2^25. Must stay a power of two because SimpleHash.hash masks with (cap - 1). */ 20 | private static final int DEFAULT_SIZE = 1 << 25; 21 | 22 | /* Seeds of the individual hash functions; one SimpleHash is created per seed */ 23 | private static final int[] seeds = new int[]{5,7,11,13,31,37,61}; 24 | 25 | private BitSet bits = null; 26 | 27 | /* Hash function objects, index-aligned with seeds */ 28 | private SimpleHash[] func = new SimpleHash[seeds.length]; 29 | 30 | public BloomFilter(){ 31 | for(int i = 0; i < seeds.length; i++){ 32 | func[i] = new SimpleHash(DEFAULT_SIZE,seeds[i]); 33 | } 34 | 35 | File filterSer = new File("bits.ser"); 36 | if (filterSer.exists()) { 37 | FileInputStream fis = null; 38 | try{ 39 | // deserialize the previously saved bit set 40 | fis = new FileInputStream(filterSer); 41 | ObjectInputStream ois = new ObjectInputStream(fis); 42 | bits = (BitSet) ois.readObject(); 43 | } catch (Exception e) { 44 | 
e.printStackTrace(); 45 | } finally { 46 | // release the file handle even when reading the snapshot fails 47 | if (fis != null) { 48 | try { 49 | fis.close(); 50 | } catch (Exception closeError) { 51 | closeError.printStackTrace(); 52 | } 53 | } 54 | } 55 | } 56 | 57 | // no snapshot, or the snapshot could not be read: start empty instead of leaving bits null 58 | if (bits == null) { 59 | bits = new BitSet(DEFAULT_SIZE); 60 | } 61 | 62 | } 63 | 64 | // Sets the bit of every hash function for value. A null value is ignored, mirroring contains(null) == false. 65 | public synchronized void add(String value){ 66 | if(value == null) 67 | return; 68 | 69 | for(SimpleHash f : func){ 70 | bits.set(f.hash(value),true); 71 | } 72 | } 73 | 74 | // Returns the backing BitSet itself (not a copy) 75 | public BitSet getBitset(){ 76 | return bits; 77 | } 78 | 79 | // Returns true when every hash position of value is set, i.e. value was probably added before (false positives are possible); returns false for null. Synchronized like add() because BitSet is not safe for unsynchronized concurrent use. 80 | public synchronized boolean contains(String value){ 81 | if(value == null) 82 | return false; 83 | 84 | for(SimpleHash f : func){ 85 | if(!bits.get(f.hash(value))) 86 | return false; // one clear bit proves the value was never added 87 | } 88 | 89 | return true; 90 | } 91 | 92 | /* A seeded string hash mapped into [0, cap) */ 93 | public static class SimpleHash { 94 | private int cap; 95 | private int seed; 96 | 97 | public SimpleHash(int cap, int seed){ 98 | this.cap = cap; 99 | this.seed = seed; 100 | } 101 | 102 | // Polynomial hash (result = seed * result + char), then masked with (cap - 1); value must not be null 103 | public int hash(String value){ 104 | int result = 0 ; 105 | int len = value.length(); 106 | for(int i = 0; i < len; i++){ 107 | result = seed * result + value.charAt(i); 108 | } 109 | return (cap - 1) & result; 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/filter/Test.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.filter; 2 | 3 | public class Test { 4 | private BloomFilter filter = new BloomFilter(); 5 | 6 | private String[] URLs = { 7 | "www.baidu.com", 8 | "www.sohu.com", 9 | "www.sina.com", 10 | "www.google.com", 11 | "www.facebook.com", 12 | "www.wangyi.com", 13 | "www.sina.com", 14 | "www.163.com", 15 | "www.baidu.com" 16 | }; 17 | 18 | public void testBloomFilter(){ 19 | for(String url : URLs){ 20 | if(filter.contains(url)){ 21 | System.err.println('"' + url + '"' + " already exists in bits!"); 22 | } 23 | else{ 24 | filter.add(url); 25 | System.out.println("add " + '"' + url + '"' + " into bits. 
"); 26 | } 27 | } 28 | } 29 | 30 | public static void main(String[] args){ 31 | Test test = new Test(); 32 | test.testBloomFilter(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/main.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider; 2 | 3 | import main.java.SohuSpider.service.SpiderService; 4 | 5 | public class main{ 6 | /** 7 | * 搜狐爬虫入口 8 | * 9 | * @param 10 | * @throws InterruptedException 11 | * 12 | */ 13 | public static void main(String[] args) throws InterruptedException{ 14 | SpiderService spider = new SpiderService(); 15 | spider.start(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/miniSpider/IpSpider.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.miniSpider; 2 | 3 | import static main.java.SohuSpider.util.JSoupUtils.getDocument; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.BufferedWriter; 7 | import java.io.FileInputStream; 8 | import java.io.FileWriter; 9 | import java.io.IOException; 10 | import java.io.InputStreamReader; 11 | 12 | import org.jsoup.Jsoup; 13 | import org.jsoup.nodes.Document; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | 17 | /* 18 | * 获取可用代理ip 19 | */ 20 | public class IpSpider { 21 | 22 | //代理ip网址 23 | static String proxyHost = "http://www.xicidaili.com/nn/1"; 24 | 25 | static String userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36"; 26 | 27 | //测试Ip代理的网址 28 | static String testUrl = "http://ip.chinaz.com/getip.aspx"; 29 | 30 | static void getProxyIp() { 31 | try { 32 | //得到项目根目录 33 | String rootPath = System.getProperty("user.dir"); 34 | //System.out.println(rootPath); 35 | BufferedWriter 
proxyIpWriter = new BufferedWriter(new FileWriter(rootPath + "/src/main/resources/proxyip.txt")); 36 | try { 37 | Document doc = getDocument(proxyHost); 38 | Elements ips = doc.select("#ip_list tr"); 39 | //System.out.println(ips.size()); 40 | 41 | for (Element e : ips) { 42 | Elements ip = e.select("td"); 43 | if (ip.size() > 2){ 44 | String ipAddr = ip.get(1).text(); 45 | int port; 46 | try { 47 | port = Integer.parseInt(ip.get(2).text()); 48 | } catch (NumberFormatException nfe) { 49 | // malformed port cell: skip this row instead of aborting the whole run 50 | continue; 51 | } 52 | if (testIp(ipAddr,port)) { 53 | System.out.println(ipAddr + ":" + port + " 可用"); 54 | proxyIpWriter.write(ipAddr + ":" + port); 55 | proxyIpWriter.newLine(); 56 | } 57 | } 58 | } 59 | proxyIpWriter.flush(); 60 | } finally { 61 | // always release the file handle, including when fetching or parsing throws 62 | proxyIpWriter.close(); 63 | } 64 | } catch (IOException e1) { 65 | e1.printStackTrace(); 66 | } 67 | } 68 | 69 | static boolean testIp(String ip, int port) { 70 | try{ 71 | 72 | // the proxy counts as unusable when the request fails or does not answer within 3s; the response body is not needed 73 | Jsoup.connect(testUrl) 74 | .userAgent(userAgent) 75 | .proxy(ip, port) 76 | .timeout(3000) 77 | .get(); 78 | return true; 79 | } catch(Exception e) { 80 | System.out.println("访问超时"); 81 | return false; 82 | } 83 | } 84 | 85 | public static void main(String[] args) { 86 | /* 87 | //添加程序监听结束 88 | Runtime.getRuntime().addShutdownHook(new Thread(new Runnable(){ 89 | public void run() { 90 | System.out.println("程序结束了!"); 91 | 92 | } 93 | 94 | })); 95 | */ 96 | getProxyIp(); 97 | System.out.println("成功获取可用代理Ip!"); 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/service/SpiderService.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.service; 2 | 3 | import java.util.List; 4 | import java.util.ArrayList; 5 | import java.io.File; 6 | import java.io.FileInputStream; 7 | import java.io.FileNotFoundException; 8 | import java.io.FileOutputStream; 9 | import java.io.IOException; 10 | import java.io.ObjectInputStream; 11 | import java.io.ObjectOutputStream; 12 | import java.io.Serializable; 13 | import 
java.sql.Connection; 14 | import java.sql.PreparedStatement; 15 | import java.sql.ResultSet; 16 | import java.sql.SQLException; 17 | import java.sql.Statement; 18 | import java.util.concurrent.BlockingQueue; 19 | import java.util.concurrent.Executor; 20 | import java.util.concurrent.Executors; 21 | import java.util.concurrent.LinkedBlockingQueue; 22 | import java.util.concurrent.ThreadPoolExecutor; 23 | import java.util.regex.Matcher; 24 | import java.util.regex.Pattern; 25 | import java.util.Date; 26 | 27 | import org.json.JSONException; 28 | import org.jsoup.nodes.Document; 29 | import org.jsoup.nodes.Element; 30 | import org.jsoup.select.Elements; 31 | 32 | import main.java.SohuSpider.bean.NewsBean; 33 | import main.java.SohuSpider.filter.BloomFilter; 34 | import main.java.SohuSpider.util.DBStatement; 35 | import static main.java.SohuSpider.util.XmlUtils.getAllChannels; 36 | import static main.java.SohuSpider.util.JSoupUtils.getDocument; 37 | import static main.java.SohuSpider.util.JsonUtils.parseRestContent; 38 | import static main.java.SohuSpider.util.XmlUtils.writeEntryUrls; 39 | import static main.java.SohuSpider.util.XmlUtils.loadEntryUrls; 40 | 41 | public class SpiderService implements Serializable { 42 | 43 | //使用BloomFilter算法去重 44 | static BloomFilter filter = new BloomFilter(); 45 | 46 | //url阻塞队列 47 | BlockingQueue urlQueue = null; 48 | 49 | //数据库连接 50 | static Connection con = DBStatement.getCon(); 51 | 52 | static Statement stmt = DBStatement.getInstance(); 53 | 54 | static PreparedStatement ps = null; 55 | 56 | //线程池 57 | static Executor executor = Executors.newFixedThreadPool(20); 58 | 59 | static String urlHost = "http://m.sohu.com"; 60 | 61 | //导航页面url 62 | static String urlNavigation = "https://m.sohu.com/c/395/?_once_=000025_zhitongche_daohang_v3"; 63 | 64 | //爬取深度 65 | static int DEFAULT_DEPTH = 10; 66 | 67 | static int DEFAULT_THREAD_NUM = 10; 68 | 69 | public void start() throws InterruptedException{ 70 | 71 | File urlsSer = new 
File("urlQueue.ser"); 72 | if (urlsSer.exists()){ 73 | 74 | try{ 75 | //对象反序列化 76 | ObjectInputStream ois = new ObjectInputStream(new FileInputStream(urlsSer)); 77 | urlQueue = (BlockingQueue) ois.readObject(); 78 | 79 | ois.close(); 80 | } catch (Exception e) { 81 | e.printStackTrace(); 82 | } 83 | } 84 | else{ 85 | //创建阻塞队列 86 | urlQueue = new LinkedBlockingQueue(); 87 | 88 | //获取入口Url 89 | List urlChannels = genEntryChannel(urlNavigation); 90 | 91 | for (String url : urlChannels) { 92 | urlQueue.add(url); 93 | System.out.println(url); 94 | } 95 | } 96 | 97 | 98 | //添加程序监听结束,程序结束时候应序列化两个重要对象--urlQueue和filter 99 | Runtime.getRuntime().addShutdownHook(new Thread(new Runnable(){ 100 | 101 | public void run() { 102 | System.out.println(urlQueue.isEmpty()); 103 | try{ 104 | if (urlQueue.isEmpty() == false) { 105 | //序列化urlQueue 106 | ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream("urlQueue.ser")); 107 | os.writeObject(urlQueue); 108 | os.close(); 109 | 110 | } 111 | 112 | //序列化bits 113 | ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream("bits.ser")); 114 | os.writeObject(filter.getBitset()); 115 | os.close(); 116 | } catch(Exception e) { 117 | e.printStackTrace(); 118 | } 119 | 120 | } 121 | })); 122 | 123 | for(int i = 0; i < DEFAULT_THREAD_NUM; i++){ 124 | Thread a = new Thread(new Runnable() { 125 | 126 | public void run() { 127 | while (true) { 128 | String url = getAUrl(); 129 | if (!filter.contains(url)) { 130 | filter.add(url); 131 | System.out.println(Thread.currentThread().getName()+"正在爬取url:" + url); 132 | if (url != null) { 133 | crawler(url); 134 | } 135 | }else { 136 | System.out.println("此url存在,不爬了." 
+ url); 137 | } 138 | } 139 | 140 | } 141 | 142 | }); 143 | executor.execute(a); 144 | } 145 | 146 | //线程池监视线程 147 | new Thread(new Runnable(){ 148 | public void run() { 149 | while(true) { 150 | try{ 151 | if (((ThreadPoolExecutor)executor).getActiveCount() < 10) { 152 | Thread a = new Thread(new Runnable() { 153 | public void run() { 154 | while (true) { 155 | String url = getAUrl(); 156 | if (!filter.contains(url)) { 157 | filter.add(url); 158 | System.out.println(Thread.currentThread().getName()+"正在爬取url:" + url); 159 | if (url != null) { 160 | crawler(url); 161 | } 162 | }else { 163 | System.out.println("此url存在, 不爬了." + url); 164 | } 165 | } 166 | } 167 | }); 168 | executor.execute(a); 169 | if (urlQueue.size() == 0) { 170 | System.out.println("队列为0了!!!!!!!"); 171 | } 172 | } 173 | Thread.sleep(3000); 174 | } catch (InterruptedException e) { 175 | e.printStackTrace(); 176 | } 177 | } 178 | 179 | } 180 | 181 | }).start(); 182 | 183 | } 184 | 185 | /* 从导航页解析入口新闻url */ 186 | public static List genEntryChannel (String startUrl) { 187 | 188 | List urlArray = new ArrayList(); 189 | //小说类别的url不需要,其url特征是含有单词read 190 | String pattern = "^/c.*"; 191 | 192 | Document doc = getDocument(startUrl); 193 | Elements Urls = doc.select("a.h3Sub"); 194 | for (Element url : Urls) { 195 | String link = url.attr("href"); 196 | if (Pattern.matches(pattern, link) == true) { 197 | urlArray.add(urlHost + link); 198 | } 199 | } 200 | 201 | writeEntryUrls(urlArray); 202 | return urlArray; 203 | } 204 | 205 | 206 | /* 爬取新闻网页 */ 207 | public void crawler(String url) { 208 | 209 | Document doc = getDocument(url); //返回的Document对象一定是正确的 210 | 211 | String pattern = ".*/n/[0-9]+/.*"; 212 | //System.out.println(Pattern.matches(pattern, url)); 213 | if (Pattern.matches(pattern, url)){ 214 | 215 | String title = ""; 216 | String category = null; 217 | String sourceFrom = null; 218 | String date = null; 219 | String content = ""; 220 | String editor = null; 221 | 222 | NewsBean news = new 
NewsBean(); 223 | 224 | news.setUrl(url); 225 | 226 | try{ 227 | /** 228 | * 新闻标题格式 题目-类别-手机搜狐 229 | * 但是有些题目中本身就含有 "-" 230 | */ 231 | String[] temp = doc.title().trim().split("-"); 232 | category = temp[temp.length - 2].substring(0, 2); 233 | for (int i = 0; i < temp.length - 2; i++){ 234 | title += temp[i]; 235 | } 236 | } catch (ArrayIndexOutOfBoundsException e) { 237 | //e.printStackTrace(); 238 | return ; 239 | } 240 | 241 | news.setCategory(category); 242 | news.setTitle(title); 243 | 244 | Elements articleInfo = doc.body().select("div.article-info"); 245 | if ( articleInfo.isEmpty() == false) { 246 | try{ 247 | String[] temp = articleInfo.first().text().split(" "); 248 | sourceFrom = temp[0]; 249 | date = temp[1]; 250 | } catch (ArrayIndexOutOfBoundsException e) { 251 | e.printStackTrace(); 252 | return ; 253 | } 254 | } 255 | news.setSourceFrom(sourceFrom); 256 | news.setDate(date); 257 | 258 | Elements paras = doc.body().select("article p"); 259 | if ( paras.isEmpty() == false) { 260 | for (Element e : paras) { 261 | content += e.text(); 262 | content += "\n"; 263 | } 264 | } 265 | 266 | news.setContent(content); 267 | 268 | if (content.length() > 8000) { 269 | return ; 270 | } 271 | 272 | 273 | Elements divEditor = doc.body().select("div.editor"); 274 | if (divEditor.isEmpty() == false) { 275 | editor = divEditor.first().text(); 276 | } 277 | news.setEditor(editor); 278 | 279 | //打印用户信息 280 | System.out.println("爬取成功:" + news); 281 | 282 | String sql = "insert into news_info " + 283 | "(title,url,cate,date,srcFrom,content,editor) " + 284 | "values (?,?,?,?,?,?,?)"; 285 | try { 286 | ps = con.prepareStatement(sql, Statement.SUCCESS_NO_INFO); 287 | ps.setString(1, news.getTitle()); 288 | ps.setString(2, news.getUrl()); 289 | ps.setString(3, news.getCategory()); 290 | ps.setString(4, news.getDate()); 291 | ps.setString(5, news.getSourceFrom()); 292 | ps.setString(6, news.getContent()); 293 | ps.setString(7, news.getEditor()); 294 | //存储news 295 | 
ps.executeUpdate(); 296 | }catch (Exception e){ 297 | e.printStackTrace(); 298 | } 299 | } 300 | 301 | //新闻正文url的特征 https://m.sohu.com/n/488483157/ 302 | Elements urlCandidates = doc.body().select("a[href~=(.*/n/[0-9]+/)|(.*/c.*)]"); 303 | for (Element e : urlCandidates){ 304 | url = urlHost + e.attr("href"); 305 | try { 306 | urlQueue.put(url); 307 | } catch (InterruptedException e1) { 308 | 309 | e1.printStackTrace(); 310 | } 311 | } 312 | 313 | } 314 | 315 | 316 | public String getAUrl() { 317 | String tmpAUrl; 318 | try { 319 | tmpAUrl= urlQueue.take(); 320 | return tmpAUrl; 321 | } catch (InterruptedException e) { 322 | e.printStackTrace(); 323 | } 324 | return null; 325 | } 326 | 327 | 328 | } 329 | 330 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/util/DBStatement.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.util; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.ResultSet; 6 | import java.sql.SQLException; 7 | import java.sql.Statement; 8 | 9 | 10 | public class DBStatement { 11 | static Statement stmt; 12 | static Connection con; 13 | 14 | private DBStatement(){ 15 | try { 16 | /** 17 | * jdbc四大配置参数: 18 | * 1.driverClassName:com.mysql.jdbc.Driver 19 | * 2.url:jdbc:mysql://localhost:3306/mydb 20 | * 3.username:root 21 | * 4.password:123 22 | */ 23 | Class.forName("com.mysql.jdbc.Driver");//加载驱动类(注册驱动类) 24 | String mySqlUrl = "jdbc:mysql://localhost:3306/sohu"; 25 | String username = "root"; 26 | String password = "196214"; 27 | 28 | //得到连接对象 29 | con = DriverManager.getConnection(mySqlUrl, username, password); 30 | 31 | /*对数据库做增、删、改 32 | * 1.通过Connection对象创建Statement 33 | * Statement语句的发送器,它的功能就是向数据库发送sql语句! 
34 | * 2.调用他的int executeUpdate(String sql),返回影响了几行 35 | */ 36 | //通过Connection 得到Statement; 37 | stmt = con.createStatement(); 38 | }catch (Exception e){ 39 | } 40 | } 41 | 42 | private static final DBStatement dbStatement = new DBStatement(); 43 | 44 | //静态工厂方法 45 | public synchronized static Statement getInstance() { 46 | return dbStatement.stmt; 47 | } 48 | 49 | //静态工厂方法 50 | public synchronized static Connection getCon() { 51 | return dbStatement.con; 52 | } 53 | 54 | /* 55 | public static void main(String[] args){ 56 | 57 | String sql = "select count(*) from news_info"; 58 | try { 59 | ResultSet rs = stmt.executeQuery(sql); 60 | //一定要先将结果集指针移动到第一行 61 | rs.next(); 62 | System.out.println(rs.getString(1)); 63 | } catch (SQLException e) { 64 | e.printStackTrace(); 65 | } 66 | } 67 | */ 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/util/HttpUtils.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.util; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.Random; 11 | 12 | public class HttpUtils { 13 | 14 | static String rootPath = System.getProperty("user.dir"); 15 | /** 16 | * 设置代理Ip 17 | * 18 | */ 19 | public static void setProxyIp(){ 20 | try{ 21 | List ipList = new ArrayList(); 22 | BufferedReader proxyIpReader = new BufferedReader(new FileReader(rootPath + "/src/main/resources/proxyip.txt")); 23 | 24 | String ip = ""; 25 | while((ip = proxyIpReader.readLine())!= null){ 26 | ipList.add(ip); 27 | } 28 | 29 | Random random = new Random(); 30 | int randomInt = random.nextInt(ipList.size()); 31 | String ipport = ipList.get(randomInt); 32 | String proxyIp = ipport.substring(0, ipport.lastIndexOf(":")); 33 | String 
proxyPort = ipport.substring(ipport.lastIndexOf(":") + 1, ipport.length()); 34 | 35 | System.setProperty("http.maxRedirects", "50"); 36 | System.getProperties().setProperty("proxySet", "true"); 37 | System.getProperties().setProperty("http.proxyHost", proxyIp); 38 | System.getProperties().setProperty("http.proxyPort", proxyPort); 39 | 40 | System.out.println("设置代理ip为: " + proxyIp + "端口号为: " + proxyPort); 41 | }catch(Exception e){ 42 | System.err.println("重新设置代理ip"); 43 | setProxyIp(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/SohuSpider/util/JSoupUtils.java: -------------------------------------------------------------------------------- 1 | package main.java.SohuSpider.util; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | 8 | public class JSoupUtils { 9 | 10 | static String userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36"; 11 | /** 12 | * 通过地址得到document对象 13 | */ 14 | 15 | public static Document getDocument(String url){ 16 | try{ 17 | 18 | Document document = Jsoup.connect(url) 19 | .userAgent(userAgent) 20 | .timeout(3000) 21 | .get(); 22 | if(document == null || document.toString().trim().equals("")){ // 表示Ip被拦截或其他情况 23 | System.out.println("出现ip被拦截或者其他情况"); 24 | HttpUtils.setProxyIp(); //重新设置代理ip 25 | getDocument(url); 26 | } 27 | return document; 28 | }catch(Exception e){ //链接超时等其他情况 29 | System.out.println("出现链接超时等其他情况"); 30 | HttpUtils.setProxyIp(); // 换代理ip 31 | getDocument(url); 32 | } 33 | return getDocument(url); 34 | 35 | } 36 | 37 | /* 38 | public static void main(String[] args){ 39 | String url = "https://m.sohu.com/n/557070587/"; 40 | Document doc = getDocument(url); 41 | Elements paras = doc.body().select("article p"); 42 | if(paras.isEmpty() == false ){ 43 | for (Element p : 
paras) {
                System.out.println(p.text());
            }
        }

        //System.out.println(doc);
    }
    */
}

--------------------------------------------------------------------------------
/src/main/java/SohuSpider/util/JsonUtils.java:
--------------------------------------------------------------------------------
package main.java.SohuSpider.util;

import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONTokener;

public class JsonUtils {
    /**
     * Returns the string stored under the {@code rest_content} key of a JSON
     * object.
     *
     * @param json text of a JSON object
     * @throws JSONException propagated from the JSON parser or from the key lookup
     */
    public static String parseRestContent (String json) throws JSONException {
        JSONObject parsed = new JSONObject(new JSONTokener(json));
        return parsed.getString("rest_content");
    }
}

--------------------------------------------------------------------------------
/src/main/java/SohuSpider/util/XmlUtils.java:
--------------------------------------------------------------------------------
package main.java.SohuSpider.util;

import java.io.File;
import java.io.FileOutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;


public class XmlUtils {

    // Project root directory
    static String rootPath = System.getProperty("user.dir");

    /** Same as {@link #loadEntryUrls()}. */
    public static List<String> getAllChannels() {
        return loadEntryUrls();
    }

    /**
     * Stores the entry URLs in {@code entry-config.xml}, one {@code url}
     * element per entry, pretty-printed as UTF-8.
     *
     * Nothing is written when the root element already has children, or when
     * {@code urlArray} is null or empty. Failures are logged, not thrown.
     */
    public static void writeEntryUrls(List<String> urlArray){
        if (urlArray == null || urlArray.isEmpty()) {
            return ;
        }
        File configFile = new File(rootPath +"/src/main/resources/entry-config.xml");
        SAXReader sr = new SAXReader();
        try{
            Document doc = sr.read(configFile);

            Element root = doc.getRootElement();

            // Entries that are already stored are never overwritten.
            if (root.elements().isEmpty() == false) {
                return ;
            }

            for (String url : urlArray) {
                root.addElement("url").addText(url);
            }

            FileOutputStream out = new FileOutputStream(configFile);
            try {
                // Output format: pretty print (indentation and line breaks)
                OutputFormat format = OutputFormat.createPrettyPrint();
                format.setEncoding("UTF-8");
                // Create the writer
                XMLWriter writer = new XMLWriter(out, format);
                // Write the document
                writer.write(doc);
                // Close the writer (flushes the output)
                writer.close();
            } finally {
                // Release the file even when writing fails; closing an
                // already closed stream has no effect.
                out.close();
            }
        }catch(Exception e){
            e.printStackTrace();
        }
    }

    /**
     * Reads the text of every {@code url} element in
     * {@code entry-config.xml}, printing each one.
     *
     * @return the URLs in document order; empty when the file cannot be parsed
     */
    public static List<String> loadEntryUrls(){

        List<String> urlArray = new ArrayList<String>();

        // Parse the XML file holding the entry URLs
        SAXReader sr = new SAXReader();
        try{
            Document doc = sr.read(new File(rootPath + "/src/main/resources/entry-config.xml"));
            Element root = doc.getRootElement();

            //System.out.println(root.getText());
            // Find all url nodes
            List urls = root.selectNodes("//url");
            for (Iterator it = urls.iterator(); it.hasNext();) {
                String url = ((Element)it.next()).getTextTrim();
                System.out.println(url);
                urlArray.add(url);
            }

        }catch(DocumentException e){
            e.printStackTrace();
        }

        return urlArray;
    }
}

--------------------------------------------------------------------------------
/src/main/resources/entry-config.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | http://m.sohu.com/c/32/ 5 | http://m.sohu.com/c/57/ 6 | http://m.sohu.com/c/53/ 7 | http://m.sohu.com/cl/2686/ 8 | http://m.sohu.com/c/2714/ 9 | http://m.sohu.com/c/15/ 10 | http://m.sohu.com/c/40/ 11 | http://m.sohu.com/c/284/ 12 | http://m.sohu.com/c/290/ 13 | http://m.sohu.com/c/399/ 14 | http://m.sohu.com/c/546/ 15 | http://m.sohu.com/c/19/ 16 | http://m.sohu.com/c/46/ 17 | http://m.sohu.com/c/301/ 18 | http://m.sohu.com/c/295/ 19 | http://m.sohu.com/c/315/ 20 | http://m.sohu.com/c/24/ 21 | http://m.sohu.com/c/27/ 22 | 
http://m.sohu.com/c/28/ 23 | http://m.sohu.com/c/31/ 24 | http://m.sohu.com/c/26/ 25 | http://m.sohu.com/c/208/ 26 | http://m.sohu.com/c/79/ 27 | http://m.sohu.com/c/81/ 28 | http://m.sohu.com/c/1918/ 29 | http://m.sohu.com/c/1944/ 30 | http://m.sohu.com/cl/2026/ 31 | http://m.sohu.com/c/3445/ 32 | http://m.sohu.com/car/model/index?_once_=000105_carmodel 33 | http://m.sohu.com/cl/33/ 34 | http://m.sohu.com/c/22/ 35 | http://m.sohu.com/c/103/ 36 | http://m.sohu.com/cl/50/ 37 | http://m.sohu.com/cl/49/ 38 | http://m.sohu.com/cl/29/ 39 | http://m.sohu.com/cl/34/ 40 | http://m.sohu.com/cl/409/ 41 | http://m.sohu.com/cl/51/ 42 | http://m.sohu.com/cl/134/ 43 | http://m.sohu.com/c/16430/?v=3 44 | http://m.sohu.com/c/101/?v=3 45 | http://m.sohu.com/c/61/?v=3 46 | http://m.sohu.com/c/74/ 47 | http://m.sohu.com/c/267/?v=3 48 | http://m.sohu.com/cl/483/ 49 | http://m.sohu.com/cl/5124/ 50 | http://m.sohu.com/cl/5123/ 51 | http://m.sohu.com/cl/470/ 52 | http://m.sohu.com/cl/69/ 53 | http://m.sohu.com/cl/182/ 54 | http://m.sohu.com/cl/199/ 55 | http://m.sohu.com/cl/70/ 56 | http://m.sohu.com/cl/187/ 57 | http://m.sohu.com/c/527/ 58 | http://m.sohu.com/cl/483/ 59 | http://m.sohu.com/cl/188/ 60 | http://m.sohu.com/cl/189/ 61 | http://m.sohu.com/cl/195/ 62 | http://m.sohu.com/cl/310/ 63 | http://m.sohu.com/cl/309/ 64 | http://m.sohu.com/c/3124/ 65 | http://m.sohu.com/c/3367/ 66 | http://m.sohu.com/cl/313/ 67 | http://m.sohu.com/cr/2543/ 68 | http://m.sohu.com/cr/2560/ 69 | http://m.sohu.com/cr/2561/ 70 | http://m.sohu.com/cr/2562/ 71 | http://m.sohu.com/cr/2563/ 72 | 73 | -------------------------------------------------------------------------------- /src/main/resources/proxyip.txt: -------------------------------------------------------------------------------- 1 | 112.82.201.237:13864 2 | 175.155.24.2:808 3 | 110.6.75.164:46772 4 | 110.73.28.168:8123 5 | 121.204.165.166:8118 6 | 183.153.2.115:808 7 | 183.78.183.156:82 8 | 183.32.88.97:808 9 | 110.73.14.124:8123 10 | 
110.72.33.106:8123 11 | 221.216.94.77:808 12 | 124.88.67.19:80 13 | 61.191.173.31:808 14 | 202.121.96.33:8086 15 | 110.73.7.65:8123 16 | 110.73.0.38:8123 17 | 222.85.50.127:808 18 | 183.32.88.206:808 19 | -------------------------------------------------------------------------------- /urlQueue.ser: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetFeng/SohuSpider-Java/8ad23026183f523ff00b12450d258109e9c3ca18/urlQueue.ser --------------------------------------------------------------------------------