├── .github ├── my.JPG └── FUNDING.yml ├── src ├── main │ ├── resources │ │ ├── demo │ │ │ └── demo.xlsx │ │ └── conf.properties │ └── java │ │ └── com │ │ └── xyzj │ │ ├── crawler │ │ ├── utils │ │ │ ├── .DS_Store │ │ │ ├── parsehtmlstring │ │ │ │ ├── DataFormatStatus.java │ │ │ │ ├── JsoupSelectItemPojo.java │ │ │ │ ├── SystemAssert.java │ │ │ │ ├── StaticValue.java │ │ │ │ ├── UrlOperatorUtil.java │ │ │ │ ├── StringOperatorUtil.java │ │ │ │ ├── ParseTsUrls.java │ │ │ │ ├── DownloadTsFile.java │ │ │ │ ├── RegexUtil.java │ │ │ │ ├── RegexPaserUtil.java │ │ │ │ └── JsoupHtmlParser.java │ │ │ ├── gethtmlstring │ │ │ │ ├── BaseHttpCallBack.java │ │ │ │ ├── UrlUtil.java │ │ │ │ ├── SpiderUtil.java │ │ │ │ ├── M3u8HttpClientUtil.java │ │ │ │ └── HttpResponseUtil.java │ │ │ ├── proxyip │ │ │ │ ├── IPModel │ │ │ │ │ ├── IPMessage.java │ │ │ │ │ └── SerializeUtil.java │ │ │ │ ├── spider │ │ │ │ │ ├── doRule │ │ │ │ │ │ ├── ProxyFilterSpiderRule.java │ │ │ │ │ │ └── ProxyXcSpiderRule.java │ │ │ │ │ └── docrawler │ │ │ │ │ │ └── ProxyXcDoMain.java │ │ │ │ └── config │ │ │ │ │ ├── RedisUtil.java │ │ │ │ │ └── RedisConfig.java │ │ │ ├── packageutil │ │ │ │ ├── GetAllFiles.java │ │ │ │ ├── PackageGetClassUtil.java │ │ │ │ └── PackageGetJavaUtil.java │ │ │ ├── importfrom │ │ │ │ ├── FileCopyUtil.java │ │ │ │ ├── IOUtil.java │ │ │ │ └── ImportExcelUtil.java │ │ │ ├── savetomysql │ │ │ │ ├── SaveToMysql.java │ │ │ │ └── SaveToOracle.java │ │ │ └── authcode │ │ │ │ └── AuthcodeDistinguisher.java │ │ ├── framework │ │ │ ├── enums │ │ │ │ └── FactionEnum.java │ │ │ ├── interfaces │ │ │ │ └── ISpiderRule.java │ │ │ ├── abstracts │ │ │ │ └── SpiderRuleAbstract.java │ │ │ ├── entity │ │ │ │ ├── Goods.java │ │ │ │ └── Param.java │ │ │ ├── factory │ │ │ │ └── SpiderRuleFactory.java │ │ │ ├── runnable │ │ │ │ └── SpiderRunnable.java │ │ │ ├── defaults │ │ │ │ ├── DefaultM3u8SpiderRule.java │ │ │ │ └── DefaultSpiderRule.java │ │ │ └── handler │ │ │ │ └── SpiderRuleHandler.java │ │ └── spidertask │ │ │ ├── example │ │ │ ├── dorule │ │ │ │ ├── DoRule58.java │ │ │ │ └── DoRule51Cto.java │ │ │ └── docrawler │ │ │ │ └── DoCrawler58.java │ │ │ └── zlr │ │ │ ├── docrawler │ │ │ ├── SsqDoErrorMain.java │ │ │ └── SsqDoMain.java │ │ │ └── dorule │ │ │ └── SsqDetailSpiderRule.java │ │ └── bigdata │ │ └── in │ │ ├── DemoData.java │ │ ├── TestFileUtil.java │ │ ├── ReadTest.java │ │ └── DemoDataListener.java └── test │ └── java │ └── com │ └── xyzj │ └── crawler │ └── DefaultSpiderRuleTest.java ├── .gitignore ├── ddl └── goods.sql ├── README.md └── pom.xml /.github/my.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xyzj-dev/xyzj-crawler/HEAD/.github/my.JPG -------------------------------------------------------------------------------- /src/main/resources/demo/demo.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xyzj-dev/xyzj-crawler/HEAD/src/main/resources/demo/demo.xlsx -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xyzj-dev/xyzj-crawler/HEAD/src/main/java/com/xyzj/crawler/utils/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | 
!.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | 12 | ### IntelliJ IDEA ### 13 | .idea 14 | *.iws 15 | *.iml 16 | *.ipr 17 | 18 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/enums/FactionEnum.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.enums; 2 | 3 | /** 4 | * @author lyy 5 | * @since 2019-06-28 11:01 6 | */ 7 | public enum FactionEnum { 8 | getHtml, 9 | getHtmlWithJavaScript, 10 | getJson, 11 | ; 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/DataFormatStatus.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | /** 4 | * 对所要结果数据格式的枚举,现分为2种,纯文本和标签自身的全部内容,默认为纯文本 5 | * 6 | * @author zel 7 | * 8 | */ 9 | public enum DataFormatStatus { 10 | CleanTxt, TagAllContent 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/gethtmlstring/BaseHttpCallBack.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.gethtmlstring; 2 | 3 | import java.io.InputStream; 4 | 5 | /** 6 | * http回调 7 | * */ 8 | public interface BaseHttpCallBack { 9 | 10 | void httpCallBack(int responseCode, InputStream inputStream); 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/bigdata/in/DemoData.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.bigdata.in; 2 | 3 | import java.util.Date; 4 | import lombok.Data; 5 | 6 | /** 7 | * 基础数据类.这里的排序和excel里面的排序一致 8 | * 9 | * @author Jiaju Zhuang 10 | **/ 11 | @Data 12 | public class DemoData { 13 | private String string; 14 | private Date date; 15 | private Double doubleData; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/interfaces/ISpiderRule.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.interfaces; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | 5 | 6 | /** 7 | * 爬虫规则 接口 8 | * 9 | * @author liuyangyang 10 | */ 11 | public interface ISpiderRule { 12 | 13 | void runSpider(Param param,ISpiderRule spiderRule); 14 | 15 | void handlerGoods(Param param, String htmlSource); 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/abstracts/SpiderRuleAbstract.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.abstracts; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 5 | 6 | /** 7 | * 抽象类 8 | * @author liuyangyang 9 | * */ 10 | public abstract class SpiderRuleAbstract implements ISpiderRule { 11 | 12 | @Override 13 | public void runSpider(Param param, ISpiderRule spiderRule) { 14 | 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/JsoupSelectItemPojo.java: 
-------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | public class JsoupSelectItemPojo { 4 | private String selector; 5 | 6 | public JsoupSelectItemPojo(String selector, boolean have_remove_sel) { 7 | this.selector = selector; 8 | } 9 | 10 | public String getSelector() { 11 | return selector; 12 | } 13 | 14 | public void setSelector(String selector) { 15 | this.selector = selector; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/resources/conf.properties: -------------------------------------------------------------------------------- 1 | ########redis############ 2 | jedis.addr=192.168.34.251 3 | jedis.port=6379 4 | jedis.passwd= 5 | 6 | 7 | #########mysql####### 8 | mysql.url=jdbc:mysql://localhost/crawler?characterEncoding=utf8&useSSL=false 9 | mysql.username=root 10 | mysql.password=x5 11 | 12 | #mysql.url=jdbc:mysql://192.168.34.250/crawler?characterEncoding=utf8&useSSL=false&generateSimpleParameterMetadata=true 13 | #mysql.username=root 14 | #mysql.password=root 15 | 16 | 17 | 18 | #oracle 19 | oracle.url=jdbc:oracle:thin:@10.64.2.62:1521:testdb 20 | oracle.username=xdkf 21 | oracle.password=xdkf -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/proxyip/IPModel/IPMessage.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.proxyip.IPModel; 2 | 3 | import java.io.Serializable; 4 | import lombok.Data; 5 | 6 | 7 | @Data 8 | public class IPMessage implements Serializable { 9 | private static final long serialVersionUID = 1L; 10 | 11 | /** 12 | * ip地址 13 | */ 14 | private String ip; 15 | 16 | /** 17 | * 端口号 18 | */ 19 | private String port; 20 | 21 | 22 | /** 23 | * 类型 24 | */ 25 | private String type; 26 | 27 | /** 28 | * 延迟 29 | */ 30 | private String speed; 31 | 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/entity/Goods.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.entity; 2 | 3 | import java.io.Serializable; 4 | import lombok.Data; 5 | 6 | /** 7 | * @author liuyangyang 8 | * @since 2017-12-05 11:49 9 | */ 10 | @Data 11 | public class Goods implements Serializable { 12 | 13 | /** 主键 id*/ 14 | private Integer id; 15 | 16 | /** 类型 */ 17 | private String type; 18 | 19 | /**名称 详细内容*/ 20 | private String name; 21 | 22 | /**来源网站*/ 23 | private String webUrl; 24 | 25 | /**提供*/ 26 | private String provide; 27 | 28 | /** 排序列*/ 29 | private String orderNum; 30 | 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/test/java/com/xyzj/crawler/DefaultSpiderRuleTest.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory; 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 6 | import org.junit.Test; 7 | 8 | /** 9 | * @author lyy 10 | * @since 2018-10-27 14:06 11 | */ 12 | 13 | public class DefaultSpiderRuleTest { 14 | 15 | @Test 16 | public void runSpider(){ 17 | Param param = new Param(); 18 | param.setWebUrl("https://www.baidu.com"); 19 | ISpiderRule spiderRule = new SpiderRuleFactory().getInstance(); 20 | 
spiderRule.runSpider(param, spiderRule);
21 |     }
22 | }
23 | 
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with a single custom sponsorship URL
13 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/parsehtmlstring/SystemAssert.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.parsehtmlstring;
 2 | 
 3 | /**
 4 |  * Simple in-house assertion helper, so callers do not need a JUnit-style assert
 5 |  * 
 6 |  * @author zel
 7 |  * 
 8 |  */
 9 | public class SystemAssert {
10 | 	public static void assertNotNull(Object obj) {
11 | 		if (obj == null) {
12 | 			try {
13 | 				throw new Exception("object should not be null,please check");
14 | 			} catch (Exception e) {
15 | 				e.printStackTrace();
16 | 				System.exit(0);
17 | 			}
18 | 		}
19 | 	}
20 | 
21 | 	public static void assertTrue(boolean bool, String message) {
22 | 		if (!bool) { // fail when the asserted condition does not hold
23 | 			try {
24 | 				throw new Exception(message);
25 | 			} catch (Exception e) {
26 | 				e.printStackTrace();
27 | 				System.exit(0);
28 | 			}
29 | 		}
30 | 	}
31 | 
32 | }
33 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/framework/factory/SpiderRuleFactory.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.framework.factory;
 2 | 
 3 | import com.xyzj.crawler.framework.defaults.DefaultSpiderRule;
 4 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 5 | 
 6 | /**
 7 |  * @author lyy
 8 |  * @since 2018-10-27 13:57
 9 |  */
10 | public class SpiderRuleFactory {
11 | 
12 | 
13 |     //default implementation
14 |     private ISpiderRule spiderRule = new DefaultSpiderRule();
15 | 
16 | 
17 |     //no-arg constructor
18 |     public SpiderRuleFactory() {
19 | 
20 |     }
21 | 
22 |     //constructor taking a custom rule
23 |     public SpiderRuleFactory(ISpiderRule spiderRule) {
24 |         this.spiderRule = spiderRule;
25 |     }
26 | 
27 | 
28 |     //get the configured instance
29 |     public ISpiderRule getInstance() {
30 |         return spiderRule;
31 |     }
32 | 
33 | 
34 | 
35 | }
36 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/parsehtmlstring/StaticValue.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.parsehtmlstring;
 2 | 
 3 | /**
 4 |  * Shared static constants
 5 |  * 
 6 |  * @author zel
 7 |  * 
 8 |  */
 9 | public class StaticValue {
10 | 	public static String default_encoding = "UTF-8";
11 | 	public static String encoding_gbk = "gbk";
12 | 
13 | 	public static String separator_tab = "\t";
14 | 	public static String separator_vertical_line = "\\|";
15 | 	public static String separator_space = " ";
16 | 	public static String separator_next_line = "\n";
17 | 
18 | 	public static String NULL = null;
19 | 
20 | 	/**
21 | 	 * Regex for the HTML tags that should be stripped out
22 | 	 */
23 | 	public static String htmlTagRegex = "<[^>]+>"; // NOTE: assumed generic tag pattern; the original literal was lost
24 | 
25 | 	//block separator marker; kept identical to the spider side to avoid an extra dependency
26 | 	public static String split_block_index = "#block_index#";
27 | 
28 | }
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/framework/runnable/SpiderRunnable.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.framework.runnable;
 2 | 
 3 | import com.xyzj.crawler.framework.entity.Param;
 4 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 5 | import lombok.extern.slf4j.Slf4j;
 6 | 
 7 | 
 8 | /**
 9 |  * Runnable wrapper for one crawl task
10 |  *
11 |  * @author liuyangyang
12 |  */
13 | @Slf4j
14 | public class SpiderRunnable implements Runnable {
15 | 
16 |     //the wrapped parameters
17 |     private Param param;
18 | 
19 |     //the rule to run
20 |     private ISpiderRule spiderRule;
21 | 
22 |     //constructor
23 |     public SpiderRunnable(ISpiderRule spiderRule, Param param) {
24 |         super();
25 |         this.spiderRule = spiderRule;
26 |         this.param = param;
27 |     }
28 | 
29 |     @Override
30 |     public void run() {
31 |         spiderRule.runSpider(param, spiderRule);
32 | 
33 |     }
34 | }
35 | 
36 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/parsehtmlstring/UrlOperatorUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.parsehtmlstring;
 2 | 
 3 | import java.net.MalformedURLException;
 4 | import java.net.URL;
 5 | 
 6 | /**
 7 |  * URL helpers, e.g. extracting a URL's domain, keyword, etc.
 8 |  * 
 9 |  * @author zel
10 |  * 
11 |  */
12 | public class UrlOperatorUtil {
13 | 
14 | 	public static boolean isValidUrl(String url) {
15 | 		if (url == null || url.isEmpty()) {
16 | 			return false;
17 | 		}
18 | 		try {
19 | 			@SuppressWarnings("unused")
20 | 			URL urlObj = new URL(url);
21 | 			return true;
22 | 		} catch (MalformedURLException e) {
23 | 			// e.printStackTrace();
24 | 		}
25 | 		return false;
26 | 	}
27 | 
28 | 	public static void main(String[] args) throws Exception {
29 | 		@SuppressWarnings("unused")
30 | 		UrlOperatorUtil urlOperatorUtil = new UrlOperatorUtil();
31 | 
32 | 	}
33 | }
34 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/spidertask/example/dorule/DoRule58.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.spidertask.example.dorule;
 2 | 
 3 | import com.xyzj.crawler.framework.entity.Param;
 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 6 | 
 7 | /**
 8 |  * Single-page crawl
 9 |  *
10 |  * @author lyy
11 |  * @since 2018-10-27 14:22
12 |  */
13 | public class DoRule58 {
14 | 
15 |     public static void main(String[] args) {
16 | 
17 |         //get the default rule from the factory
18 |         ISpiderRule spiderRule = new SpiderRuleFactory().getInstance();
19 |         //build the parameters
20 |         Param param = new Param();
21 |         param.setWebUrl("https://cq.58.com/shouji/?PGTID=0d100000-0002-5d3a-b0c9-e83a870d03be&ClickID=3");
22 | 
23 |         //go
24 |         spiderRule.runSpider(param, spiderRule);
25 | 
26 | 
27 | 
28 |     }
29 | }
30 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/framework/defaults/DefaultM3u8SpiderRule.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.framework.defaults;
 2 | 
 3 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract;
 4 | import
com.xyzj.crawler.framework.entity.Param; 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 6 | import com.xyzj.crawler.utils.parsehtmlstring.ParseTsUrls; 7 | import lombok.extern.slf4j.Slf4j; 8 | 9 | /** 10 | * @author lyy 11 | * @since 2018-11-25 19:40 12 | */ 13 | @Slf4j 14 | public class DefaultM3u8SpiderRule extends SpiderRuleAbstract { 15 | @Override 16 | public void runSpider(Param param, ISpiderRule spiderRule) { 17 | //执行解析 18 | new ParseTsUrls(param.getWebUrl(), param.getHeaderInfos(), param.getFileFullName()).httpRequestForTsUrls(); 19 | log.info("文件生成成功......"); 20 | } 21 | 22 | @Override 23 | public void handlerGoods(Param param, String htmlSource) { 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/StringOperatorUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * String常用操作类 7 | * 8 | * @author zel 9 | * 10 | */ 11 | public class StringOperatorUtil { 12 | public static boolean isBlank(String str) { 13 | if (str == null || str.trim().length() == 0) { 14 | return true; 15 | } 16 | return false; 17 | } 18 | 19 | public static boolean isBlankCollection(List list) { 20 | if (list == null || list.isEmpty()) { 21 | return true; 22 | } 23 | return false; 24 | } 25 | 26 | public static boolean isNotBlank(String str) { 27 | if (str == null || str.trim().length() == 0) { 28 | return false; 29 | } 30 | return true; 31 | } 32 | 33 | public static boolean isNotBlankCollection(List list) { 34 | if (list == null || list.isEmpty()) { 35 | return false; 36 | } 37 | return true; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/bigdata/in/TestFileUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.bigdata.in; 2 | 3 | import java.io.File; 4 | import java.io.InputStream; 5 | 6 | public class TestFileUtil { 7 | 8 | public static InputStream getResourcesFileInputStream(String fileName) { 9 | return Thread.currentThread().getContextClassLoader().getResourceAsStream("" + fileName); 10 | } 11 | 12 | public static String getPath() { 13 | return TestFileUtil.class.getResource("/").getPath(); 14 | } 15 | 16 | public static File createNewFile(String pathName) { 17 | File file = new File(getPath() + pathName); 18 | if (file.exists()) { 19 | file.delete(); 20 | } else { 21 | if (!file.getParentFile().exists()) { 22 | file.getParentFile().mkdirs(); 23 | } 24 | } 25 | return file; 26 | } 27 | 28 | public static File readFile(String pathName) { 29 | return new File(getPath() + pathName); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/handler/SpiderRuleHandler.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.handler; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 5 | import com.xyzj.crawler.utils.gethtmlstring.HttpResponseUtil; 6 | import lombok.extern.slf4j.Slf4j; 7 | 8 | /** 9 | * @author lyy 10 | * @since 2018-10-27 13:08 11 | */ 12 | @Slf4j 13 | public class SpiderRuleHandler { 14 | public void handler(Param param, ISpiderRule spiderRule) { 15 | try { 16 | //第一步 拿到源码 17 | 
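// NOTE (assumption, the fetcher's source is not shown here): getHtmlSource()
// presumably picks the fetch strategy from param.getFactionEnum()
// (getHtml / getHtmlWithJavaScript / getJson)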
String htmlSource = HttpResponseUtil.getHtmlSource(param);
18 |             if (htmlSource == null) return;
19 |             //step 2: match out the wanted content and hand it on
20 |             spiderRule.handlerGoods(param, htmlSource);
21 |         } finally {
22 |             //step 3: count the latch down, if there is one
23 |             if (param.getCountDownLatch() != null) {
24 |                 param.getCountDownLatch().countDown();
25 |                 log.info("还有{}个任务等待中", param.getCountDownLatch().getCount());
26 |             }
27 |         }
28 |     }
29 | }
30 | 
31 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/packageutil/GetAllFiles.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.packageutil;
 2 | 
 3 | import java.io.File;
 4 | 
 5 | /**
 6 |  * Walk a directory tree
 7 |  */
 8 | public class GetAllFiles {
 9 | 
10 |     public static void main(String[] args) {
11 |         //put a real path here
12 |         String path="F:\\QQ文档";
13 |         //run it
14 |         getFiles(path);
15 |     }
16 | 
17 |     /**
18 |      * Recursively list every file and folder under a path
19 |      */
20 | 
21 |     public static void getFiles(String path) {
22 |         File file = new File(path);
23 |         // if the path is a directory
24 |         if (file.isDirectory()) {
25 |             // list everything under it
26 |             File[] files = file.listFiles();
27 |             for (int i = 0; i < files.length; i++) {
28 |                 // recurse into sub-directories
29 |                 if (files[i].isDirectory()) {
30 |                     System.out.println("目录:" + files[i].getPath());
31 |                     getFiles(files[i].getPath());
32 |                 } else {
33 |                     System.out.println("文件:" + files[i].getPath());
34 |                 }
35 | 
36 |             }
37 |         } else {
38 |             System.out.println("文件:" + file.getPath());
39 |         }
40 |     }
41 | }
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/bigdata/in/ReadTest.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.bigdata.in;
 2 | 
 3 | import com.alibaba.excel.EasyExcel;
 4 | import java.io.File;
 5 | import org.slf4j.Logger;
 6 | import org.slf4j.LoggerFactory;
 7 | 
 8 | /**
 9 |  * The usual way to read an excel file
10 |  *
11 |  * @author Jiaju Zhuang
12 |  */
13 | 
14 | public class ReadTest {
15 |     private static final Logger LOGGER = LoggerFactory.getLogger(ReadTest.class);
16 | 
17 |     private static String fileName ;
18 |     public void simpleRead() {
19 |         // Approach 1:
20 |         if (fileName == null) {
21 |             fileName = TestFileUtil.getPath() + "demo" + File.separator + "demo.xlsx";
22 |         }
23 |         // specify the class to map rows onto, then read the first sheet; the stream is closed automatically
24 |         LOGGER.info("文件路径是{}",fileName);
25 |         EasyExcel.read(fileName, DemoData.class, new DemoDataListener()).sheet().doRead();
26 |     }
27 | 
28 |     public static void main(String[] args) {
29 |         if (args.length!=0) {
30 |             fileName = args[0];
31 |         }
32 |         long l = System.currentTimeMillis();
33 |         ReadTest readTest = new ReadTest();
34 |         readTest.simpleRead();
35 |         long l1 = System.currentTimeMillis() - l;
36 |         LOGGER.info("总耗时{}毫秒 ",l1);
37 |     }
38 | 
39 | 
40 | }
41 | 
--------------------------------------------------------------------------------
/ddl/goods.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Navicat MySQL Data Transfer
 3 | 
 4 | Source Server         : 192.168.73.21
 5 | Source Server Version : 50719
 6 | Source Host           : 192.168.73.21:3306
 7 | Source Database       : crawler
 8 | 
 9 | Target Server Type    : MYSQL
10 | Target Server Version : 50719
11 | File Encoding         : 65001
12 | 
13 | Date: 2018-01-29 19:10:20
14 | */
15 | 
16 | SET FOREIGN_KEY_CHECKS=0;
17 | 
18 | -- ----------------------------
19 | -- Table structure for goods
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `goods`;
22 | CREATE TABLE `goods` (
23 |   `id` int(11) NOT NULL AUTO_INCREMENT,
24 |   `type` text COMMENT '类型',
25 |   `name` longtext COMMENT '名称',
26 |   `webUrl` text COMMENT '来源网站',
27 |   `provide` text COMMENT '提供方',
28 |   `orderNum` text COMMENT '排序列',
29 |   PRIMARY KEY (`id`)
30 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
31 | 
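-- NOTE: `ungoods` below mirrors `goods` and stores the URLs whose fetch failed;
-- SsqDoErrorMain reads it back and retries those records.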
32 | DROP TABLE IF EXISTS `ungoods`;
33 | CREATE TABLE `ungoods` (
34 |   `id` int(11) NOT NULL AUTO_INCREMENT,
35 |   `type` text COMMENT '类型',
36 |   `name` longtext COMMENT '名称',
37 |   `webUrl` text COMMENT '来源网站',
38 |   `provide` text COMMENT '提供方',
39 |   `orderNum` text COMMENT '排序列',
40 |   PRIMARY KEY (`id`)
41 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
42 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/importfrom/FileCopyUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.importfrom;
 2 | 
 3 | 
 4 | import java.io.File;
 5 | import java.nio.file.Files;
 6 | 
 7 | /**
 8 |  * Copy a folder entry (the source may contain files and sub-folders)
 9 |  */
10 | public class FileCopyUtil {
11 | 
12 | 
13 | 
14 |     //the copy method
15 |     public static void copy(String filePath, String srcPath, String targetPath) throws Exception {
16 |         //the source file
17 |         File srcFile=new File(srcPath + filePath);
18 | 
19 |         //the target file
20 |         File targetFile=new File(targetPath+filePath);
21 |         if(!targetFile.getParentFile().exists()){
22 |             targetFile.getParentFile().mkdirs();
23 |         }
24 |         //remove any stale target, then copy
25 |         targetFile.delete();
26 |         Files.copy(srcFile.toPath(), targetFile.toPath());
27 |     }
28 | 
29 | 
30 |     //entry point
31 |     public static void main(String[] args) throws Exception {
32 |         //copy one file
33 |         String filePath = "/src/main/java/com/xyzj/crawler/framework/defaults/DefaultM3u8SpiderRule.java";
34 |         String srcPath = "/Users/liuyangyang/workspace/xyzj/xyzj-crawler";
35 |         String targetPath = "/Users/liuyangyang/Downloads";
36 |         copy(filePath,srcPath,targetPath);
37 |         //done
38 |         System.out.println("文件拷贝完成!");
39 |     }
40 | }
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/IPModel/SerializeUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.proxyip.IPModel;
 2 | 
 3 | import java.io.ByteArrayInputStream;
 4 | import java.io.ByteArrayOutputStream;
 5 | import java.io.ObjectInputStream;
 6 | import java.io.ObjectOutputStream;
 7 | 
 8 | public class SerializeUtil {
 9 |     public static byte[] serialize(Object object) {
10 |         ObjectOutputStream oos;
11 |         ByteArrayOutputStream baos;
12 |         try {
13 |             // serialize
14 |             baos = new ByteArrayOutputStream();
15 |             oos = new ObjectOutputStream(baos);
16 |             oos.writeObject(object);
17 | 
18 |             byte[] bytes = baos.toByteArray();
19 | 
20 |             return bytes;
21 |         } catch (Exception e) {
22 |             e.printStackTrace();
23 |         }
24 |         return null;
25 |     }
26 | 
27 |     //deserialize
28 |     public static Object unSerialize(byte[] bytes) {
29 |         ByteArrayInputStream bais;
30 |         ObjectInputStream ois;
31 | 
32 |         try {
33 |             // deserialize
34 |             bais = new ByteArrayInputStream(bytes);
35 |             ois = new ObjectInputStream(bais);
36 |             return ois.readObject();
37 |         } catch (Exception e) {
38 |             e.printStackTrace();
39 |         }
40 | 
41 |         return null;
42 |     }
43 | }
44 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # xyzj-crawler
 2 | 
 3 | ## 1. Project introduction
 4 | 
 5 | A crawler? No, no, no.
 6 | It is simply a web page information collector.
 7 | 
 8 | Written in Java: if you know Java, it works out of the box.
 9 | 
10 | If you would like to study the implementation or talk tech, you are welcome to discuss it with my team.
11 | For exchange and learning,
12 | contact lyy, QQ: 719882551.
13 | 
14 | 
15 | 
16 | ### Tech exchange && business cooperation
17 | If you are interested, join the discussion group: add me on WeChat and mention "join group".
18 | Paid crawling of public resources is available: add me on WeChat and mention "cooperation".
19 | 
20 | ![](.github/my.JPG)
21 | 
22 | 
23 | 
24 | 
25 | ## 2. Usage
26 | 
27 | ### 2-1 Extracting the useful parts of a page
28 | 
29 | ```shell
30 | # Step 1: clone the project and import it into IDEA
31 | git clone https://github.com/xyzj-dev/xyzj-crawler
32 | 
33 | # Step 2: create the database and tables
34 | DDL script: goods.sql
35 | 
36 | # Step 3: adjust the database settings in conf.properties
37 | mysql.url=jdbc:mysql://localhost/crawler?characterEncoding=utf8&useSSL=false
38 | mysql.username=xxx
39 | mysql.password=xxx
40 | 
41 | 
42 | # Step 4: get familiar with the code
43 | -- default implementation
44 | 1) DefaultSpiderRuleTest
45 | 
46 | -- 58.com, single page
47 | 2) DoRule58
48 | 
49 | -- 58.com, paginated, crawled with a thread pool
50 | 3) DoCrawler58
51 | 
52 | ```
53 | 
54 | 
55 | 
56 | ### 2-2 Downloading m3u8 video
57 | 
58 | ```shell
59 | -- the m3u8 rule implementation
60 | 1) com.xyzj.crawler.framework.defaults.DefaultM3u8SpiderRule
61 | 
62 | -- an m3u8 download example
63 | 2) com.xyzj.crawler.spidertask.example.dorule.DoRule51Cto
64 | 
65 | ```
66 | 
67 | 
68 | 
69 | ## 3. Crawling the site you want
70 | 
71 | 
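72 | To point the framework at a new site, extend `SpiderRuleAbstract`, override `handlerGoods` with your own extraction logic, and hand the rule to `SpiderRuleFactory`. Below is a minimal sketch: the class name, target URL and `<title>` regex are made-up placeholders, while the wiring mirrors `DefaultSpiderRule` and `DoRule58`.
73 | 
74 | ```java
75 | package com.xyzj.crawler.spidertask.example.dorule;
76 | 
77 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract;
78 | import com.xyzj.crawler.framework.entity.Param;
79 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
80 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler;
81 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
82 | import com.xyzj.crawler.utils.parsehtmlstring.RegexUtil;
83 | 
84 | // hypothetical rule: prints the <title> of each fetched page
85 | public class MyTitleSpiderRule extends SpiderRuleAbstract {
86 | 
87 |     @Override
88 |     public void runSpider(Param param, ISpiderRule spiderRule) {
89 |         // same pattern as DefaultSpiderRule: the handler fetches the HTML,
90 |         // then calls back into handlerGoods below
91 |         new SpiderRuleHandler().handler(param, spiderRule);
92 |     }
93 | 
94 |     @Override
95 |     public void handlerGoods(Param param, String htmlSource) {
96 |         // swap this regex for whatever matches the fields on your target site
97 |         String title = RegexUtil.getSubUtilSimple(htmlSource, "<title>(.*?)</title>");
98 |         System.out.println(param.getWebUrl() + " -> " + title);
99 |     }
100 | 
101 |     public static void main(String[] args) {
102 |         Param param = new Param();
103 |         param.setWebUrl("https://www.example.com");
104 |         ISpiderRule rule = new SpiderRuleFactory(new MyTitleSpiderRule()).getInstance();
105 |         rule.runSpider(param, rule);
106 |     }
107 | }
108 | ```
109 | 
110 | For paginated targets, wrap the rule in `SpiderRunnable` and submit one task per page, the way `DoCrawler58` does.
111 | 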
112 | ## 4. Simulated login
113 | 
114 | ```shell
115 | # Already supported: put the login headers (for example a Cookie) into Param
116 | param.getHeaderInfos().put("Cookie", "your-session-cookie");
117 | ```
118 | 
119 | 
120 | 
121 | ## 5. IP proxies
122 | 
123 | ```shell
124 | # Implemented: see ProxyXcDoMain / ProxyFilterSpiderRule and Param.setProxyIp / setProxyPort.
125 | # A proper write-up will follow.
126 | ```
127 | 
128 | 
129 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/spider/doRule/ProxyFilterSpiderRule.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.proxyip.spider.doRule;
 2 | 
 3 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract;
 4 | import com.xyzj.crawler.framework.entity.Param;
 5 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler;
 6 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 7 | import com.xyzj.crawler.utils.proxyip.IPModel.IPMessage;
 8 | import com.xyzj.crawler.utils.proxyip.config.RedisUtil;
 9 | import lombok.extern.slf4j.Slf4j;
10 | 
11 | /**
12 |  * @author lyy
13 |  * @since 2018-10-27 13:08
14 |  */
15 | @Slf4j
16 | public class ProxyFilterSpiderRule extends SpiderRuleAbstract {
17 | 
18 |     @Override
19 |     public void runSpider(Param param, ISpiderRule spiderRule) {
20 |         SpiderRuleHandler spiderRuleHandler = new SpiderRuleHandler();
21 |         spiderRuleHandler.handler(param, spiderRule);
22 |     }
23 | 
24 |     @Override
25 |     public void handlerGoods(Param param, String htmlSource) {
26 |         IPMessage ipMessage = (IPMessage) param.getExtParamMap().get("ipMessage");
27 |         String ipType = ipMessage.getType();
28 |         String ipSpeed = ipMessage.getSpeed();
29 |         ipSpeed = ipSpeed.substring(0, ipSpeed.indexOf('秒'));
30 |         double speed = Double.parseDouble(ipSpeed);
31 |         if (ipType.equals("HTTPS") && speed <= 2.0) {
32 |             RedisUtil.setOneIp(ipMessage);
33 |         }
34 | 
35 |     }
36 | }
37 | 
38 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/spider/docrawler/ProxyXcDoMain.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.proxyip.spider.docrawler;
 2 | 
 3 | import com.xyzj.crawler.framework.entity.Param;
 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 6 | import com.xyzj.crawler.framework.runnable.SpiderRunnable;
 7 | import com.xyzj.crawler.utils.proxyip.spider.doRule.ProxyXcSpiderRule;
 8 | import lombok.extern.slf4j.Slf4j;
 9 | 
10 | import java.util.concurrent.ExecutorService;
11 | 
import java.util.concurrent.Executors; 12 | 13 | @Slf4j 14 | public class ProxyXcDoMain { 15 | 16 | 17 | public static void main(String[] args) throws Exception { 18 | log.info("开始采集有效代理"); 19 | String baseUrl = "https://www.xicidaili.com/nn/"; 20 | ExecutorService executorService = Executors.newFixedThreadPool(3); 21 | for (int i = 0; i <5; i++) { 22 | Param param = new Param(); 23 | param.setWebUrl(baseUrl+i); 24 | param.getExtParamMap().put("targetUrl", "https://www.baidu.com"); 25 | ISpiderRule spiderRule = new SpiderRuleFactory(new ProxyXcSpiderRule()).getInstance(); 26 | //spiderRule 参数 27 | SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, param); 28 | executorService.execute(spiderRunnable); 29 | } 30 | //等到任务执行完毕,关闭线程池。 31 | executorService.shutdown(); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/entity/Param.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.entity; 2 | 3 | import avro.shaded.com.google.common.collect.Maps; 4 | import com.xyzj.crawler.framework.enums.FactionEnum; 5 | import java.util.Map; 6 | import java.util.concurrent.CountDownLatch; 7 | import lombok.Data; 8 | 9 | /** 10 | * ================================================== 11 | *

12 | * FileName: Param 13 | * 14 | * @description: 15 | * @author: lyy 16 | * @create: 2019/6/28 17 | * @since: 1.0.0 18 | *

19 | * ================================================== 20 | */ 21 | @Data 22 | public class Param { 23 | 24 | /** 请求地址 */ 25 | private String webUrl; 26 | 27 | /** 网页编码 */ 28 | private String charset = "utf-8"; 29 | 30 | /** 请求头信息 模拟登陆*/ 31 | private Map headerInfos = Maps.newHashMap(); 32 | 33 | /** 请求体信息 post json参数*/ 34 | private Map bodyParams = Maps.newHashMap(); 35 | 36 | /** 指定源码获取方法 */ 37 | private FactionEnum factionEnum= FactionEnum.getHtml; 38 | 39 | /** 页面加载延迟时间单位 毫秒 */ 40 | private Integer delayTime; 41 | 42 | /** 计数器锁 */ 43 | private CountDownLatch countDownLatch; 44 | 45 | 46 | /** isProxy */ 47 | private Boolean isProxy = false; 48 | 49 | /** 代理ip */ 50 | private String proxyIp; 51 | 52 | /** 代理port */ 53 | private String proxyPort; 54 | 55 | /** 文件保存路径 */ 56 | private String fileFullName; 57 | 58 | /** 59 | * 其余定制参数 60 | */ 61 | private Map extParamMap = Maps.newHashMap(); 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/gethtmlstring/UrlUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.gethtmlstring; 2 | 3 | import java.io.UnsupportedEncodingException; 4 | /** 5 | * url转码、解码 6 | * 7 | */ 8 | public class UrlUtil { 9 | private final static String ENCODE = "utf-8"; 10 | /** 11 | * URL 解码 12 | * 13 | * @return String 14 | * @author lifq 15 | * @date 2015-3-17 下午04:09:51 16 | */ 17 | private static String getURLDecoderString(String str) { 18 | String result = ""; 19 | if (null == str) { 20 | return ""; 21 | } 22 | try { 23 | result = java.net.URLDecoder.decode(str, ENCODE); 24 | } catch (UnsupportedEncodingException e) { 25 | e.printStackTrace(); 26 | } 27 | return result; 28 | } 29 | /** 30 | * URL 转码 31 | * 32 | * @return String 33 | * @author lifq 34 | * @date 2015-3-17 下午04:10:28 35 | */ 36 | public static String getURLEncoderString(String str) { 37 | String result = ""; 38 | if (null == str) { 39 | return ""; 40 | } 41 | try { 42 | result = java.net.URLEncoder.encode(str, ENCODE); 43 | } catch (UnsupportedEncodingException e) { 44 | e.printStackTrace(); 45 | } 46 | return result; 47 | } 48 | 49 | /** 50 | * 51 | * @author lifq 52 | * @date 2015-3-17 下午04:09:16 53 | */ 54 | public static void main(String[] args) { 55 | String str = "测试1"; 56 | System.out.println(getURLEncoderString(str)); 57 | System.out.println(getURLDecoderString(str)); 58 | 59 | } 60 | 61 | } -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/gethtmlstring/SpiderUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.gethtmlstring; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.io.ByteArrayOutputStream; 7 | import java.io.InputStream; 8 | import java.net.HttpURLConnection; 9 | import java.net.URL; 10 | 11 | /** 12 | * 爬虫工具类 13 | * 14 | */ 15 | public final class SpiderUtil { 16 | private static final Logger THREAD_LOG = LoggerFactory.getLogger(SpiderUtil.class); 17 | 18 | private static final byte[] getImageFromNetByUrl(final String strUrl) { 19 | try { 20 | URL url = new URL("http:" + strUrl); 21 | if(strUrl.startsWith("http:")) { 22 | url = new URL(strUrl); 23 | } 24 | HttpURLConnection conn = (HttpURLConnection) url.openConnection(); 25 | conn.setRequestMethod("GET"); 26 | conn.setConnectTimeout(5 * 1000); 27 | InputStream inStream = conn.getInputStream();// 
通过输入流获取图片数据
28 |             byte[] btImg = readInputStream(inStream);// 得到图片的二进制数据
29 |             return btImg;
30 |         } catch (Exception e) {
31 |             THREAD_LOG.info("爬虫的getImageFromNetByUrl方法报错:{}", e.getMessage());
32 |         }
33 |         return null;
34 |     }
35 | 
36 |     public static final byte[] readInputStream(InputStream inStream) throws Exception {
37 |         ByteArrayOutputStream outStream = new ByteArrayOutputStream();
38 |         byte[] buffer = new byte[1024];
39 |         int len = 0;
40 |         while ((len = inStream.read(buffer)) != -1) {
41 |             outStream.write(buffer, 0, len);
42 |         }
43 |         inStream.close();
44 |         return outStream.toByteArray();
45 |     }
46 | 
47 | }
48 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/framework/defaults/DefaultSpiderRule.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.framework.defaults;
 2 | 
 3 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract;
 4 | import com.xyzj.crawler.framework.entity.Goods;
 5 | import com.xyzj.crawler.framework.entity.Param;
 6 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler;
 7 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 8 | import com.xyzj.crawler.utils.parsehtmlstring.RegexUtil;
 9 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql;
10 | import java.util.List;
11 | import lombok.extern.slf4j.Slf4j;
12 | import org.springframework.util.CollectionUtils;
13 | 
14 | /**
15 |  * @author lyy
16 |  * @since 2018-10-27 13:08
17 |  */
18 | @Slf4j
19 | public class DefaultSpiderRule extends SpiderRuleAbstract {
20 | 
21 |     public void runSpider(Param param, ISpiderRule spiderRule) {
22 |         SpiderRuleHandler spiderRuleHandler = new SpiderRuleHandler();
23 |         spiderRuleHandler.handler(param, spiderRule);
24 |     }
25 | 
26 |     public void handlerGoods(Param param, String htmlSource) {
27 |         String regexPattern = "([\\s\\S]*)";
28 |         List<String> stringList = RegexUtil.getSubUtil(htmlSource, regexPattern);
29 |         if (CollectionUtils.isEmpty(stringList)) {
30 |             log.info("没有匹配需要的内容......");
31 |             return;
32 |         }
33 |         Goods saveGoods = new Goods();
34 |         saveGoods.setWebUrl(param.getWebUrl());
35 |         SaveToMysql saveToMysql = new SaveToMysql();
36 |         for (int i = 0; i < stringList.size(); i++) {
37 |             saveGoods.setType(Integer.toString(i + 1));
38 |             saveGoods.setName(stringList.get(i));
39 |             saveToMysql.saveToMasql("goods", saveGoods);
40 |         }
41 |     }
42 | }
43 | 
44 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/config/RedisUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.proxyip.config;
 2 | 
 3 | 
 4 | import com.xyzj.crawler.utils.proxyip.IPModel.IPMessage;
 5 | import com.xyzj.crawler.utils.proxyip.IPModel.SerializeUtil;
 6 | import lombok.extern.slf4j.Slf4j;
 7 | import redis.clients.jedis.Jedis;
 8 | 
 9 | 
10 | @Slf4j
11 | public class RedisUtil {
12 |     public static Jedis jedis = RedisConfig.getJedis();
13 | 
14 |     /**
15 |      * ========================================
16 |      *
17 |      * @description: 保存ip信息到redis队列
18 |      * @author: lyy
19 |      * @param:
20 |      * @return:
21 |      * @exception:
22 |      * @create: 2019/7/4 10:10
23 |      *

24 | * ======================================== 25 | */ 26 | public static void setOneIp(IPMessage ipMessage) { 27 | //首先将ipMessage进行序列化 28 | byte[] bytes = SerializeUtil.serialize(ipMessage); 29 | jedis.rpush("IpPool".getBytes(), bytes); 30 | } 31 | 32 | /** 33 | * ======================================== 34 | * 35 | * @description: 从队列中取出ip信息 36 | * @author: lyy 37 | * @param: 38 | * @return: 39 | * @exception: 40 | * @create: 2019/7/4 10:11 41 | *

42 |      * ========================================
43 |      */
44 |     public static IPMessage getOneIp() {
45 |         byte[] bytes = jedis.lpop("IpPool".getBytes());
46 |         if (bytes != null) {
47 |             Object o = SerializeUtil.unSerialize(bytes);
48 |             if (o instanceof IPMessage) {
49 |                 return (IPMessage) o;
50 |             }
51 |         }
52 |         return null;
53 |     }
54 | 
55 |     public static void deleteKey(String key) {
56 |         jedis.del(key);
57 |     }
58 | 
59 |     public static void close() {
60 |         RedisConfig.close(jedis);
61 |     }
62 | }
63 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/packageutil/PackageGetClassUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.packageutil;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.FileOutputStream;
 6 | import java.io.IOException;
 7 | 
 8 | /**
 9 |  * @author lyy
10 |  * @since 2019-09-18 13:01
11 |  * Replaces the .java sources under SRC_PATH with the matching .class files
12 |  * taken from the compiler output directory TARGET_PATH
13 |  */
14 | public class PackageGetClassUtil {
15 |     private static final String SRC_PATH = "C:\\Users\\quling\\Desktop\\java";
16 |     private static final String TARGET_PATH = "D:\\IdeaProjects\\demo2\\out\\production\\classes";
17 | 
18 |     public static void main(String[] args) throws Exception {
19 |         try{
20 |             fileReplace(new File(SRC_PATH),SRC_PATH,TARGET_PATH);
21 |         }catch(Exception e){
22 |             e.printStackTrace();
23 |         }
24 |         System.out.println("类提取完成");
25 | 
26 |     }
27 | 
28 |     private static void fileReplace(File base, String sourcePath, String targetPath) throws IOException {
29 |         if (!base.exists() || base.getName().contains(".txt")){
30 |             return;
31 |         }
32 |         if (base.isDirectory()) {
33 |             File[] files = base.listFiles();
34 |             for (File file : files) {
35 |                 fileReplace(file, sourcePath, targetPath);
36 |             }
37 |         } else {
38 |             String path = base.getPath();
39 |             String tempPath = path = path.replace(".java", ".class");
40 |             System.out.println(base.getName());
41 |             base.delete();
42 |             path = targetPath + path.substring(sourcePath.length());
43 |             FileInputStream in = new FileInputStream(new File(path));
44 |             FileOutputStream out = new FileOutputStream(new File(tempPath));
45 |             int i;
46 |             while ((i = in.read()) != -1) {
47 |                 out.write(i);
48 |             }
49 |             in.close();
50 |             out.close();
51 |         }
52 |     }
53 | }
54 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/bigdata/in/DemoDataListener.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.bigdata.in;
 2 | 
 3 | import com.alibaba.excel.context.AnalysisContext;
 4 | import com.alibaba.excel.event.AnalysisEventListener;
 5 | import com.xyzj.crawler.utils.savetomysql.SaveToOracle;
 6 | import java.util.ArrayList;
 7 | import java.util.List;
 8 | import org.slf4j.Logger;
 9 | import org.slf4j.LoggerFactory;
10 | 
11 | /**
12 |  * Listener used while reading the template
13 |  *
14 |  * @author Jiaju Zhuang
15 |  */
16 | public class DemoDataListener extends AnalysisEventListener<DemoData> {
17 |     private static final Logger LOGGER = LoggerFactory.getLogger(DemoDataListener.class);
18 |     /**
19 |      * Flush to the database every 5000 rows, then clear the list so it can be GC'd
20 |      */
21 |     private static final int BATCH_COUNT = 5000;
22 |     List<DemoData> list = new ArrayList<DemoData>();
23 | 
24 |     @Override
25 |     public void invoke(DemoData data, AnalysisContext context) {
26 |         list.add(data);
27 |         if (list.size() >= BATCH_COUNT) {
28 |             saveData();
29 |             list.clear();
30 |         }
31 |     }
32 | 
33 |     @Override
34 |     public void doAfterAllAnalysed(AnalysisContext context) {
35 |         saveData();
36 
| LOGGER.info("所有数据解析完成!"); 37 | } 38 | 39 | /** 40 | * 加上存储数据库 41 | */ 42 | private void saveData() { 43 | LOGGER.info("{}条数据,开始存储数据库!", list.size()); 44 | String sql = "INSERT INTO my_test " + 45 | "(name, birthday, age) VALUES (?, ?, ?)"; 46 | List param = new ArrayList<>(); 47 | 48 | for(int i=0;i headerInfos) { 18 | HttpURLConnection connection = null; 19 | try { 20 | // 创建远程url连接对象 21 | URL url = new URL(httpUrl); 22 | // 通过远程url连接对象打开一个连接,强转成httpURLConnection类 23 | connection = (HttpURLConnection) url.openConnection(); 24 | // 设置连接主机服务器的超时时间:15000毫秒 25 | int connectTimeout = 15000; 26 | // 设置读取远程返回的数据时间:60000毫秒 27 | int readTimeout = 60000; 28 | // 设置连接方式:get 29 | connection.setRequestMethod("GET"); 30 | connection.setConnectTimeout(connectTimeout); 31 | connection.setReadTimeout(readTimeout); 32 | // 遍历map 设置请求头信息 33 | if (!CollectionUtils.isEmpty(headerInfos)) { 34 | for (String key : headerInfos.keySet()) { 35 | connection.setRequestProperty(key, headerInfos.get(key)); 36 | } 37 | } 38 | // 发送请求 39 | connection.connect(); 40 | if (connection.getResponseCode() == 200) { 41 | baseHttpCallBack.httpCallBack(connection.getResponseCode(), connection.getInputStream()); 42 | } 43 | } catch (MalformedURLException e) { 44 | e.printStackTrace(); 45 | baseHttpCallBack.httpCallBack(-1, null); 46 | } catch (IOException e) { 47 | e.printStackTrace(); 48 | baseHttpCallBack.httpCallBack(-2, null); 49 | } finally { 50 | // 关闭远程连接 51 | if (connection != null) { 52 | connection.disconnect(); 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/spidertask/example/docrawler/DoCrawler58.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.spidertask.example.docrawler; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory; 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 6 | import com.xyzj.crawler.framework.runnable.SpiderRunnable; 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import java.util.concurrent.CountDownLatch; 10 | import java.util.concurrent.ExecutorService; 11 | import java.util.concurrent.Executors; 12 | import lombok.extern.slf4j.Slf4j; 13 | 14 | /** 15 | * 线程池抓取 16 | * 单个线程抓取某一个页面 17 | * 翻页的情况,开启线程池 18 | * 19 | * 58页面。 20 | * 21 | * @author lyy 22 | * @since 2018-10-27 18:14 23 | */ 24 | @Slf4j 25 | public class DoCrawler58 { 26 | 27 | private static final int THREAD_COUNT = 10; 28 | 29 | public static void main(String[] args) { 30 | //总记录数 31 | Integer totalCount =3199; 32 | //每页数 33 | Integer pageSize = 30; 34 | //目标数量 107 35 | Integer pageCount = totalCount / pageSize + 1; 36 | //开启一个线程池 37 | ExecutorService executorService = Executors.newFixedThreadPool(THREAD_COUNT); 38 | //计数器锁 39 | CountDownLatch countDownLatch = new CountDownLatch(pageCount); 40 | for(int i=1;i<=pageCount;i++) { 41 | Map params = new HashMap<>(); 42 | //目标url 43 | String webUrl = "https://cq.58.com/shouji/pn"+i+"/?PGTID=0d300024-0002-5274-9167-f56e706b72b9&ClickID=1"; 44 | Param param = new Param(); 45 | param.setWebUrl(webUrl); 46 | param.setCountDownLatch(countDownLatch); 47 | 48 | //spiderRule 规则 49 | ISpiderRule spiderRule = new SpiderRuleFactory().getInstance(); 50 | //spiderRule 参数 51 | SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule,param); 52 | executorService.execute(spiderRunnable); 53 | } 54 | //等到任务执行完毕,关闭线程池。 55 | executorService.shutdown(); 
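// NOTE: shutdown() only stops the pool from accepting new tasks; it does not wait.
// The latch await() below is what actually blocks until every page task has
// called countDown() (see the finally block in SpiderRuleHandler).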
56 | try { 57 | countDownLatch.await(); 58 | } catch (InterruptedException e) { 59 | log.error("出毛病了{}",e); 60 | } 61 | log.info("main --爬完了"); 62 | } 63 | 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/ParseTsUrls.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import com.xyzj.crawler.utils.gethtmlstring.BaseHttpCallBack; 4 | import com.xyzj.crawler.utils.gethtmlstring.M3u8HttpClientUtil; 5 | import java.io.BufferedReader; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.InputStreamReader; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import java.util.Map; 12 | import lombok.extern.slf4j.Slf4j; 13 | import org.apache.commons.collections.CollectionUtils; 14 | 15 | /** 16 | * 解析ts路径 17 | * @author liuyangyang 18 | * */ 19 | @Slf4j 20 | public class ParseTsUrls implements BaseHttpCallBack { 21 | 22 | private String httpUrl; 23 | 24 | private String fileName; 25 | 26 | private Map headerInfos; 27 | 28 | private List tsUrlList = new ArrayList(); 29 | 30 | public ParseTsUrls(String httpUrl, Map headerInfos,String fileName){ 31 | this.httpUrl = httpUrl; 32 | this.headerInfos = headerInfos; 33 | this.fileName = fileName; 34 | } 35 | 36 | public void httpRequestForTsUrls(){ 37 | log.info("正在发送请求:httpUrl={}",httpUrl); 38 | M3u8HttpClientUtil.doGet(httpUrl,this,headerInfos); 39 | if (CollectionUtils.isNotEmpty(tsUrlList)) { 40 | new DownloadTsFile(tsUrlList, headerInfos, fileName).download(); 41 | } else { 42 | log.info("没有拿到ts路径,请检查..."); 43 | } 44 | } 45 | 46 | @Override 47 | public void httpCallBack(int responseCode,InputStream inputStream) { 48 | log.info("开始解析TS路径....."); 49 | if(responseCode == 200){ 50 | try { 51 | // 封装输入流is,并指定字符集 52 | BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); 53 | String lineStr = null; 54 | while ((lineStr = br.readLine()) != null) { 55 | if (lineStr.contains("http") && lineStr.contains(".ts")) { 56 | tsUrlList.add(lineStr); 57 | } 58 | } 59 | } catch (IOException e) { 60 | log.error("解析ts出错了 e={}",e); 61 | } 62 | 63 | log.info("解析TS路径完成....."); 64 | } 65 | } 66 | 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/proxyip/spider/doRule/ProxyXcSpiderRule.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.proxyip.spider.doRule; 2 | 3 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract; 4 | import com.xyzj.crawler.framework.entity.Param; 5 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory; 6 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler; 7 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 8 | import com.xyzj.crawler.utils.proxyip.IPModel.IPMessage; 9 | import lombok.extern.slf4j.Slf4j; 10 | import org.jsoup.Jsoup; 11 | import org.jsoup.nodes.Document; 12 | import org.jsoup.select.Elements; 13 | 14 | /** 15 | * @author lyy 16 | * @since 2018-10-27 13:08 17 | */ 18 | @Slf4j 19 | public class ProxyXcSpiderRule extends SpiderRuleAbstract { 20 | @Override 21 | public void runSpider(Param param, ISpiderRule spiderRule) { 22 | SpiderRuleHandler spiderRuleHandler = new SpiderRuleHandler(); 23 | spiderRuleHandler.handler(param, spiderRule); 24 | } 25 | 26 | @Override 27 | public void 
handlerGoods(Param param, String htmlSource) { 28 | Document document = Jsoup.parse(htmlSource); 29 | Elements trs = document.select("table[id=ip_list]").select("tbody").select("tr"); 30 | for (int i = 1; i < trs.size(); i++) { 31 | String newIp = trs.get(i).select("td").get(1).text(); 32 | String newPort = trs.get(i).select("td").get(2).text(); 33 | String newType = trs.get(i).select("td").get(5).text(); 34 | String newSpeed = trs.get(i).select("td").get(6).select("div[class=bar]").attr("title"); 35 | //取得单个ip 36 | IPMessage ipMessage = new IPMessage(); 37 | ipMessage.setIp(newIp); 38 | ipMessage.setPort(newPort); 39 | ipMessage.setType(newType); 40 | ipMessage.setSpeed(newSpeed); 41 | 42 | ISpiderRule spiderRule = new SpiderRuleFactory(new ProxyFilterSpiderRule()).getInstance(); 43 | Param newParam = new Param(); 44 | newParam.setProxyIp(newIp); 45 | newParam.setProxyPort(newPort); 46 | newParam.setWebUrl(String.valueOf(param.getExtParamMap().get("targetUrl"))); 47 | newParam.getExtParamMap().put("ipMessage",ipMessage); 48 | spiderRule.runSpider(newParam,spiderRule); 49 | } 50 | } 51 | } 52 | 53 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/DownloadTsFile.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import com.xyzj.crawler.utils.gethtmlstring.BaseHttpCallBack; 4 | import com.xyzj.crawler.utils.gethtmlstring.M3u8HttpClientUtil; 5 | import java.io.File; 6 | import java.io.FileNotFoundException; 7 | import java.io.FileOutputStream; 8 | import java.io.IOException; 9 | import java.io.InputStream; 10 | import java.util.List; 11 | import java.util.Map; 12 | import lombok.extern.slf4j.Slf4j; 13 | import org.springframework.util.StringUtils; 14 | 15 | /** 16 | * 17 | * @author liuyangyang 18 | * */ 19 | @Slf4j 20 | public class DownloadTsFile implements BaseHttpCallBack { 21 | private List tsUrlList; 22 | 23 | private Map headerInfos; 24 | 25 | private String fileName; 26 | 27 | private FileOutputStream fileOutputStream=null; 28 | 29 | public DownloadTsFile(List tsUrlList, Map headerInfos, String fileName){ 30 | this.tsUrlList=tsUrlList; 31 | this.headerInfos = headerInfos; 32 | this.fileName = fileName; 33 | } 34 | 35 | 36 | public void download(){ 37 | log.info("开始生成文件,请等待......"); 38 | if(!StringUtils.isEmpty(tsUrlList)){ 39 | try { 40 | fileOutputStream = new FileOutputStream(new File(fileName)); 41 | } catch (FileNotFoundException e) { 42 | log.info("输出流创建异常 e={}",e); 43 | } 44 | for (String url:tsUrlList) { 45 | M3u8HttpClientUtil.doGet(url,this,headerInfos); 46 | } 47 | if(fileOutputStream!=null){ 48 | try { 49 | fileOutputStream.close(); 50 | } catch (IOException e) { 51 | log.info("输出流关闭异常 e={}",e); 52 | } 53 | } 54 | } 55 | } 56 | 57 | @Override 58 | public void httpCallBack(int responseCode, InputStream inputStream) { 59 | if(responseCode == 200){ 60 | byte[] tempBytes = new byte[100]; 61 | int byteRead = 0; 62 | try { 63 | while ((byteRead = inputStream.read(tempBytes)) != -1) { 64 | fileOutputStream.write(tempBytes, 0, byteRead); 65 | } 66 | } catch (IOException e) { 67 | log.info("文件合并异常 e={}",e); 68 | } 69 | } 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/RegexUtil.java: -------------------------------------------------------------------------------- 1 | package 
com.xyzj.crawler.utils.parsehtmlstring;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | import java.util.regex.Matcher;
 6 | import java.util.regex.Pattern;
 7 | 
 8 | /**
 9 |  * Regex helpers for matching the content between two strings
10 |  * @author Administrator
11 |  *
12 |  */
13 | public class RegexUtil {
14 | 
15 | 	public static void main(String[] args) {
16 | 		String str = "w764e:\1.xml1单据w764开始处理...单据w764处理完毕!2017.09-记账凭证-1w1007e:\1.xml1单据w1007开始处理...单据w1007处理完毕!2017.10-记账凭证-1w516e:\1.xml1单据w516开始处理...单据w516处理完毕!2017.07-记账凭证-50";
17 | 		str = "[{ \\\"CretType\":\"name\"}]";
18 | 		System.out.println(str);
19 | 		String rgex = "CretType\":\"(.*?)\"";
20 | 
21 | 		System.out.println((RegexUtil.getSubUtil(str,rgex)));
22 | 		List<String> lists = RegexUtil.getSubUtil(str,rgex);
23 | 		for (String string : lists) {
24 | 			System.out.println(string);
25 | 		}
26 | 		System.out.println(RegexUtil.getSubUtilSimple(str, rgex));
27 | 	}
28 | 
29 | 	/**
30 | 	 * Returns every match of the pattern's first capture group
31 | 	 * @param soap
32 | 	 * @return
33 | 	 */
34 | 	public static List<String> getSubUtil(String soap,String rgex){
35 | 		List<String> list = new ArrayList<String>();
36 | 		Pattern pattern = Pattern.compile(rgex);// the pattern to match
37 | 		Matcher m = pattern.matcher(soap);
38 | 		while (m.find()) {
39 | 			list.add(m.group(1));
40 | 		}
41 | 		return list;
42 | 	}
43 | 
44 | 	/**
45 | 	 * Returns a single string: the first match if there are several; otherwise behaves like getSubUtil
46 | 	 * @param soap
47 | 	 * @param rgex
48 | 	 * @return
49 | 	 */
50 | 	public static String getSubUtilSimple(String soap,String rgex){
51 | 		Pattern pattern = Pattern.compile(rgex);// the pattern to match
52 | 		Matcher m = pattern.matcher(soap);
53 | 		while(m.find()){
54 | 			return m.group(1);
55 | 		}
56 | 		return "";
57 | 	}
58 | }
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/spidertask/zlr/docrawler/SsqDoErrorMain.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.spidertask.zlr.docrawler;
 2 | 
 3 | import com.xyzj.crawler.framework.entity.Param;
 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 6 | import com.xyzj.crawler.framework.runnable.SpiderRunnable;
 7 | import com.xyzj.crawler.spidertask.zlr.dorule.SsqDetailSpiderRule;
 8 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql;
 9 | 
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.concurrent.CountDownLatch;
13 | import java.util.concurrent.ExecutorService;
14 | import java.util.concurrent.Executors;
15 | import lombok.extern.slf4j.Slf4j;
16 | import org.springframework.util.CollectionUtils;
17 | import org.springframework.util.StringUtils;
18 | 
19 | @Slf4j
20 | public class SsqDoErrorMain {
21 |     public static void main(String[] args) throws Exception {
22 |         log.info("尝试重新拉取数据");
23 |         reTry();
24 |     }
25 | 
26 |     public static void reTry() {
27 |         SaveToMysql query = new SaveToMysql();
28 |         List<Map<String, Object>> mapList = query.queryBySql("select * from ungoods");
29 |         if (CollectionUtils.isEmpty(mapList)) {
30 |             log.info("无失败记录......");
31 |             return;
32 |         }
33 |         // clear the failure table before retrying
34 |         query.executeBySql("delete from ungoods");
35 |         //latch counting one task per failed record
36 |         CountDownLatch countDownLatch = new CountDownLatch(mapList.size());
37 |         ExecutorService executorService = Executors.newFixedThreadPool(3);
38 |         for (Map<String, Object> map : mapList) {
39 |             String url = String.valueOf(map.get("webUrl"));
40 |             if (!StringUtils.isEmpty(url)) {
41 |                 // a record to retry
42 |                 log.info("url={}", url);
43 |                 Param param = new Param();
44 |                 param.setWebUrl(url);
45 |                 param.setCountDownLatch(countDownLatch);
46 | 
47 |                 //the detail-page rule
48 |                 SsqDetailSpiderRule ssqDetailSpiderRule = new SsqDetailSpiderRule();
49 |                 ISpiderRule spiderRule = new SpiderRuleFactory(ssqDetailSpiderRule).getInstance();
50 | 
51 |                 //task = rule + params
52 |                 SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, param);
53 |                 executorService.execute(spiderRunnable);
54 |             }
55 |         }
56 |         //stop accepting new tasks, then wait on the latch
57 |         executorService.shutdown();
58 |         try {
59 |             countDownLatch.await();
60 |         } catch (InterruptedException e) {
61 |             log.info("出毛病......");
62 |         }
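// NOTE: this recursion only ends once ungoods stays empty; if some URLs keep
// failing, reTry() will recurse (and re-crawl) forever. A bounded retry count
// would be a safer loop.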
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/spidertask/zlr/docrawler/SsqDoErrorMain.java:
--------------------------------------------------------------------------------
1 | package com.xyzj.crawler.spidertask.zlr.docrawler;
2 | 
3 | import com.xyzj.crawler.framework.entity.Param;
4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
6 | import com.xyzj.crawler.framework.runnable.SpiderRunnable;
7 | import com.xyzj.crawler.spidertask.zlr.dorule.SsqDetailSpiderRule;
8 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql;
9 | 
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.concurrent.CountDownLatch;
13 | import java.util.concurrent.ExecutorService;
14 | import java.util.concurrent.Executors;
15 | import lombok.extern.slf4j.Slf4j;
16 | import org.springframework.util.CollectionUtils;
17 | import org.springframework.util.StringUtils;
18 | 
19 | @Slf4j
20 | public class SsqDoErrorMain {
21 |     public static void main(String[] args) throws Exception {
22 |         log.info("Re-fetching previously failed records");
23 |         reTry();
24 |     }
25 | 
26 |     public static void reTry() {
27 |         SaveToMysql query = new SaveToMysql();
28 |         List<Map<String, Object>> mapList = query.queryBySql("select * from ungoods");
29 |         if (CollectionUtils.isEmpty(mapList)) {
30 |             log.info("no failed records left......");
31 |             return;
32 |         }
33 |         // clear the retry queue; anything that fails again is re-inserted by the crawl
34 |         query.executeBySql("delete from ungoods");
35 |         // countdown latch, one count per record to retry
36 |         CountDownLatch countDownLatch = new CountDownLatch(mapList.size());
37 |         ExecutorService executorService = Executors.newFixedThreadPool(3);
38 |         for (Map<String, Object> map : mapList) {
39 |             String url = String.valueOf(map.get("webUrl"));
40 |             if (!StringUtils.isEmpty(url)) {
41 |                 // found a record worth retrying
42 |                 log.info("url={}", url);
43 |                 Param param = new Param();
44 |                 param.setWebUrl(url);
45 |                 param.setCountDownLatch(countDownLatch);
46 | 
47 |                 // crawl rule
48 |                 SsqDetailSpiderRule ssqDetailSpiderRule = new SsqDetailSpiderRule();
49 |                 ISpiderRule spiderRule = new SpiderRuleFactory(ssqDetailSpiderRule).getInstance();
50 | 
51 |                 // wrap the rule and its parameters into a runnable and submit it
52 |                 SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, param);
53 |                 executorService.execute(spiderRunnable);
54 |             }
55 |         }
56 |         // stop accepting new tasks, then wait for the submitted ones to finish
57 |         executorService.shutdown();
58 |         try {
59 |             countDownLatch.await();
60 |         } catch (InterruptedException e) {
61 |             Thread.currentThread().interrupt();
62 |             log.error("interrupted while waiting for the retry batch", e);
63 |         }
64 |         // loop until the ungoods table stays empty
65 |         reTry();
66 |     }
67 | 
68 | }
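One detail worth calling out in reTry(): executorService.shutdown() only stops new submissions and returns immediately; it is countDownLatch.await() that blocks until every worker finishes, which assumes each SpiderRunnable counts down the latch it was handed, even on failure. A standalone sketch of the same pattern (runBatch and its tasks are illustrative, not part of the repo):

static void runBatch(List<Runnable> tasks) throws InterruptedException {
    ExecutorService pool = Executors.newFixedThreadPool(3);
    CountDownLatch done = new CountDownLatch(tasks.size());
    for (Runnable task : tasks) {
        pool.execute(() -> {
            try {
                task.run();
            } finally {
                done.countDown(); // always count down, or await() hangs forever
            }
        });
    }
    pool.shutdown(); // no new tasks; does not block
    done.await();    // the actual wait for the whole batch
}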
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/config/RedisConfig.java:
--------------------------------------------------------------------------------
1 | package com.xyzj.crawler.utils.proxyip.config;
2 | 
3 | import java.util.ResourceBundle;
4 | import lombok.extern.slf4j.Slf4j;
5 | import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
6 | import redis.clients.jedis.Jedis;
7 | import redis.clients.jedis.JedisPool;
8 | 
9 | @Slf4j
10 | public class RedisConfig {
11 | 
12 |     private static GenericObjectPoolConfig config = null;
13 |     private static String addr;
14 |     private static int port;
15 | 
16 |     private static JedisPool jedisPool;
17 | 
18 |     // load configuration (conf.properties on the classpath)
19 |     private static ResourceBundle resourceBundle = ResourceBundle.getBundle("conf");
20 | 
21 |     // initialise the pool configuration
22 |     static {
23 |         addr = resourceBundle.getString("jedis.addr");
24 |         port = Integer.parseInt(resourceBundle.getString("jedis.port"));
25 |         try {
26 |             // parameters for the jedis connection pool
27 |             config = new GenericObjectPoolConfig();
28 |             // block when the pool is exhausted (default true); the borrow then throws once the wait times out
29 |             config.setBlockWhenExhausted(true);
30 |             // eviction policy class applied when connections exceed the idle limits
31 |             config.setEvictionPolicyClassName("org.apache.commons.pool2.impl.DefaultEvictionPolicy");
32 |             // enable the pool's JMX management (default true)
33 |             config.setJmxEnabled(true);
34 |             // maximum idle Jedis instances kept in the pool (default 8)
35 |             config.setMaxIdle(8);
36 |             // maximum total connections
37 |             config.setMaxTotal(100);
38 |             // maximum wait when borrowing an instance; throws once exceeded
39 |             config.setMaxWaitMillis(1000 * 10);
40 |             // validate a connection (ping) every time one is borrowed
41 |             config.setTestOnBorrow(true);
42 |         } catch (Exception e) {
43 |             log.error("failed to initialise the jedis pool config", e);
44 |         }
45 |     }
46 | 
47 |     /**
48 |      * ========================================
49 |      *
50 |      * @description: obtain a Jedis instance (creates the pool lazily)
51 |      * @author: lyy
52 |      * @create: 2019/7/4 10:06
53 |      *
54 |      * ========================================
55 |      */
56 |     public synchronized static Jedis getJedis() {
57 |         if (jedisPool == null) {
58 |             jedisPool = new JedisPool(config, addr, port);
59 |         }
60 |         return jedisPool.getResource();
61 |     }
62 | 
63 |     /**
64 |      * ========================================
65 |      *
66 |      * @description: release a Jedis connection back to the pool
67 |      * @author: lyy
68 |      * @create: 2019/7/4 10:06
69 |      *
70 |      * ========================================
71 |      */
72 |     public static void close(final Jedis jedis) {
73 |         if (jedis != null) {
74 |             jedis.close();
75 |         }
76 |     }
77 | }
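A borrow/return sketch for RedisConfig (the key and value are illustrative):

Jedis jedis = null;
try {
    jedis = RedisConfig.getJedis();   // lazily creates the pool on first use
    jedis.set("proxy:last-check", "ok");
} finally {
    RedisConfig.close(jedis);         // hands the connection back to the pool
}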
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/spidertask/zlr/docrawler/SsqDoMain.java:
--------------------------------------------------------------------------------
1 | package com.xyzj.crawler.spidertask.zlr.docrawler;
2 | 
3 | import avro.shaded.com.google.common.collect.Lists;
4 | import com.xyzj.crawler.framework.entity.Goods;
5 | import com.xyzj.crawler.framework.entity.Param;
6 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
7 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
8 | import com.xyzj.crawler.framework.runnable.SpiderRunnable;
9 | import com.xyzj.crawler.spidertask.zlr.dorule.SsqDetailSpiderRule;
10 | import com.xyzj.crawler.utils.gethtmlstring.HttpResponseUtil;
11 | import com.xyzj.crawler.utils.parsehtmlstring.RegexUtil;
12 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql;
13 | import java.util.ArrayList;
14 | import java.util.List;
15 | import java.util.concurrent.ExecutorService;
16 | import java.util.concurrent.Executors;
17 | import lombok.extern.slf4j.Slf4j;
18 | 
19 | @Slf4j
20 | public class SsqDoMain {
21 | 
22 |     public static void main(String[] args) throws Exception {
23 |         log.info("Start crawling the region data");
24 |         // step 1: fetch the page source
25 |         Param param = new Param();
26 |         String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
27 |         param.setWebUrl(baseUrl);
28 |         param.setCharset("gb2312");
29 |         String htmlSource = HttpResponseUtil.getHtmlSource(param);
30 |         if (htmlSource == null) return;
31 | 
32 |         // step 2: extract the region codes and names
33 |         // (the two link patterns below are a best-effort reconstruction; the original
34 |         //  angle-bracket literals were swallowed when this listing was rendered)
35 |         String numberRegexString = "<a href='(.*?).html'>";
36 |         String nameRegexString = ".html'>(.*?)</a>";
37 |         List<String> numUtil = RegexUtil.getSubUtil(htmlSource, numberRegexString);
38 |         List<String> nameUtil = RegexUtil.getSubUtil(htmlSource, nameRegexString);
39 | 
40 |         ArrayList<Goods> goodsList = Lists.newArrayList();
41 |         for (int i = 0; i < numUtil.size(); i++) {
42 |             Goods goods = new Goods();
43 |             goods.setName(nameUtil.get(i));
44 |             goods.setWebUrl(baseUrl + numUtil.get(i) + ".html");
45 |             goods.setOrderNum(numUtil.get(i));
46 |             // step 3: save to the database
47 |             SaveToMysql saveToMysql = new SaveToMysql();
48 |             saveToMysql.saveToMasql("goods", goods);
49 |             goodsList.add(goods);
50 |         }
51 | 
52 |         // step 4: crawl the detail records
53 |         ExecutorService executorService = Executors.newFixedThreadPool(3);
54 |         for (Goods goods : goodsList) {
55 |             Param detailParam = new Param();
56 |             detailParam.setWebUrl(goods.getWebUrl());
57 |             detailParam.setCharset("gb2312");
58 |             // crawl rule
59 |             SsqDetailSpiderRule ssqDetailSpiderRule = new SsqDetailSpiderRule();
60 |             ISpiderRule spiderRule = new SpiderRuleFactory(ssqDetailSpiderRule).getInstance();
61 |             // wrap the rule and its parameters into a runnable and submit it
62 |             SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, detailParam);
63 |             executorService.execute(spiderRunnable);
64 |         }
65 |         // stop accepting new tasks; the submitted ones keep running to completion
66 |         executorService.shutdown();
67 | 
68 |     }
69 | 
70 | }
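To make the two patterns above concrete: assuming the reconstructed regexes, a row such as the one below pairs code "11" with name "北京市", and the detail URL is rebuilt from the base URL (the sample markup is invented for illustration):

String row = "<tr class='provincetr'><td><a href='11.html'>北京市</a></td></tr>"; // made-up sample
String code = RegexUtil.getSubUtilSimple(row, "<a href='(.*?).html'>");  // -> "11"
String name = RegexUtil.getSubUtilSimple(row, ".html'>(.*?)</a>");       // -> "北京市"
String detailUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/" + code + ".html";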
"; 36 | List numUtil = RegexUtil.getSubUtil(htmlSource, numberRegexString); 37 | List nameUtil = RegexUtil.getSubUtil(htmlSource, nameRegexString); 38 | 39 | ArrayList goodsList = Lists.newArrayList(); 40 | for (int i = 0; i < numUtil.size(); i++) { 41 | Goods goods = new Goods(); 42 | goods.setName(nameUtil.get(i)); 43 | goods.setWebUrl(baseUrl + numUtil.get(i) + ".html"); 44 | goods.setOrderNum(numUtil.get(i)); 45 | //第三步 往数据库中存 46 | SaveToMysql saveToMysql = new SaveToMysql(); 47 | saveToMysql.saveToMasql("goods", goods); 48 | goodsList.add(goods); 49 | } 50 | 51 | //第四步 爬取明细记录 52 | ExecutorService executorService = Executors.newFixedThreadPool(3); 53 | for (Goods goods : goodsList) { 54 | Param detailParam = new Param(); 55 | detailParam.setWebUrl(goods.getWebUrl()); 56 | detailParam.setCharset("gb2312"); 57 | //抓取器 58 | SsqDetailSpiderRule ssqDetailSpiderRule = new SsqDetailSpiderRule(); 59 | ISpiderRule spiderRule = new SpiderRuleFactory(ssqDetailSpiderRule).getInstance(); 60 | //spiderRule 参数 61 | SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, detailParam); 62 | executorService.execute(spiderRunnable); 63 | } 64 | //等到任务执行完毕,关闭线程池。 65 | executorService.shutdown(); 66 | 67 | 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/savetomysql/SaveToMysql.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.savetomysql; 2 | 3 | import java.lang.reflect.Field; 4 | import java.lang.reflect.Method; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.ResourceBundle; 8 | import lombok.extern.slf4j.Slf4j; 9 | import org.springframework.jdbc.core.JdbcTemplate; 10 | import org.springframework.jdbc.datasource.DriverManagerDataSource; 11 | 12 | @Slf4j 13 | public class SaveToMysql { 14 | private static JdbcTemplate jdbcTemplate = null; 15 | private final static String MYSQL_URL; 16 | private final static String MYSQL_USERNAME; 17 | private final static String MYSQL_PASSWORD; 18 | 19 | //加载配置文件 20 | private static ResourceBundle resourceBundle = ResourceBundle.getBundle("conf"); 21 | 22 | static { 23 | MYSQL_URL = resourceBundle.getString("mysql.url"); 24 | MYSQL_USERNAME = resourceBundle.getString("mysql.username"); 25 | MYSQL_PASSWORD = resourceBundle.getString("mysql.password"); 26 | 27 | DriverManagerDataSource dataSource = new DriverManagerDataSource(); 28 | dataSource.setDriverClassName("com.mysql.jdbc.Driver"); 29 | dataSource.setUrl(MYSQL_URL); 30 | dataSource.setUsername(MYSQL_USERNAME); 31 | dataSource.setPassword(MYSQL_PASSWORD); 32 | 33 | jdbcTemplate = new JdbcTemplate(dataSource); 34 | } 35 | 36 | public boolean saveToMasql(String tableName,Object object) { 37 | try { 38 | save(sqlBuilder(tableName,object), getValues(object)); 39 | } catch (Exception e) { 40 | log.error("error,exception: {}",e); 41 | } 42 | return true; 43 | } 44 | 45 | public List> queryBySql(String sql) { 46 | return jdbcTemplate.queryForList(sql); 47 | } 48 | 49 | public void executeBySql(String sql) { 50 | jdbcTemplate.execute(sql); 51 | } 52 | 53 | public void batchUpdate(String sql, List param) { 54 | jdbcTemplate.batchUpdate(sql, param); 55 | } 56 | 57 | 58 | 59 | // 取得要执行的sql语句 60 | private static String sqlBuilder(String tableName,Object object) { 61 | Class clazz = object.getClass(); 62 | Field[] fields = clazz.getDeclaredFields(); 63 | 64 | StringBuilder sql = new StringBuilder(); 65 | sql.append("INSERT INTO "); 
66 | sql.append("`"+tableName+"`"); 67 | sql.append("("); 68 | 69 | StringBuilder insertValues = new StringBuilder(); 70 | insertValues.append("values("); 71 | for(int i=1;i clazz = object.getClass(); 86 | Field[] fields = clazz.getDeclaredFields(); 87 | Object[] params = new Object[fields.length - 1]; 88 | for (int i = 1; i < fields.length; i++) { 89 | Method method = (Method) clazz.getMethod("get" + getMethodName(fields[i].getName())); 90 | Object value = method.invoke(object); 91 | params[i - 1] = value; 92 | } 93 | return params; 94 | } 95 | 96 | private static String getMethodName(String fieldName) { 97 | // 把一个字符串的第一个字母大写、效率是最高的 98 | byte[] items = fieldName.getBytes(); 99 | items[0] = (byte) ((char) items[0] - 'a' + 'A'); 100 | return new String(items); 101 | } 102 | 103 | private void save(String sql, Object[] params) { 104 | // 根据模板和sql语句执行数据库操作 105 | try { 106 | int count = jdbcTemplate.update(sql, params); 107 | log.info("插入数据成功......"); 108 | } catch (Exception e) { 109 | log.error("插入异常 exception: {}",e); 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/savetomysql/SaveToOracle.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.savetomysql; 2 | 3 | import java.lang.reflect.Field; 4 | import java.lang.reflect.Method; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.ResourceBundle; 8 | import lombok.extern.slf4j.Slf4j; 9 | import org.springframework.jdbc.core.JdbcTemplate; 10 | import org.springframework.jdbc.datasource.DriverManagerDataSource; 11 | 12 | @Slf4j 13 | public class SaveToOracle { 14 | private static JdbcTemplate jdbcTemplate = null; 15 | private final static String ORACLE_URL; 16 | private final static String ORACLE_USERNAME; 17 | private final static String ORACLE_PASSWORD; 18 | 19 | //加载配置文件 20 | private static ResourceBundle resourceBundle = ResourceBundle.getBundle("conf"); 21 | 22 | static { 23 | ORACLE_URL = resourceBundle.getString("oracle.url"); 24 | ORACLE_USERNAME = resourceBundle.getString("oracle.username"); 25 | ORACLE_PASSWORD = resourceBundle.getString("oracle.password"); 26 | 27 | DriverManagerDataSource dataSource = new DriverManagerDataSource(); 28 | dataSource.setDriverClassName("oracle.jdbc.driver.OracleDriver"); 29 | dataSource.setUrl(ORACLE_URL); 30 | dataSource.setUsername(ORACLE_USERNAME); 31 | dataSource.setPassword(ORACLE_PASSWORD); 32 | 33 | jdbcTemplate = new JdbcTemplate(dataSource); 34 | } 35 | 36 | public boolean saveToOracle(String tableName,Object object) { 37 | try { 38 | save(sqlBuilder(tableName,object), getValues(object)); 39 | } catch (Exception e) { 40 | log.error("error,exception: {}",e); 41 | } 42 | return true; 43 | } 44 | 45 | public List> queryBySql(String sql) { 46 | return jdbcTemplate.queryForList(sql); 47 | } 48 | 49 | public void executeBySql(String sql) { 50 | jdbcTemplate.execute(sql); 51 | } 52 | 53 | public void batchUpdate(String sql, List param) { 54 | jdbcTemplate.batchUpdate(sql, param); 55 | } 56 | 57 | 58 | 59 | // 取得要执行的sql语句 60 | private static String sqlBuilder(String tableName,Object object) { 61 | Class clazz = object.getClass(); 62 | Field[] fields = clazz.getDeclaredFields(); 63 | 64 | StringBuilder sql = new StringBuilder(); 65 | sql.append("INSERT INTO "); 66 | sql.append("`"+tableName+"`"); 67 | sql.append("("); 68 | 69 | StringBuilder insertValues = new StringBuilder(); 70 | 
insertValues.append("values("); 71 | for(int i=1;i clazz = object.getClass(); 86 | Field[] fields = clazz.getDeclaredFields(); 87 | Object[] params = new Object[fields.length - 1]; 88 | for (int i = 1; i < fields.length; i++) { 89 | Method method = (Method) clazz.getMethod("get" + getMethodName(fields[i].getName())); 90 | Object value = method.invoke(object); 91 | params[i - 1] = value; 92 | } 93 | return params; 94 | } 95 | 96 | private static String getMethodName(String fieldName) { 97 | // 把一个字符串的第一个字母大写、效率是最高的 98 | byte[] items = fieldName.getBytes(); 99 | items[0] = (byte) ((char) items[0] - 'a' + 'A'); 100 | return new String(items); 101 | } 102 | 103 | private void save(String sql, Object[] params) { 104 | // 根据模板和sql语句执行数据库操作 105 | try { 106 | int count = jdbcTemplate.update(sql, params); 107 | log.info("插入数据成功......"); 108 | } catch (Exception e) { 109 | log.error("插入异常 exception: {}",e); 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/spidertask/zlr/dorule/SsqDetailSpiderRule.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.spidertask.zlr.dorule; 2 | 3 | import avro.shaded.com.google.common.collect.Lists; 4 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract; 5 | import com.xyzj.crawler.framework.entity.Goods; 6 | import com.xyzj.crawler.framework.entity.Param; 7 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory; 8 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler; 9 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 10 | import com.xyzj.crawler.utils.parsehtmlstring.JsoupHtmlParser; 11 | import com.xyzj.crawler.utils.parsehtmlstring.RegexUtil; 12 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql; 13 | import java.util.ArrayList; 14 | import java.util.Arrays; 15 | import java.util.List; 16 | import lombok.extern.slf4j.Slf4j; 17 | import org.springframework.util.CollectionUtils; 18 | 19 | 20 | @Slf4j 21 | public class SsqDetailSpiderRule extends SpiderRuleAbstract { 22 | 23 | @Override 24 | public void runSpider(Param param, ISpiderRule spiderRule) { 25 | SpiderRuleHandler spiderRuleHandler = new SpiderRuleHandler(); 26 | spiderRuleHandler.handler(param, spiderRule); 27 | } 28 | 29 | @Override 30 | public void handlerGoods(Param param, String htmlSource) { 31 | List citytr = JsoupHtmlParser.getNodeContentBySelector(htmlSource, Arrays.asList("tr.citytr > td"), false); 32 | List countytr = JsoupHtmlParser.getNodeContentBySelector(htmlSource, Arrays.asList("tr.countytr > td"), false); 33 | List towntr = JsoupHtmlParser.getNodeContentBySelector(htmlSource, Arrays.asList("tr.towntr > td"), false); 34 | List villagetr = JsoupHtmlParser.getNodeContentBySelector(htmlSource, Arrays.asList("tr.villagetr > td"), false); 35 | ArrayList allList = Lists.newArrayList(); 36 | if (citytr != null) { 37 | allList.addAll(citytr); 38 | } 39 | if (countytr != null) { 40 | allList.addAll(countytr); 41 | } 42 | if (towntr != null) { 43 | allList.addAll(towntr); 44 | } 45 | if (villagetr != null) { 46 | allList.addAll(villagetr); 47 | } 48 | log.info("allList========"+allList); 49 | if (!CollectionUtils.isEmpty(allList)) { 50 | //判断是否包含城乡分类代码 51 | if (htmlSource.contains("城乡分类代码")) { 52 | for (int i = 0; i < allList.size() ; i = i + 3) { 53 | Goods goods = new Goods(); 54 | goods.setWebUrl(String.valueOf(param.getWebUrl())); 55 | goods.setOrderNum(allList.get(i)); 56 | 
goods.setName(allList.get(i + 2));
57 |                     // step 3: save to the database
58 |                     SaveToMysql saveToMysql = new SaveToMysql();
59 |                     saveToMysql.saveToMasql("goods", goods);
60 |                 }
61 |             } else {
62 |                 for (int i = 0; i < allList.size(); i = i + 2) {
63 |                     Goods goods = new Goods();
64 |                     goods.setWebUrl(String.valueOf(param.getWebUrl()));
65 |                     goods.setOrderNum(allList.get(i));
66 |                     goods.setName(allList.get(i + 1));
67 |                     // step 3: save to the database
68 |                     SaveToMysql saveToMysql = new SaveToMysql();
69 |                     saveToMysql.saveToMasql("goods", goods);
70 |                 }
71 |             }
72 | 
73 | 
74 |         }
75 | 
76 |         // collect the child-page links (the <td><a ...> part of the pattern below is reconstructed; the original literal was partly lost in rendering)
77 |         String urlRegexString = "tr'><td><a href='(.*?).html'>
"; 78 | List urlUtil = RegexUtil.getSubUtil(htmlSource, urlRegexString); 79 | log.info("urlUtil========"+urlUtil); 80 | 81 | //http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/14/01/140108.html 82 | //http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/14/01/08/140108001.html 83 | for (int i = 0; i < urlUtil.size(); i++) { 84 | //设置提供方 85 | String oldWebUrl = String.valueOf(param.getWebUrl()); 86 | String newWebUrl = oldWebUrl.substring(0, oldWebUrl.lastIndexOf("/")+1); 87 | Param newParam = new Param(); 88 | newParam.setWebUrl(newWebUrl+urlUtil.get(i)+".html"); 89 | ISpiderRule spiderRule = new SpiderRuleFactory(new SsqDetailSpiderRule()).getInstance(); 90 | runSpider(newParam,spiderRule); 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/authcode/AuthcodeDistinguisher.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.authcode; 2 | 3 | import net.sourceforge.tess4j.Tesseract; 4 | import net.sourceforge.tess4j.TesseractException; 5 | import org.apache.http.HttpEntity; 6 | import org.apache.http.HttpResponse; 7 | import org.apache.http.client.HttpClient; 8 | import org.apache.http.client.methods.HttpGet; 9 | import org.apache.http.impl.client.DefaultHttpClient; 10 | import org.apache.http.protocol.BasicHttpContext; 11 | import sun.misc.BASE64Decoder; 12 | 13 | import java.io.File; 14 | import java.io.FileOutputStream; 15 | import java.io.InputStream; 16 | import java.io.OutputStream; 17 | 18 | /** 19 | * 20 | * 验证码图片识别工具 21 | * 使用方法如下: 22 | * 23 | *

  • 将本项目下的docs/authcode的tessdata.zip解压至任意目录如z:/tessddta
  • 24 | *
  • 自己生成验证码图片,或者使用docs/authcode/**.png图片,复制至任意目录,如z:/abcd.png
  • 25 | *
  • 设置tesseract的datapath为步骤1中的目录
  • 26 | *
  • 修改main()中的tesseract的doOCR()方法的参数为步骤2中的图片路径
  • 27 | *
  • 运行即可
  • 28 | * 29 | * 30 | * @author liulei@bshf360.com 31 | * @since 2017-07-19 14:26 32 | */ 33 | public class AuthcodeDistinguisher { 34 | 35 | private static void downloadImage() throws Exception { 36 | HttpClient httpClient = new DefaultHttpClient(); 37 | for (int i = 0; i < 10; i++) { 38 | String url = "http://beijing.qd8.com.cn/jobs/ajax/showphone.ashx?v=2JG4ozQd13oYUdXFs0YrOQ%3d%3d"; 39 | HttpGet getMethod = new HttpGet(url); 40 | try { 41 | HttpResponse response = httpClient.execute(getMethod, new BasicHttpContext()); 42 | HttpEntity entity = response.getEntity(); 43 | InputStream instream = entity.getContent(); 44 | OutputStream outstream = new FileOutputStream(new File("d:/", i + ".gif")); 45 | int l = -1; 46 | byte[] tmp = new byte[2048]; 47 | while ((l = instream.read(tmp)) != -1) { 48 | outstream.write(tmp); 49 | } 50 | outstream.close(); 51 | } finally { 52 | getMethod.releaseConnection(); 53 | } 54 | } 55 | 56 | System.out.println("下载验证码完毕!"); 57 | } 58 | 59 | public static String getString(String url) { 60 | Tesseract tesseract = new Tesseract(); 61 | tesseract.setDatapath("D:\\java\\workspace\\learn\\crawler\\tessdata"); 62 | try { 63 | //"http://beijing.qd8.com.cn/jobs/ajax/showphone.ashx?v=2JG4ozQd13oYUdXFs0YrOQ%3d%3d" 64 | File file = new File(url); 65 | String result = tesseract.doOCR(file); 66 | System.out.println(result); 67 | return result; 68 | } catch (TesseractException e) { 69 | System.err.println(e.getMessage()); 70 | } 71 | return null; 72 | } 73 | 74 | 75 | public static void main(String[] a) throws Exception { 76 | downloadImage(); 77 | 78 | /* Tesseract tesseract = new Tesseract(); 79 | tesseract.setDatapath("D:\\java\\workspace\\learn\\crawler\\tessdata"); 80 | try { 81 | //String result = tesseract.doOCR(new File("D:\\java\\workspace\\learn\\crawler\\yzm\\showphone.gif")); 82 | URL url = new URL("http://beijing.qd8.com.cn/jobs/ajax/showphone.ashx?v=2JG4ozQd13oYUdXFs0YrOQ%3d%3d"); 83 | 84 | 85 | String file = url.getFile(); 86 | String result = tesseract.doOCR(file); 87 | System.out.println(result); 88 | } catch (Exception e) { 89 | System.err.println(e.getMessage()); 90 | }*/ 91 | } 92 | 93 | /** 94 | * 将图片的base64字符串生成到指定位置的文件中 95 | * @param imgStr 图片的base64编码字符串 96 | * @param imgFilePath 目标文件位置 97 | */ 98 | public static boolean generateImage(String imgStr, String imgFilePath) {// 对字节数组字符串进行Base64解码并生成图片 99 | if (imgStr == null){ // 图像数据为空 100 | return false; 101 | } 102 | BASE64Decoder decoder = new BASE64Decoder(); 103 | try { 104 | // Base64解码 105 | byte[] bytes = decoder.decodeBuffer(imgStr); 106 | for (int i = 0; i < bytes.length; ++i) { 107 | if (bytes[i] < 0) {// 调整异常数据 108 | bytes[i] += 256; 109 | } 110 | } 111 | // 生成jpeg图片 112 | OutputStream out = new FileOutputStream(imgFilePath); 113 | out.write(bytes); 114 | out.flush(); 115 | out.close(); 116 | return true; 117 | } catch (Exception e) { 118 | return false; 119 | } 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/importfrom/IOUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.importfrom; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.io.StringReader; 10 | import java.util.ArrayList; 11 | import java.util.HashSet; 12 | 13 | /** 14 | * 读取字典时的I/O工具类 15 | 
* 16 | * @author zel 17 | * 18 | */ 19 | public class IOUtil { 20 | public static String readDirOrFile(String filePath, String fileEncoding) { 21 | File f = new File(filePath); 22 | StringBuilder sb = new StringBuilder(); 23 | if (f.isDirectory()) { 24 | File[] files = f.listFiles(); 25 | for (File temp_file : files) { 26 | sb.append(readDirOrFile(temp_file.getAbsolutePath(), 27 | fileEncoding)); 28 | } 29 | return sb.toString(); 30 | } 31 | return readFile(filePath, fileEncoding); 32 | } 33 | 34 | public static ArrayList readDirOrFileToList(String filePath, 35 | String fileEncoding, ArrayList linkList) { 36 | File f = new File(filePath); 37 | if (f.isDirectory()) { 38 | File[] files = f.listFiles(); 39 | for (File temp_file : files) { 40 | linkList.add(readFileWithRegexFilter(temp_file 41 | .getAbsolutePath(), fileEncoding)); 42 | } 43 | return linkList; 44 | } else { 45 | linkList.add(readFileWithRegexFilter(filePath, fileEncoding)); 46 | } 47 | return linkList; 48 | } 49 | 50 | /** 51 | * fileEncoding若为null,则采用系统默认编码 52 | * 53 | * @param filePath 54 | * @param fileEncoding 55 | * @return 56 | */ 57 | public static String readFile(String filePath, String fileEncoding) { 58 | if (fileEncoding == null) { 59 | fileEncoding = System.getProperty("file.encoding"); 60 | } 61 | File file = new File(filePath); 62 | BufferedReader br = null; 63 | 64 | String line = null; 65 | 66 | StringBuilder sb = new StringBuilder(); 67 | 68 | try { 69 | br = new BufferedReader(new InputStreamReader(new FileInputStream( 70 | file), fileEncoding)); 71 | while ((line = br.readLine()) != null) { 72 | // if(line.startsWith("中")) { 73 | sb.append(line + "\n"); 74 | // } 75 | } 76 | // System.out.println("line---"+line); 77 | return sb.toString(); 78 | } catch (Exception e) { 79 | e.printStackTrace(); 80 | } finally { 81 | if (br != null) { 82 | try { 83 | br.close(); 84 | } catch (IOException e) { 85 | e.printStackTrace(); 86 | } 87 | } 88 | } 89 | return null; 90 | } 91 | 92 | public static String readFileWithRegexFilter(String filePath, 93 | String fileEncoding) { 94 | if (fileEncoding == null) { 95 | fileEncoding = System.getProperty("file.encoding"); 96 | } 97 | File file = new File(filePath); 98 | BufferedReader br = null; 99 | 100 | String line = null; 101 | 102 | StringBuilder sb = new StringBuilder(); 103 | 104 | try { 105 | br = new BufferedReader(new InputStreamReader(new FileInputStream( 106 | file), fileEncoding)); 107 | while ((line = br.readLine()) != null) { 108 | sb.append(line + "\n"); 109 | } 110 | return sb.toString(); 111 | } catch (Exception e) { 112 | e.printStackTrace(); 113 | } finally { 114 | if (br != null) { 115 | try { 116 | br.close(); 117 | } catch (IOException e) { 118 | e.printStackTrace(); 119 | } 120 | } 121 | } 122 | return null; 123 | } 124 | 125 | /** 126 | * 将一个字符串写入到一个文件 127 | * 128 | * @param path 129 | * 储存的文件路径 130 | * @param value 131 | * 储存的文件内容 132 | * @throws IOException 133 | */ 134 | public static synchronized void writeFile(String path, String value) { 135 | File f = new File(path); 136 | FileOutputStream fos = null; 137 | try { 138 | fos = new FileOutputStream(f); 139 | fos.write(value.getBytes()); 140 | fos.close(); 141 | } catch (Exception e) { 142 | e.printStackTrace(); 143 | } finally { 144 | if (fos != null) { 145 | try { 146 | fos.close(); 147 | } catch (IOException e) { 148 | e.printStackTrace(); 149 | } 150 | } 151 | } 152 | 153 | } 154 | 155 | public static void main(String[] args) throws Exception { 156 | // String 
source=readFile("resource/library.dic",null); 157 | // String source = readFile(ReadConfigUtil.getValue("dic.path"), null); 158 | // String source = readDirOrFile("d://temp", "gbk"); 159 | // System.out.println(source); 160 | 161 | String source_string = IOUtil.readFile("d:\\test\\new_words2.txt", 162 | "utf-8"); 163 | StringReader sr = new StringReader(source_string); 164 | BufferedReader br = new BufferedReader(sr); 165 | String temp = null; 166 | StringBuilder sb = new StringBuilder(); 167 | 168 | HashSet hashSet = new HashSet(); 169 | 170 | while ((temp = br.readLine()) != null) { 171 | if (temp.trim().length() > 1 && temp.trim().length() <= 4) { 172 | if (!hashSet.contains(temp)) { 173 | sb.append(temp + "\n"); 174 | } else { 175 | hashSet.add(temp); 176 | } 177 | } 178 | } 179 | IOUtil.writeFile("d:\\test\\new_words3.txt", sb.toString()); 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/RegexPaserUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | /** 10 | * 正则表达式处理工具类,字符串的匹配截取中 11 | * 12 | * @author zel 13 | * 14 | */ 15 | public class RegexPaserUtil { 16 | 17 | private String beginRegex; 18 | 19 | private String endRegex; 20 | 21 | private Matcher matcher; 22 | 23 | public final static String TEXTTEGEX = ".*?"; 24 | 25 | public final static String W = "\\W*?"; 26 | 27 | public final static String N = ""; 28 | 29 | public final static String TEXTEGEXANDNRT = "[\\s\\S]*?"; 30 | public final static String zel_all_chars = "[\\s\\S]*"; 31 | 32 | private List filterRegexList = new ArrayList(); 33 | 34 | public RegexPaserUtil(String beginRegex, String endRegex, String content, 35 | String textRegex) { 36 | 37 | this.beginRegex = beginRegex; 38 | 39 | this.endRegex = endRegex; 40 | 41 | StringBuilder sb = new StringBuilder(); 42 | 43 | sb.append(beginRegex); 44 | 45 | sb.append(textRegex); 46 | 47 | sb.append(endRegex); 48 | matcher = Pattern.compile(sb.toString()).matcher(content); 49 | } 50 | 51 | // 此处的content变量暂未用 52 | public RegexPaserUtil(String beginRegex, String textRegex, String endRegex, 53 | String content, String flag) { 54 | this.beginRegex = beginRegex; 55 | 56 | this.endRegex = endRegex; 57 | 58 | StringBuilder sb = new StringBuilder(); 59 | 60 | sb.append(beginRegex); 61 | 62 | sb.append(textRegex); 63 | 64 | sb.append(endRegex); 65 | // System.out.println("sb--------------" + sb); 66 | matcher = Pattern.compile(sb.toString()).matcher(content); 67 | } 68 | 69 | public RegexPaserUtil(String beginRegex, String endRegex, String textRegex) { 70 | 71 | this.beginRegex = beginRegex; 72 | 73 | this.endRegex = endRegex; 74 | 75 | StringBuilder sb = new StringBuilder(); 76 | 77 | sb.append(beginRegex); 78 | 79 | sb.append(textRegex); 80 | 81 | sb.append(endRegex); 82 | matcher = Pattern.compile(sb.toString()).matcher(N); 83 | } 84 | 85 | public RegexPaserUtil(String beginRegex, String endRegex) { 86 | 87 | this.beginRegex = beginRegex; 88 | 89 | this.endRegex = endRegex; 90 | 91 | StringBuilder sb = new StringBuilder(); 92 | 93 | sb.append(beginRegex); 94 | 95 | sb.append(TEXTTEGEX); 96 | 97 | sb.append(endRegex); 98 | 99 | matcher = Pattern.compile(sb.toString()).matcher(N); 100 | } 101 | 102 | public String 
getSimpleText() { 103 | if (matcher.find()) { 104 | String str = matcher.group().trim(); 105 | return str; 106 | } 107 | return null; 108 | } 109 | 110 | public String getText() { 111 | if (matcher.find()) { 112 | String str = matcher.group().trim().replaceFirst(beginRegex, N) 113 | .replaceAll(endRegex, N); 114 | Iterator it = filterRegexList.iterator(); 115 | while (it.hasNext()) { 116 | str = str.replaceAll(it.next(), N); 117 | } 118 | return str; 119 | } 120 | return null; 121 | } 122 | 123 | public String getLastText() { 124 | String str = null; 125 | while (matcher.find()) { 126 | str = matcher.group().trim().replaceFirst(beginRegex, N) 127 | .replaceAll(endRegex, N); 128 | } 129 | return str; 130 | } 131 | 132 | public String getNext() { 133 | return matcher.group(); 134 | } 135 | 136 | public String getNextTxt() { 137 | String str = matcher.group().trim().replaceFirst(beginRegex, N) 138 | .replaceAll(endRegex, N); 139 | Iterator it = filterRegexList.iterator(); 140 | while (it.hasNext()) { 141 | str = str.replaceAll(it.next(), N); 142 | } 143 | return str; 144 | } 145 | 146 | /** 147 | * 是指过滤了相关标签 148 | * 149 | * @return 150 | */ 151 | public String getNextAddFilter() { 152 | String str = matcher.group(); 153 | Iterator it = filterRegexList.iterator(); 154 | while (it.hasNext()) { 155 | str = str.replaceAll(it.next(), N); 156 | } 157 | return str; 158 | } 159 | 160 | /** 161 | * 循环遍历时,得到真正的txt,而不是匹配全部 162 | * 163 | * @return 164 | */ 165 | public String getNextText() { 166 | String str = matcher.group(); 167 | str = str.replaceFirst(beginRegex, N).replaceAll(endRegex, N); 168 | return str; 169 | } 170 | 171 | public boolean hasNext() { 172 | return matcher.find(); 173 | } 174 | 175 | public RegexPaserUtil reset(String content) { 176 | this.matcher.reset(content); 177 | return this; 178 | } 179 | 180 | public RegexPaserUtil addFilterRegex(String filterRegex) { 181 | filterRegexList.add(filterRegex); 182 | return this; 183 | } 184 | 185 | public String getTextList() { 186 | String str = ""; 187 | int count = 0; 188 | while (matcher.find()) { 189 | if (count == 0) { 190 | str = matcher.group().trim().replaceFirst(beginRegex, N) 191 | .replaceAll(endRegex, N); 192 | } else { 193 | str += ("#" + matcher.group().trim() 194 | .replaceFirst(beginRegex, N).replaceAll(endRegex, N)); 195 | } 196 | count++; 197 | } 198 | return str; 199 | } 200 | 201 | public static void main(String[] args) { 202 | String beginRegex = ""; 203 | String endRegex = ""; 204 | String text = "
    1//@23 4"; 205 | RegexPaserUtil ansjSayUrl = new RegexPaserUtil(beginRegex, endRegex, 206 | text, RegexPaserUtil.TEXTEGEXANDNRT); 207 | 208 | System.out.println(ansjSayUrl.getText()); 209 | 210 | } 211 | 212 | } 213 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/importfrom/ImportExcelUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.importfrom; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.InputStream; 6 | import java.text.DecimalFormat; 7 | import java.text.SimpleDateFormat; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.Set; 14 | import org.apache.poi.hssf.usermodel.HSSFWorkbook; 15 | import org.apache.poi.ss.usermodel.Cell; 16 | import org.apache.poi.ss.usermodel.Row; 17 | import org.apache.poi.ss.usermodel.Sheet; 18 | import org.apache.poi.ss.usermodel.Workbook; 19 | import org.apache.poi.xssf.usermodel.XSSFWorkbook; 20 | 21 | ; 22 | 23 | /** 24 | * Excel文件流 --> List >对象 25 | * 想直接转成java bean的朋友可以使用fastjson将List>转成bean对象 26 | * 27 | */ 28 | public class ImportExcelUtil { 29 | private final static String excel2003L = ".xls"; // 2003- 版本的excel 30 | private final static String excel2007U = ".xlsx"; // 2007+ 版本的excel 31 | /** 32 | * 将流中的Excel数据转成List 33 | * @param in 输入流 34 | * @param fileName 文件名(判断Excel版本) 35 | * @param mapping 字段名称映射 36 | * @return 37 | * @throws Exception 38 | */ 39 | public static List> parseExcel(InputStream in, String fileName, Map mapping) throws Exception { 40 | // 根据文件名来创建Excel工作薄 41 | Workbook work = getWorkbook(in, fileName); 42 | if (null == work) { 43 | throw new Exception("创建Excel工作薄为空!"); 44 | } 45 | Sheet sheet = null; 46 | Row row = null; 47 | Cell cell = null; 48 | // 返回数据 49 | List> ls = new ArrayList>(); 50 | 51 | // 遍历Excel中所有的sheet 52 | for (int i = 0; i < work.getNumberOfSheets(); i++) { 53 | sheet = work.getSheetAt(i); 54 | if (sheet == null){ 55 | continue; 56 | } 57 | // 取第一行标题 58 | row = sheet.getRow(0); 59 | String title[] = null; 60 | if (row != null) { 61 | title = new String[row.getLastCellNum()]; 62 | for (int y = row.getFirstCellNum(); y < row.getLastCellNum(); y++) { 63 | cell = row.getCell(y); 64 | title[y] = (String) getCellValue(cell); 65 | } 66 | } else{ 67 | continue; 68 | } 69 | // 遍历当前sheet中的所有行 70 | for (int j = 1; j < sheet.getLastRowNum() + 1; j++) { 71 | row = sheet.getRow(j); 72 | Map m = new HashMap(); 73 | // 遍历所有的列 74 | for (int y = row.getFirstCellNum(); y < row.getLastCellNum(); y++) { 75 | cell = row.getCell(y); 76 | String key = title[y]; 77 | m.put(mapping.get(key), getCellValue(cell)); 78 | } 79 | ls.add(m); 80 | } 81 | } 82 | work.close(); 83 | return ls; 84 | } 85 | 86 | /** 87 | * 描述:根据文件后缀,自适应上传文件的版本 88 | * @param inStr,fileName 89 | * @return 90 | * @throws Exception 91 | */ 92 | public static Workbook getWorkbook(InputStream inStr, String fileName) throws Exception { 93 | Workbook wb = null; 94 | String fileType = fileName.substring(fileName.lastIndexOf(".")); 95 | if (excel2003L.equals(fileType)) { 96 | wb = new HSSFWorkbook(inStr); // 2003- 97 | } else if (excel2007U.equals(fileType)) { 98 | wb = new XSSFWorkbook(inStr); // 2007+ 99 | } else { 100 | throw new Exception("解析的文件格式有误!"); 101 | } 102 | return wb; 103 | } 104 | 105 | /** 106 | * 描述:对表格中数值进行格式化 107 | * 108 | * @param cell 109 | * @return 
110 | */ 111 | public static Object getCellValue(Cell cell) { 112 | Object value = null; 113 | // 格式化number String字符 114 | DecimalFormat df = new DecimalFormat("0"); 115 | // 日期格式化 116 | SimpleDateFormat sdf = new SimpleDateFormat("yyy-MM-dd"); 117 | // 格式化数字 118 | DecimalFormat df2 = new DecimalFormat("0"); 119 | switch (cell.getCellType()) { 120 | case Cell.CELL_TYPE_STRING: 121 | value = cell.getRichStringCellValue().getString(); 122 | break; 123 | case Cell.CELL_TYPE_NUMERIC: 124 | if ("General".equals(cell.getCellStyle().getDataFormatString())) { 125 | value = df.format(cell.getNumericCellValue()); 126 | } else if ("m/d/yy".equals(cell.getCellStyle().getDataFormatString())) { 127 | value = sdf.format(cell.getDateCellValue()); 128 | } else { 129 | value = df2.format(cell.getNumericCellValue()); 130 | } 131 | break; 132 | case Cell.CELL_TYPE_BOOLEAN: 133 | value = cell.getBooleanCellValue(); 134 | break; 135 | case Cell.CELL_TYPE_BLANK: 136 | value = ""; 137 | break; 138 | default: 139 | break; 140 | } 141 | return value; 142 | } 143 | 144 | /** 主方法测试*/ 145 | public static void main(String[] args) throws Exception { 146 | File file = new File("D:\\词条目录卫青.xlsx"); 147 | FileInputStream fis = new FileInputStream(file); 148 | Map m = new HashMap(); 149 | m.put("药品名称", "name"); 150 | m.put("序号", "id"); 151 | List> ls = parseExcel(fis, file.getName(), m); 152 | Set resultSet = new HashSet<>(); 153 | for (int i = 0; i headerInfosMap = new HashMap<>(16); 26 | headerInfosMap.put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"); 27 | //模拟登陆必须内容 28 | headerInfosMap.put("Cookie", "acw_tc=276aedd215431325834491762e1d45b45d7060c7dec9c77742b0f52b0eeb32; _csrf=91d7e25d637d436027e7819c6d5e9f7fb1efc29328e1b286a12c71c562b3c9ffa%3A2%3A%7Bi%3A0%3Bs%3A5%3A%22_csrf%22%3Bi%3A1%3Bs%3A32%3A%22%87y%F2%9EN%D2%C1%F0%B6%DD%00%2Bi_%ABS%87C%18%AB%0A%EE%D1_%E1%D4e%09%E9%DB%97%A3%22%3B%7D; looyu_id=2ca1e50b89c720f6c644646887121255_20000923%3A1; 51ctologToken=8eee7adfa2139ad0eb3881e882b9350a; _ourplusFirstTime=118-11-25-15-56-25; _ga=GA1.2.1084733366.1543132586; _gid=GA1.2.1681094532.1543132586; _t99_chat=1; www51cto=70F4649BA2D0E3E01B91BF4FB0771B95EGZG; pub_cookietime=0; bdshare_firstime=1543132639813; 13240332=2018/11/25; EDUACCOUNT=59bc35e120ecde375071702d5a847a51a5e15544efa309f8d5b1da3e62ce7e56a%3A2%3A%7Bi%3A0%3Bs%3A10%3A%22EDUACCOUNT%22%3Bi%3A1%3Bs%3A32%3A%2295cd024928664c2dbef35fb538aeca8c%22%3B%7D; _ourplusReturnCount=5; _ourplusReturnTime=118-11-25-16-1-48; __utma=1.1084733366.1543132586.1543132909.1543132909.1; __utmc=1; __utmz=1.1543132909.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lvt_f77ea1ecd95cb2a1bc65cbcb3aaba7d4=1543132909; Hm_lpvt_f77ea1ecd95cb2a1bc65cbcb3aaba7d4=1543132909; playTime202687=1337.723899; playTime202689=94; playDEF=hd; 
_51ctologStr=data%3D%257Bvisitorid%3A%25228eee7adfa2139ad0eb3881e882b9350a%2522%2CuserAagent%3A%2522Mozilla/5.0%2520%28Macintosh%3B%2520Intel%2520Mac%2520OS%2520X%252010_13_2%29%2520AppleWebKit/537.36%2520%28KHTML%2520%2520like%2520Gecko%29%2520Chrome/70.0.3538.102%2520Safari/537.36%2522%2Ctoken%3A%25228eee7adfa2139ad0eb3881e882b9350a%2522%2Cuid%3A%2522%2522%2Cuuid%3A%252254f10ca8-1a3c-c5dd-t903-o22928bc36ce%2522%2Ctype%3A%2522close%2522%2Cdom%3A%2522%2522%2CdomId%3A%2522%2522%2CdomInnerTxt%3A%2522%2522%2Cprice%3A%2522%2522%2Cstudents_count%3A%2522%2522%2Cfavourite%3A%2522%2522%2Cvote%3A%2522%2522%2Cscrolling%3A%25220%2525%2522%2Cscreensize%3A%25221280X800%2522%2Curl%3A%2522http%25253A%25252F%25252Fhome.51cto.com%25252Findex%25252F%25253Freback%25253Dhttp%2525253A%2525252F%2525252Fedu.51cto.com%2525252Fcenter%2525252Fuser%2525252Findex%2525252Flogin-success%2525253Fsign%2525253D1e4dAVQIA1FRBAEJAQFUVFVUWgEEC1tRUwECVgpYEE0UXxweAVxGT1QFUk1eS1ZZWUsABw9MBhRMAFgRQEMBFggAQEILVhwID1BUQQ4MUQsGVFFUVwBSA1cH%25252526client%2525253Dweb%2522%2Cref%3A%2522http%25253A%25252F%25252Fhome.51cto.com%25252Findex%25252F%25253Freback%25253Dhttp%2525253A%2525252F%2525252Fedu.51cto.com%2525252Fcenter%2525252Fuser%2525252Findex%2525252Flogin-success%2525253Fsign%2525253D1e4dAVQIA1FRBAEJAQFUVFVUWgEEC1tRUwECVgpYEE0UXxweAVxGT1QFUk1eS1ZZWUsABw9MBhRMAFgRQEMBFggAQEILVhwID1BUQQ4MUQsGVFFUVwBSA1cH%25252526client%2525253Dweb%2522%2Cfrom%3A%2522home%2522%2Cduration%3A%252221180%2522%2Ctime%3A%25221543139532503%2522%257D; pub_sauth1=tKGEuF1dD1Q6BQEDVwlVBwY6BVFSAQQCW1YEVg; pub_sauth2=0c84c9bfca688ef4b37d29441f5474ef; PHPSESSID=22c8d16uam5ugjdtjnc4or0pm7; logserveruid=13240332; Hm_lvt_8c8abdb71d78d33dfdb885e0bc71dae0=1543132627,1543139006,1543139032,1543139535; Cto_lvt_=1543132627,1543139006,1543139032,1543139535; _gat_gtag_UA_118863081_1=1; _51ctologStr=data%3D%257Bvisitorid%3A%25228eee7adfa2139ad0eb3881e882b9350a%2522%2CuserAagent%3A%2522Mozilla/5.0%2520%28Macintosh%3B%2520Intel%2520Mac%2520OS%2520X%252010_13_2%29%2520AppleWebKit/537.36%2520%28KHTML%2520%2520like%2520Gecko%29%2520Chrome/70.0.3538.102%2520Safari/537.36%2522%2Ctoken%3A%25228eee7adfa2139ad0eb3881e882b9350a%2522%2Cuid%3A%252213240332%2522%2Cuuid%3A%252258e302b0-1b4c-c155-t443-o6e8309fd1b4%2522%2Ctype%3A%2522close%2522%2Cdom%3A%2522%2522%2CdomId%3A%2522%2522%2CdomInnerTxt%3A%2522%2522%2Cprice%3A%2522%2522%2Cstudents_count%3A%2522%2522%2Cfavourite%3A%2522%2522%2Cvote%3A%2522%2522%2Cscrolling%3A%25220%2525%2522%2Cscreensize%3A%25221280X800%2522%2Curl%3A%2522http%25253A%25252F%25252Fedu.51cto.com%25252Fcenter%25252Fcourse%25252Flesson%25252Findex%25253Fid%25253D202668%2522%2Cref%3A%2522http%25253A%25252F%25252Fhome.51cto.com%25252Findex%25252F%25253Freback%25253Dhttp%2525253A%2525252F%2525252Fedu.51cto.com%2525252Fcenter%2525252Fuser%2525252Findex%2525252Flogin-success%2525253Fsign%2525253D1e4dAVQIA1FRBAEJAQFUVFVUWgEEC1tRUwECVgpYEE0UXxweAVxGT1QFUk1eS1ZZWUsABw9MBhRMAFgRQEMBFggAQEILVhwID1BUQQ4MUQsGVFFUVwBSA1cH%25252526client%2525253Dweb%2522%2Cfrom%3A%2522edu%2522%2Cduration%3A%25222499%2522%2Ctime%3A%25221543139537068%2522%257D; playTime202668=58; Hm_lpvt_8c8abdb71d78d33dfdb885e0bc71dae0=1543139538; Cto_lpvt_=1543139538; playTime202662=536.436775"); 29 | headerInfosMap.put("Accept", "*/*"); 30 | headerInfosMap.put("Accept-Encoding", "gzip, deflate"); 31 | headerInfosMap.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); 32 | headerInfosMap.put("Cache-Control", "no-cache"); 33 | headerInfosMap.put("Connection", "keep-alive"); 34 | headerInfosMap.put("Host", 
"edu.51cto.com"); 35 | headerInfosMap.put("Pragma", "no-cache"); 36 | headerInfosMap.put("Referer", "http://edu.51cto.com/center/course/lesson/index?id=202687"); 37 | 38 | 39 | //工厂取得M3u8SpiderRule实例 40 | ISpiderRule spiderRule = new SpiderRuleFactory(new DefaultM3u8SpiderRule()).getInstance(); 41 | 42 | //封装参数 43 | Param param = new Param(); 44 | param.setWebUrl(httpUrl); 45 | param.setFileFullName(fileName); 46 | param.setHeaderInfos(headerInfosMap); 47 | 48 | //走你 49 | spiderRule.runSpider(param,spiderRule); 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/packageutil/PackageGetJavaUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.packageutil; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.InputStream; 6 | import java.nio.file.Files; 7 | import java.text.DecimalFormat; 8 | import java.text.SimpleDateFormat; 9 | import java.util.ArrayList; 10 | import java.util.HashMap; 11 | import java.util.HashSet; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.Set; 15 | import org.apache.poi.hssf.usermodel.HSSFWorkbook; 16 | import org.apache.poi.ss.usermodel.Cell; 17 | import org.apache.poi.ss.usermodel.Row; 18 | import org.apache.poi.ss.usermodel.Sheet; 19 | import org.apache.poi.ss.usermodel.Workbook; 20 | import org.apache.poi.xssf.usermodel.XSSFWorkbook; 21 | 22 | /** 23 | * @author lyy 24 | * @since 2019-09-18 13:01 25 | * 读取excel里面的文件到指定目录 26 | */ 27 | public class PackageGetJavaUtil { 28 | private static final String SRC_PATH = "/Users/liuyangyang/workspace/xyzj/xyzj-crawler"; 29 | private static final String TARGET_PATH = "/Users/liuyangyang/Downloads"; 30 | private final static String EXCEL_2003L = ".xls"; // 2003- 版本的excel 31 | private final static String EXCEL_2007U = ".xlsx"; // 2007+ 版本的excel 32 | 33 | /** 34 | * 将流中的Excel数据转成List 35 | * @param in 输入流 36 | * @param fileName 文件名(判断Excel版本) 37 | * @param mapping 字段名称映射 38 | * @return 39 | * @throws Exception 40 | */ 41 | public static List> parseExcel(InputStream in, String fileName, Map mapping) throws Exception { 42 | // 根据文件名来创建Excel工作薄 43 | Workbook work = getWorkbook(in, fileName); 44 | if (null == work) { 45 | throw new Exception("创建Excel工作薄为空!"); 46 | } 47 | Sheet sheet = null; 48 | Row row = null; 49 | Cell cell = null; 50 | // 返回数据 51 | List> ls = new ArrayList>(); 52 | 53 | // 遍历Excel中所有的sheet 54 | for (int i = 0; i < work.getNumberOfSheets(); i++) { 55 | sheet = work.getSheetAt(i); 56 | if (sheet == null){ 57 | continue; 58 | } 59 | // 取第一行标题 60 | row = sheet.getRow(0); 61 | String title[] = null; 62 | if (row != null) { 63 | title = new String[row.getLastCellNum()]; 64 | for (int y = row.getFirstCellNum(); y < row.getLastCellNum(); y++) { 65 | cell = row.getCell(y); 66 | title[y] = (String) getCellValue(cell); 67 | } 68 | } else{ 69 | continue; 70 | } 71 | // 遍历当前sheet中的所有行 72 | for (int j = 1; j < sheet.getLastRowNum() + 1; j++) { 73 | row = sheet.getRow(j); 74 | Map m = new HashMap(); 75 | // 遍历所有的列 76 | for (int y = row.getFirstCellNum(); y < row.getLastCellNum(); y++) { 77 | cell = row.getCell(y); 78 | String key = title[y]; 79 | m.put(mapping.get(key), getCellValue(cell)); 80 | } 81 | ls.add(m); 82 | } 83 | } 84 | work.close(); 85 | return ls; 86 | } 87 | 88 | //复制方法 89 | public static void copy(String filePath, String srcPath, String targetPath) throws Exception { 90 | //初始化文件复制 91 | File 
srcFile=new File(srcPath + filePath); 92 | 93 | //初始化文件目标 94 | File targetFile=new File(targetPath+filePath); 95 | if(!targetFile.getParentFile().exists()){ 96 | targetFile.getParentFile().mkdirs(); 97 | } 98 | //调用文件拷贝的方法 99 | targetFile.delete(); 100 | Files.copy(srcFile.toPath(), targetFile.toPath()); 101 | } 102 | 103 | 104 | /** 105 | * 描述:根据文件后缀,自适应上传文件的版本 106 | * @param inStr,fileName 107 | * @return 108 | * @throws Exception 109 | */ 110 | public static Workbook getWorkbook(InputStream inStr, String fileName) throws Exception { 111 | Workbook wb = null; 112 | String fileType = fileName.substring(fileName.lastIndexOf(".")); 113 | if (EXCEL_2003L.equals(fileType)) { 114 | wb = new HSSFWorkbook(inStr); // 2003- 115 | } else if (EXCEL_2007U.equals(fileType)) { 116 | wb = new XSSFWorkbook(inStr); // 2007+ 117 | } else { 118 | throw new Exception("解析的文件格式有误!"); 119 | } 120 | return wb; 121 | } 122 | 123 | /** 124 | * 描述:对表格中数值进行格式化 125 | * 126 | * @param cell 127 | * @return 128 | */ 129 | public static Object getCellValue(Cell cell) { 130 | Object value = null; 131 | // 格式化number String字符 132 | DecimalFormat df = new DecimalFormat("0"); 133 | // 日期格式化 134 | SimpleDateFormat sdf = new SimpleDateFormat("yyy-MM-dd"); 135 | // 格式化数字 136 | DecimalFormat df2 = new DecimalFormat("0"); 137 | switch (cell.getCellType()) { 138 | case Cell.CELL_TYPE_STRING: 139 | value = cell.getRichStringCellValue().getString(); 140 | break; 141 | case Cell.CELL_TYPE_NUMERIC: 142 | if ("General".equals(cell.getCellStyle().getDataFormatString())) { 143 | value = df.format(cell.getNumericCellValue()); 144 | } else if ("m/d/yy".equals(cell.getCellStyle().getDataFormatString())) { 145 | value = sdf.format(cell.getDateCellValue()); 146 | } else { 147 | value = df2.format(cell.getNumericCellValue()); 148 | } 149 | break; 150 | case Cell.CELL_TYPE_BOOLEAN: 151 | value = cell.getBooleanCellValue(); 152 | break; 153 | case Cell.CELL_TYPE_BLANK: 154 | value = ""; 155 | break; 156 | default: 157 | break; 158 | } 159 | return value; 160 | } 161 | 162 | 163 | public static void main(String[] args) throws Exception { 164 | //取得类路径 165 | File file = new File("/Users/liuyangyang/Downloads/工作簿1.xlsx"); 166 | FileInputStream fis = new FileInputStream(file); 167 | Map m = new HashMap(); 168 | m.put("路径", "path"); 169 | List> ls = parseExcel(fis, file.getName(), m); 170 | Set resultSet = new HashSet<>(); 171 | for (int i = 0; i 3 | 4.0.0 4 | com.xyzj.crawler 5 | xyzj-crawler 6 | 0.0.1-SNAPSHOT 7 | crawler 8 | 9 | 10 | 1.1.2 11 | 0.1.4 12 | 1.7.20 13 | 1.7.12 14 | 1.7.21 15 | 1.2 16 | 4.3.8.RELEASE 17 | 5.1.38 18 | 3.14 19 | 20 | 21 | 3.6.1 22 | 3.0.1 23 | 2.10.4 24 | 25 | 1.8 26 | UTF-8 27 | 28 | 29 | 30 | 31 | commons-httpclient 32 | commons-httpclient 33 | 3.1 34 | 35 | 36 | 37 | com.alibaba 38 | fastjson 39 | 1.2.31 40 | 41 | 42 | org.jsoup 43 | jsoup 44 | 1.7.2 45 | 46 | 47 | org.apache.httpcomponents 48 | httpclient 49 | 4.5.3 50 | 51 | 52 | org.apache.httpcomponents 53 | httpmime 54 | 4.5.3 55 | 56 | 57 | 58 | ch.qos.logback 59 | logback-classic 60 | ${logback.version} 61 | 62 | 63 | ch.qos.logback 64 | logback-access 65 | ${logback.version} 66 | 67 | 68 | org.logback-extensions 69 | logback-ext-spring 70 | ${logback-ext-spring.version} 71 | 72 | 73 | org.slf4j 74 | log4j-over-slf4j 75 | ${log4j-over-slf4j.version} 76 | 77 | 78 | org.slf4j 79 | jcl-over-slf4j 80 | ${jcl-over-slf4j.version} 81 | 82 | 83 | org.springframework 84 | spring-jdbc 85 | ${spring.version} 86 | 87 | 88 | mysql 89 | mysql-connector-java 90 | 
${mysql.version} 91 | 92 | 93 | log4j 94 | log4j 95 | 1.2.16 96 | 97 | 98 | org.springframework 99 | spring-aop 100 | ${spring.version} 101 | 102 | 103 | commons-lang 104 | commons-lang 105 | 2.6 106 | 107 | 108 | commons-io 109 | commons-io 110 | 2.6 111 | 112 | 113 | junit 114 | junit 115 | 4.8.2 116 | test 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | us.codecraft 140 | webmagic-core 141 | 0.7.3 142 | 143 | 144 | us.codecraft 145 | webmagic-extension 146 | 0.7.3 147 | 148 | 149 | org.slf4j 150 | slf4j-log4j12 151 | 152 | 153 | 154 | 155 | 156 | net.java.dev.jna 157 | jna 158 | 4.2.1 159 | 160 | 161 | 162 | redis.clients 163 | jedis 164 | 2.9.0 165 | 166 | 167 | 168 | net.sourceforge.tess4j 169 | tess4j 170 | 2.0.1 171 | 172 | 173 | com.sun.jna 174 | jna 175 | 176 | 177 | 178 | 179 | 180 | 181 | cn.edu.hfut.dmic.webcollector 182 | WebCollector 183 | 2.71 184 | 185 | 186 | 187 | net.sourceforge.htmlunit 188 | htmlunit 189 | 2.28 190 | 191 | 192 | 193 | org.projectlombok 194 | lombok 195 | 1.16.10 196 | 197 | 198 | com.alibaba 199 | easyexcel 200 | 2.0.5 201 | 202 | 203 | 204 | com.oracle 205 | ojdbc6 206 | 12.1.0.1-atlassian-hosted 207 | 208 | 209 | 210 | 211 | 212 | 213 | ${project.name} 214 | 215 | 216 | org.apache.maven.plugins 217 | maven-compiler-plugin 218 | ${compiler.version} 219 | 220 | ${jdk.version} 221 | ${jdk.version} 222 | ${project.build.sourceEncoding} 223 | 224 | 225 | 226 | org.springframework.boot 227 | spring-boot-maven-plugin 228 | 229 | com.xyzj.bigdata.in.ReadTest 230 | 231 | 232 | 233 | 234 | repackage 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/gethtmlstring/HttpResponseUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.gethtmlstring; 2 | 3 | 4 | import com.alibaba.fastjson.JSONObject; 5 | import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; 6 | import com.gargoylesoftware.htmlunit.ProxyConfig; 7 | import com.gargoylesoftware.htmlunit.WebClient; 8 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 9 | import com.xyzj.crawler.framework.entity.Goods; 10 | import com.xyzj.crawler.framework.entity.Param; 11 | import com.xyzj.crawler.framework.enums.FactionEnum; 12 | import com.xyzj.crawler.utils.proxyip.IPModel.IPMessage; 13 | import com.xyzj.crawler.utils.proxyip.config.RedisUtil; 14 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql; 15 | import lombok.extern.slf4j.Slf4j; 16 | import org.apache.http.HttpHost; 17 | import org.apache.http.HttpResponse; 18 | import org.apache.http.client.config.RequestConfig; 19 | import org.apache.http.client.methods.CloseableHttpResponse; 20 | import org.apache.http.client.methods.HttpGet; 21 | import org.apache.http.client.methods.HttpPost; 22 | import org.apache.http.entity.StringEntity; 23 | import org.apache.http.impl.client.CloseableHttpClient; 24 | import org.apache.http.impl.client.HttpClients; 25 | import org.apache.http.util.EntityUtils; 26 | import org.springframework.util.CollectionUtils; 27 | 28 | /** 29 | * 30 | */ 31 | 32 | @Slf4j 33 | public class HttpResponseUtil { 34 | 35 | public static String getHtmlSource(Param param) { 36 | //获取html源文件 37 | param.getHeaderInfos().put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 
Safari/537.36"); 38 | String htmlSource = ""; 39 | FactionEnum factionEnum = param.getFactionEnum(); 40 | switch (factionEnum) { 41 | case getHtml: 42 | log.info("走 getHtml"); 43 | htmlSource = HttpResponseUtil.getHtml(param); 44 | break; 45 | case getHtmlWithJavaScript: 46 | log.info("走 getHtmlWithJavaScript"); 47 | htmlSource = HttpResponseUtil.getHtmlWithJavaScript(param); 48 | break; 49 | case getJson: 50 | log.info("走 getJson"); 51 | htmlSource = HttpResponseUtil.getJson(param); 52 | break; 53 | } 54 | if (org.springframework.util.StringUtils.isEmpty(htmlSource) || htmlSource.contains("Not Found") || htmlSource.contains("无法访问此网站") || htmlSource.contains("你所访问的页面就如那些遇害的同道") || htmlSource.contains("药品不存在!")) { 55 | log.info("本次爬取目标失败 webUrl={}", param.getWebUrl()); 56 | //没拿到数据 存入ungoods表 57 | Goods unableGoods = new Goods(); 58 | unableGoods.setWebUrl(param.getWebUrl()); 59 | SaveToMysql saveToMysql = new SaveToMysql(); 60 | saveToMysql.saveToMasql("ungoods", unableGoods); 61 | return null; 62 | } 63 | log.info("本次爬取目标 webUrl={}", param.getWebUrl()); 64 | log.info(htmlSource); 65 | return htmlSource; 66 | } 67 | 68 | /** 69 | * ======================================== 70 | * 71 | * @description: 取得网页html信息 72 | * @author: lyy 73 | * @param: 74 | * @return: 75 | * @exception: 76 | * @create: 2019/6/28 11:54 77 | *
    78 | * ======================================== 79 | */ 80 | public static String getHtml(Param param) { 81 | String entity = null; 82 | CloseableHttpClient httpClient = HttpClients.createDefault(); 83 | //设置代理 84 | RequestConfig config = null; 85 | if (param.getIsProxy()) { 86 | IPMessage ipMessage = RedisUtil.getOneIp(); 87 | config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(new HttpHost(ipMessage.getIp(), Integer.parseInt(ipMessage.getPort()))).build(); 88 | } else { 89 | config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).build(); 90 | } 91 | HttpGet httpGet = new HttpGet(param.getWebUrl()); 92 | httpGet.setConfig(config); 93 | // 遍历map 设置请求头信息 94 | if (!CollectionUtils.isEmpty(param.getHeaderInfos())) { 95 | for (String key : param.getHeaderInfos().keySet()) { 96 | httpGet.setHeader(key, param.getHeaderInfos().get(key)); 97 | } 98 | } 99 | try { 100 | //客户端执行httpGet方法,返回响应 101 | CloseableHttpResponse httpResponse = httpClient.execute(httpGet); 102 | //得到服务响应状态码 103 | if (httpResponse.getStatusLine().getStatusCode() == 200) { 104 | entity = EntityUtils.toString(httpResponse.getEntity(), param.getCharset()); 105 | } 106 | httpResponse.close(); 107 | httpClient.close(); 108 | } catch (Exception e) { 109 | log.error("getHtml exception:{}", e); 110 | } 111 | return entity; 112 | } 113 | 114 | /** 115 | * ======================================== 116 | * 117 | * @description: 取得执行javascript后的页面信息 118 | * @author: lyy 119 | * @param: 120 | * @return: 121 | * @exception: 122 | * @create: 2019/6/28 11:45 123 | *
    124 | * ======================================== 125 | */ 126 | public static String getHtmlWithJavaScript(Param param) { 127 | try { 128 | //HtmlUnit请求web页面 129 | WebClient wc = new WebClient(); 130 | //启用JS解释器,默认为true 131 | wc.getOptions().setJavaScriptEnabled(true); 132 | //js运行错误时,是否抛出异常 133 | wc.getOptions().setThrowExceptionOnScriptError(false); 134 | //禁用css支持 135 | wc.getOptions().setActiveXNative(false); 136 | wc.getOptions().setCssEnabled(false); 137 | //设置支持AJAX 138 | wc.setAjaxController(new NicelyResynchronizingAjaxController()); 139 | if (param.getIsProxy()) { 140 | IPMessage ipMessage = RedisUtil.getOneIp(); 141 | wc.getOptions().setProxyConfig(new ProxyConfig(ipMessage.getIp(), Integer.parseInt(ipMessage.getPort()))); 142 | } 143 | if (param.getDelayTime() != null) { 144 | Thread.sleep(param.getDelayTime()); 145 | } 146 | HtmlPage page = wc.getPage(param.getWebUrl()); 147 | //以xml的形式获取响应文本 148 | return page.asXml(); 149 | } catch (Exception e) { 150 | //异常 151 | log.info("没有抓到数据......"); 152 | } 153 | return null; 154 | } 155 | 156 | /** 157 | * ======================================== 158 | * 159 | * @description: 取得json数据 160 | * @author: lyy 161 | * @param: 162 | * @return: 163 | * @exception: 164 | * @create: 2019/6/28 11:46 165 | *
    166 | * ======================================== 167 | */ 168 | public static String getJson(Param param) { 169 | CloseableHttpClient httpClient = HttpClients.createDefault(); 170 | //设置代理 171 | RequestConfig config = null; 172 | if (param.getIsProxy()) { 173 | IPMessage ipMessage = RedisUtil.getOneIp(); 174 | config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(new HttpHost(ipMessage.getIp(), Integer.parseInt(ipMessage.getPort()))).build(); 175 | } else { 176 | config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).build(); 177 | } 178 | HttpPost httpPost = new HttpPost(param.getWebUrl()); 179 | httpPost.setConfig(config); 180 | // 遍历map 设置请求头信息 181 | if (!CollectionUtils.isEmpty(param.getHeaderInfos())) { 182 | for (String key : param.getHeaderInfos().keySet()) { 183 | httpPost.setHeader(key, param.getHeaderInfos().get(key)); 184 | } 185 | } 186 | //遍历BodyParams 187 | if (!CollectionUtils.isEmpty(param.getBodyParams())) { 188 | JSONObject jsonParam = new JSONObject(); 189 | for (String key : param.getBodyParams().keySet()) { 190 | jsonParam.put(key, param.getBodyParams().get(key)); 191 | } 192 | //解决中文乱码问题 193 | StringEntity entity = new StringEntity(jsonParam.toString(), param.getCharset()); 194 | entity.setContentEncoding("UTF-8"); 195 | entity.setContentType("application/json"); 196 | httpPost.setEntity(entity); 197 | } 198 | String httpResponseString = ""; 199 | try { 200 | //客户端执行httpPost方法,返回响应 201 | HttpResponse httpResponse = httpClient.execute(httpPost); 202 | //得到服务响应状态码 203 | if (httpResponse.getStatusLine().getStatusCode() == 200) { 204 | httpResponseString = EntityUtils.toString(httpResponse.getEntity(), param.getCharset()); 205 | } 206 | httpClient.close(); 207 | } catch (Exception e) { 208 | log.error("Exception:{}", e); 209 | } 210 | return httpResponseString; 211 | } 212 | 213 | } 214 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/JsoupHtmlParser.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.safety.Whitelist; 7 | import org.jsoup.select.Elements; 8 | 9 | import java.util.Iterator; 10 | import java.util.LinkedList; 11 | import java.util.List; 12 | 13 | /** 14 | * 对Jsoup的再次封装,更加简明扼要 15 | * 16 | * @author zel 17 | * 18 | */ 19 | public class JsoupHtmlParser { 20 | /** 21 | * 得到指定文档的纯文档 22 | * 23 | * @param htmlSource 24 | * @return 25 | */ 26 | public static String getCleanTxt(String htmlSource) { 27 | if (htmlSource == null || htmlSource.isEmpty()) { 28 | return StaticValue.NULL; 29 | } 30 | return Jsoup.clean(htmlSource, Whitelist.none()); 31 | } 32 | 33 | /** 34 | * 沒有DataFormatStatus參數的情況 35 | * 36 | * @param htmlSource 37 | * @param tagName 38 | * @return 39 | */ 40 | public static String getTagContent(String htmlSource, String tagName) { 41 | return getTagContent(htmlSource, tagName, DataFormatStatus.CleanTxt); 42 | } 43 | 44 | /** 45 | * 得到指定tag标签的的内容,包括纯文本和标签全部内容两种格式 46 | * 47 | * @param htmlSource 48 | * @param tagName 49 | * @param dataFormatStatus 50 | * @return 51 | */ 52 | public static String getTagContent(String htmlSource, String tagName, 53 | DataFormatStatus dataFormatStatus) { 54 | if (htmlSource == null || htmlSource.isEmpty()) { 55 | return StaticValue.NULL; 56 | } 57 | 
-------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/JsoupHtmlParser.java: --------------------------------------------------------------------------------
1 | package com.xyzj.crawler.utils.parsehtmlstring;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.jsoup.nodes.Element;
6 | import org.jsoup.safety.Whitelist;
7 | import org.jsoup.select.Elements;
8 |
9 | import java.util.Iterator;
10 | import java.util.LinkedList;
11 | import java.util.List;
12 |
13 | /**
14 | * A thin wrapper around Jsoup with a terser, more focused API
15 | *
16 | * @author zel
17 | *
18 | */
19 | public class JsoupHtmlParser {
20 | /**
21 | * Strip all markup and return the plain text of the given document
22 | *
23 | * @param htmlSource
24 | * @return
25 | */
26 | public static String getCleanTxt(String htmlSource) {
27 | if (htmlSource == null || htmlSource.isEmpty()) {
28 | return StaticValue.NULL;
29 | }
30 | return Jsoup.clean(htmlSource, Whitelist.none());
31 | }
32 |
33 | /**
34 | * Overload without the DataFormatStatus parameter (defaults to CleanTxt)
35 | *
36 | * @param htmlSource
37 | * @param tagName
38 | * @return
39 | */
40 | public static String getTagContent(String htmlSource, String tagName) {
41 | return getTagContent(htmlSource, tagName, DataFormatStatus.CleanTxt);
42 | }
43 |
44 | /**
45 | * Get the content of the given tag, either as plain text or as the tag's full markup
46 | *
47 | * @param htmlSource
48 | * @param tagName
49 | * @param dataFormatStatus
50 | * @return
51 | */
52 | public static String getTagContent(String htmlSource, String tagName, DataFormatStatus dataFormatStatus) {
53 | if (htmlSource == null || htmlSource.isEmpty()) {
54 | return StaticValue.NULL;
55 | }
56 | SystemAssert.assertNotNull(dataFormatStatus);
57 |
58 | StringBuilder sb = new StringBuilder();
59 | Document doc = Jsoup.parse(htmlSource);
60 | Elements elements = doc.getElementsByTag(tagName);
61 |
62 | Iterator<Element> iterator = elements.iterator();
63 | Element element = null;
64 |
65 | if (dataFormatStatus == DataFormatStatus.CleanTxt) {
66 | while (iterator.hasNext()) {
67 | element = iterator.next();
68 | sb.append(getCleanTxt(element.toString()));
69 | sb.append(StaticValue.separator_next_line);
70 | }
71 | } else {
72 | while (iterator.hasNext()) {
73 | element = iterator.next();
74 | sb.append(element.toString());
75 | sb.append(StaticValue.separator_next_line);
76 | }
77 | }
78 |
79 | return sb.toString();
80 | }
81 |
82 | /**
83 | * Overload of getNestTagContent without a DataFormatStatus parameter
84 | *
85 | * @param htmlSource
86 | * @param tagList
87 | * @return
88 | */
89 | public static List<String> getNestTagContent(String htmlSource, List<String> tagList, boolean isFilter) {
90 | return getNestTagContent(htmlSource, tagList, DataFormatStatus.CleanTxt, isFilter);
91 | }
92 |
93 | /**
94 | * Walk a list of nested tags and return the innermost matches, as plain text or full markup
95 | *
96 | * @param htmlSource
97 | * @param tagList
98 | * @param dataFormatStatus
99 | * @return
100 | */
101 | public static List<String> getNestTagContent(String htmlSource, List<String> tagList, DataFormatStatus dataFormatStatus, boolean isFilter) {
102 | if (htmlSource == null || htmlSource.isEmpty() || tagList == null || tagList.isEmpty()) {
103 | return null;
104 | }
105 |
106 | SystemAssert.assertNotNull(dataFormatStatus);
107 |
108 | Document doc = Jsoup.parse(htmlSource); // pre-parse
109 | Iterator<String> tagIterator = tagList.iterator();
110 | String temp_tag = null;
111 |
112 | List<Element> temp_list_element = new LinkedList<>();
113 | Elements elements = null;
114 | Elements temp_elements = null;
115 | // holds the results of the previous round
116 | List<String> temp_list_line = new LinkedList<>();
117 | Document temp_doc = null; // holds the re-parsed doc
118 | boolean isFirst = true; // true until the first tag has been processed
119 | while (tagIterator.hasNext()) {
120 | temp_tag = tagIterator.next();
121 | if (isFirst) {
122 | elements = doc.getElementsByTag(temp_tag);
123 | isFirst = false;
124 | } else {
125 | // narrow the search to the previous round's results
126 | elements.clear();
127 | for (String line : temp_list_line) {
128 | if (line != null && (!line.isEmpty())) {
129 | temp_doc = Jsoup.parse(line);
130 | temp_elements = temp_doc.getElementsByTag(temp_tag);
131 | if (temp_elements != null && (!temp_elements.isEmpty())) {
132 | elements.addAll(temp_elements);
133 | }
134 | }
135 | }
136 | }
137 |
138 | temp_list_element.clear();
139 | temp_list_element.addAll(elements);
140 |
141 | Iterator<Element> elementIterator = temp_list_element.iterator();
142 |
143 | temp_list_line.clear();
144 | while (elementIterator.hasNext()) {
145 | Element element = elementIterator.next();
146 | temp_list_line.add(getTagContent(element.toString(), temp_tag, DataFormatStatus.TagAllContent));
147 | }
148 | }
149 |
150 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
151 |
152 | return temp_list_line;
153 | }
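// A small sketch of the nested-tag walk above: each tag in the list is searched only inside
// the previous tag's matches, so ("div", "a") returns the anchor inside the div but not the
// one inside the p. The HTML snippet is made up for illustration.
import java.util.Arrays;
import java.util.List;

public class NestTagDemo {
    public static void main(String[] args) {
        String html = "<div><a href='/x'>one</a></div><p><a href='/y'>two</a></p>";
        List<String> tags = Arrays.asList("div", "a");
        // TagAllContent keeps the anchor markup; CleanTxt would reduce it to "one"
        List<String> hits = JsoupHtmlParser.getNestTagContent(html, tags, DataFormatStatus.TagAllContent, false);
        System.out.println(hits); // only the <a href="/x">one</a> block
    }
}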
154 |
155 | /**
156 | * Default entry point for the selector-based lookup
157 | *
158 | * @param htmlSource
159 | * @param selectorList
160 | * @param isFilter
161 | * @return
162 | */
163 | public static List<String> getNodeContentBySelector(String htmlSource, List<String> selectorList, boolean isFilter) {
164 | return getNodeContentBySelector(htmlSource, selectorList, DataFormatStatus.CleanTxt, isFilter);
165 | }
166 |
167 | /**
168 | * Retrieve node data via CSS selectors, passed directly as a list; each selector is applied within the previous selector's matches
169 | *
170 | * @param htmlSource
171 | * @param selectorList
172 | * @return
173 | */
174 | public static List<String> getNodeContentBySelector(String htmlSource, List<String> selectorList, DataFormatStatus dataFormatStatus, boolean isFilter) {
175 | if (htmlSource == null || htmlSource.isEmpty() || selectorList == null || selectorList.isEmpty()) {
176 | return null;
177 | }
178 | SystemAssert.assertNotNull(dataFormatStatus);
179 |
180 | Document doc = Jsoup.parse(htmlSource); // pre-parse
181 |
182 | Iterator<String> selectorIterator = selectorList.iterator();
183 | String temp_selector = null;
184 | List<Element> temp_list_element = new LinkedList<>();
185 | Elements elements = null;
186 | Elements temp_elements = null;
187 | // holds the results of the previous round
188 | List<String> temp_list_line = new LinkedList<>();
189 | Document temp_doc = null; // holds the re-parsed doc
190 | boolean isFirst = true; // true until the first selector has been processed
191 | while (selectorIterator.hasNext()) {
192 | temp_selector = selectorIterator.next();
193 | if (isFirst) {
194 | elements = doc.select(temp_selector);
195 | isFirst = false;
196 | } else {
197 | // narrow the search to the previous round's results
198 | elements.clear();
199 | for (String line : temp_list_line) {
200 | if (line != null && (!line.isEmpty())) {
201 | temp_doc = Jsoup.parse(line);
202 | temp_elements = temp_doc.select(temp_selector);
203 | if (temp_elements != null && (!temp_elements.isEmpty())) {
204 | elements.addAll(temp_elements);
205 | }
206 | }
207 | }
208 | }
209 | temp_list_element.clear();
210 | temp_list_element.addAll(elements);
211 |
212 | Iterator<Element> elementIterator = temp_list_element.iterator();
213 |
214 | temp_list_line.clear();
215 | while (elementIterator.hasNext()) {
216 | Element element = elementIterator.next();
217 | temp_list_line.add(element.toString());
218 | }
219 | }
220 | // final format filtering
221 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
222 |
223 | return temp_list_line;
224 | }
225 |
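// A usage sketch for the selector chain above: the first selector narrows the document to the
// list items, the second picks the title spans inside them. HTML and selectors are illustrative.
import java.util.Arrays;
import java.util.List;

public class SelectorChainDemo {
    public static void main(String[] args) {
        String html = "<ul class='goods'><li><span class='title'>A</span></li>"
                + "<li><span class='title'>B</span></li></ul>";
        List<String> selectors = Arrays.asList("ul.goods li", "span.title");
        List<String> titles = JsoupHtmlParser.getNodeContentBySelector(html, selectors, true);
        System.out.println(titles); // [A, B] once CleanTxt strips the markup
    }
}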
226 | /**
227 | * Second version: when several identical content blocks match a selector, pick only the block at the given index; block positions are 1-based
228 | *
229 | * @param htmlSource
230 | * @param selectorList
231 | * @param dataFormatStatus
232 | * @param isFilter
233 | * @return
234 | */
235 | public static List<String> getNodeContentBySelector4MultiSameBlock(String htmlSource, List<String> selectorList, DataFormatStatus dataFormatStatus, boolean isFilter) {
236 | if (htmlSource == null || htmlSource.isEmpty() || selectorList == null || selectorList.isEmpty()) {
237 | return null;
238 | }
239 | SystemAssert.assertNotNull(dataFormatStatus);
240 |
241 | Document doc = Jsoup.parse(htmlSource); // pre-parse
242 |
243 | Iterator<String> selectorIterator = selectorList.iterator();
244 | String temp_selector = null;
245 | List<Element> temp_list_element = new LinkedList<>();
246 | Elements elements = null;
247 | Elements temp_elements = null;
248 | // holds the results of the previous round
249 | List<String> temp_list_line = new LinkedList<>();
250 | Document temp_doc = null; // holds the re-parsed doc
251 | boolean isFirst = true; // true until the first selector has been processed
252 | String[] split_array = null;
253 | boolean find_block_index = false; // whether a block index was supplied
254 | int block_index = 0; // the parsed block index, if present
255 | while (selectorIterator.hasNext()) {
256 | temp_selector = selectorIterator.next();
257 | // check whether the selector carries a #index# suffix
258 | split_array = temp_selector.split(StaticValue.split_block_index);
259 | temp_selector = split_array[0];
260 | if (split_array.length == 1) {
261 | find_block_index = false;
262 | } else if (split_array.length == 2) {
263 | try {
264 | block_index = Integer.parseInt(split_array[1]);
265 | if (block_index <= 0) {
266 | throw new Exception("block index value <= 0 is wrong; note that the base value is 1");
267 | }
268 | find_block_index = true;
269 | } catch (Exception e) {
270 | find_block_index = false;
271 | e.printStackTrace();
272 | }
273 | } else {
274 | try {
275 | throw new Exception("Malformed jsoup rule: at most one " + StaticValue.split_block_index + " is allowed per rule line, and it must sit at the end;\nto write several rules, separate them with #split_big#");
276 | } catch (Exception e) {
277 | e.printStackTrace();
278 | }
279 | }
280 |
281 | if (isFirst) {
282 | elements = doc.select(temp_selector);
283 | isFirst = false;
284 | } else {
285 | // narrow the search to the previous round's results
286 | elements.clear();
287 | for (String line : temp_list_line) {
288 | if (line != null && (!line.isEmpty())) {
289 | temp_doc = Jsoup.parse(line);
290 | temp_elements = temp_doc.select(temp_selector);
291 | if (temp_elements != null && (!temp_elements.isEmpty())) {
292 | elements.addAll(temp_elements);
293 | }
294 | }
295 | }
296 | }
297 |
298 | // keep only the block at block_index, if one was requested
299 | if (find_block_index && StringOperatorUtil.isNotBlankCollection(elements)) {
300 | int size = elements.size();
301 | if (size >= block_index) {
302 | List<Element> eleList = new LinkedList<>();
303 | eleList.add(elements.get(block_index - 1));
304 | elements = new Elements(eleList);
305 | }
306 | }
307 |
308 | temp_list_element.clear();
309 | temp_list_element.addAll(elements);
310 |
311 | Iterator<Element> elementIterator = temp_list_element.iterator();
312 |
313 | temp_list_line.clear();
314 | while (elementIterator.hasNext()) {
315 | Element element = elementIterator.next();
316 | temp_list_line.add(element.toString());
317 | }
318 | }
319 | // final format filtering
320 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
321 |
322 | return temp_list_line;
323 | }
324 |
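// A sketch of the block-index rule syntax handled above. The literal separator is
// StaticValue.split_block_index, whose value is defined elsewhere in this package; "#index#"
// is assumed here purely for illustration, matching the method's comments.
import java.util.Arrays;
import java.util.List;

public class BlockIndexDemo {
    public static void main(String[] args) {
        // the page repeats an identical block; keep only the second match (indexes are 1-based)
        String html = "<div class='item'>first</div><div class='item'>second</div>";
        List<String> rules = Arrays.asList("div.item#index#2");
        List<String> picked = JsoupHtmlParser.getNodeContentBySelector4MultiSameBlock(
                html, rules, DataFormatStatus.CleanTxt, false);
        System.out.println(picked); // [second]
    }
}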
325 | /**
326 | * Return the value of attribute attr inside the string source
327 | *
328 | * @param source
329 | * @param attr
330 | * @return
331 | */
332 | public static String getAttributeValue(String source, String attr) {
333 | if (source == null || attr == null) {
334 | return null;
335 | }
336 | Document doc = Jsoup.parse(source);
337 | Elements elements = doc.select("[" + attr + "]");
338 | String temp = null;
339 | if (elements != null) {
340 | for (Element ele : elements) {
341 | temp = ele.attr(attr);
342 | }
343 | }
344 | return temp;
345 | }
346 |
347 | /**
348 | * List variant: for each string in the collection, collect the value of attribute attr
349 | *
350 | * @param sourceList
351 | * @param attr
352 | * @return
353 | */
354 | public static List<String> getAttributeValueList(List<String> sourceList, String attr) {
355 | if (sourceList == null || attr == null) {
356 | return null;
357 | }
358 | List<String> resultList = new LinkedList<>();
359 | String selString = "[" + attr + "]";
360 | for (String tempLine : sourceList) {
361 | Document doc = Jsoup.parse(tempLine);
362 | Elements elements = doc.select(selString);
363 | if (elements != null) {
364 | for (Element ele : elements) {
365 | resultList.add(ele.attr(attr));
366 | }
367 | }
368 | }
369 | return resultList;
370 | }
371 |
372 | /**
373 | * Final string filtering; this step is transparent to the caller
374 | *
375 | * @param temp_list_line
376 | * @param dataFormatStatus
377 | * @return
378 | */
379 | private static List<String> doListFilter(List<String> temp_list_line, DataFormatStatus dataFormatStatus, boolean isFilter) {
380 | if (temp_list_line == null || temp_list_line.isEmpty()) {
381 | return null;
382 | }
383 |
384 | SystemAssert.assertNotNull(dataFormatStatus);
385 |
386 | // reduce the final result set to the requested data format
387 | if (dataFormatStatus == DataFormatStatus.CleanTxt) {
388 | List<String> cleanResultList = new LinkedList<>();
389 | String temp_clean = null;
390 | for (String item : temp_list_line) {
391 | if (isFilter) {
392 | item = item.replaceAll(StaticValue.htmlTagRegex, "");
393 | }
394 | if ((temp_clean = getCleanTxt(item)) != null && (!temp_clean.isEmpty())) {
395 | cleanResultList.add(temp_clean);
396 | }
397 | }
398 | return cleanResultList;
399 | }
400 | return temp_list_line;
401 | }
402 |
403 | /**
404 | * URL extraction, without any filter conditions
405 | * @param htmlSource
406 | * @return
407 | */
408 | public static List<String> getAllHref(String htmlSource) {
409 | try {
410 | Document doc = Jsoup.parse(htmlSource); // pre-parse
411 | Elements links = doc.getElementsByTag("a");
412 | String linkHref = null;
413 | List<String> urlList = new LinkedList<>();
414 | for (Element link : links) {
415 | linkHref = link.attr("href");
416 | if (UrlOperatorUtil.isValidUrl(linkHref) && linkHref.startsWith("http:")) {
417 | urlList.add(linkHref.trim());
418 | }
419 | }
420 | return urlList;
421 | } catch (Exception e) {
422 | e.printStackTrace();
423 | }
424 | return null;
425 | }
426 |
427 | /**
428 | * URL extraction, prefixing relative links with the given host
429 | *
430 | * @param fromUrl
431 | * @param host
432 | * @param htmlSource
433 | * @return
434 | */
435 | public static List<String> getAllHref4AddHost(String fromUrl, String host, String htmlSource) {
436 | try {
437 | Document doc = Jsoup.parse(htmlSource); // pre-parse
438 | Elements links = doc.getElementsByTag("a");
439 | String linkHref = null;
440 | List<String> urlList = new LinkedList<>();
441 | for (Element link : links) {
442 | linkHref = link.attr("href");
443 | if (linkHref.startsWith("http://")) {
444 | // already absolute: leave as-is
445 | } else if (linkHref.startsWith("/")) {
446 | // root-relative: prefix with the host
447 | linkHref = "http://" + host + linkHref;
448 | } else {
449 | // relative: resolve against the directory of fromUrl
450 | int last_pos = fromUrl.lastIndexOf("/");
451 | String relative_path = fromUrl.substring(0, last_pos + 1);
452 | linkHref = relative_path + linkHref;
453 | }
454 | if (UrlOperatorUtil.isValidUrl(linkHref) && linkHref.startsWith("http:")) {
455 | urlList.add(linkHref.trim());
456 | }
457 | }
458 | return urlList;
459 | } catch (Exception e) {
460 | e.printStackTrace();
461 | }
462 | return null;
463 | }
464 |
465 | /**
466 | * Remove selected parts of a tag's content; a tentative first version
467 | *
468 | * @param htmlSource
469 | * @param selector
470 | * @param removeSelector
471 | * @return
472 | */
473 | public static String removeInnerContent(String htmlSource, String selector, List<String> removeSelector) {
474 | if (selector == null || StringOperatorUtil.isBlankCollection(removeSelector)) {
475 | return htmlSource;
476 | }
477 | try {
478 | Document doc = Jsoup.parse(htmlSource); // pre-parse
479 | Elements elements = doc.select(selector);
480 | String result = null;
481 | if (elements != null) {
482 | for (Element ele : elements) {
483 | for (String sel : removeSelector) {
484 | ele.select(sel).remove();
485 | }
486 | result = JsoupHtmlParser.getCleanTxt(ele.toString());
487 | break;
488 | }
489 | }
490 | return result;
491 | } catch (Exception e) {
492 | e.printStackTrace();
493 | }
494 | return null;
495 | }
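// A sketch of the three-way link resolution in getAllHref4AddHost above: absolute links are
// kept, root-relative links are prefixed with the host, and relative links are resolved against
// fromUrl's directory. UrlOperatorUtil.isValidUrl is assumed to accept these URLs; the page
// content and host are made up.
import java.util.List;

public class HrefResolveDemo {
    public static void main(String[] args) {
        String page = "<a href='http://other.com/a'>1</a>"
                + "<a href='/b'>2</a>"
                + "<a href='c.html'>3</a>";
        List<String> urls = JsoupHtmlParser.getAllHref4AddHost(
                "http://example.com/news/index.html", "example.com", page);
        // expected: http://other.com/a, http://example.com/b, http://example.com/news/c.html
        System.out.println(urls);
    }
}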
496 |
497 | /**
498 | * Re-crawl data for Baidu Baike entries: consecutive sibling nodes whose class is "para" are merged into one text block
499 | *
500 | * @param htmlSource
501 | * @param selectorList
502 | * @return
503 | */
504 | public static List<String> getContentBySelector(String htmlSource, List<String> selectorList, DataFormatStatus dataFormatStatus, boolean isFilter) {
505 | if (htmlSource == null || htmlSource.isEmpty() || selectorList == null || selectorList.isEmpty()) {
506 | return null;
507 | }
508 | SystemAssert.assertNotNull(dataFormatStatus);
509 |
510 | Document doc = Jsoup.parse(htmlSource); // pre-parse
511 |
512 | Iterator<String> selectorIterator = selectorList.iterator();
513 | String temp_selector = null;
514 | List<Element> temp_list_element = new LinkedList<>();
515 | Elements elements = null;
516 | Elements temp_elements = null;
517 | // holds the results of the previous round
518 | List<String> temp_list_line = new LinkedList<>();
519 | Document temp_doc = null; // holds the re-parsed doc
520 | boolean isFirst = true; // true until the first selector has been processed
521 | while (selectorIterator.hasNext()) {
522 | temp_selector = selectorIterator.next();
523 | if (isFirst) {
524 | elements = doc.select(temp_selector);
525 | isFirst = false;
526 | } else {
527 | // narrow the search to the previous round's results
528 | elements.clear();
529 | for (String line : temp_list_line) {
530 | if (line != null && (!line.isEmpty())) {
531 | temp_doc = Jsoup.parse(line);
532 | temp_elements = temp_doc.select(temp_selector);
533 | if (temp_elements != null && (!temp_elements.isEmpty())) {
534 | elements.addAll(temp_elements);
535 | }
536 | }
537 | }
538 | }
539 | temp_list_element.clear();
540 | temp_list_element.addAll(elements);
541 |
542 | Iterator<Element> elementIterator = temp_list_element.iterator();
543 | temp_list_line.clear();
544 | while (elementIterator.hasNext()) {
545 | Element element = elementIterator.next();
546 | String aClass = element.attr("class");
547 | String paraText = null;
548 | if ("para".equals(aClass)) {
549 | paraText = element.text();
550 | // append the text of every directly following "para" sibling; the null check avoids an NPE at the last sibling
551 | Element element2 = element.nextElementSibling();
552 | while (element2 != null && "para".equals(element2.attr("class"))) {
553 | paraText += element2.text();
554 | element2 = element2.nextElementSibling();
555 | }
556 | } else {
557 | paraText = element.toString();
558 | }
559 | temp_list_line.add(paraText);
560 | }
561 | }
562 | // final format filtering
563 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
564 |
565 | return temp_list_line;
566 | }
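// A sketch of the "para" merge above, using made-up Baike-style markup. Note that the trailing
// sibling is still visited by the outer loop, so it also appears on its own in the result.
import java.util.Arrays;
import java.util.List;

public class ParaMergeDemo {
    public static void main(String[] args) {
        String html = "<div class='para'>First.</div><div class='para'>Second.</div>";
        List<String> merged = JsoupHtmlParser.getContentBySelector(
                html, Arrays.asList("div.para"), DataFormatStatus.CleanTxt, false);
        System.out.println(merged); // [First.Second., Second.]
    }
}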
567 |
568 | /**
569 | * Data crawling for the 中国军医网 site: a "PreCaption" node absorbs the text of the following "ColumnValue" (or unclassed) nodes
570 | *
571 | * @param htmlSource
572 | * @param selectorList
573 | * @return
574 | */
575 | public static List<String> getContentByChinese(String htmlSource, List<String> selectorList, DataFormatStatus dataFormatStatus, boolean isFilter) {
576 | if (htmlSource == null || htmlSource.isEmpty() || selectorList == null || selectorList.isEmpty()) {
577 | return null;
578 | }
579 | SystemAssert.assertNotNull(dataFormatStatus);
580 |
581 | Document doc = Jsoup.parse(htmlSource); // pre-parse
582 |
583 | Iterator<String> selectorIterator = selectorList.iterator();
584 | String temp_selector = null;
585 | List<Element> temp_list_element = new LinkedList<>();
586 | Elements elements = null;
587 | Elements temp_elements = null;
588 | // holds the results of the previous round
589 | List<String> temp_list_line = new LinkedList<>();
590 | Document temp_doc = null; // holds the re-parsed doc
591 | boolean isFirst = true; // true until the first selector has been processed
592 | while (selectorIterator.hasNext()) {
593 | temp_selector = selectorIterator.next();
594 | if (isFirst) {
595 | elements = doc.select(temp_selector);
596 | isFirst = false;
597 | } else {
598 | // narrow the search to the previous round's results
599 | elements.clear();
600 | for (String line : temp_list_line) {
601 | if (line != null && (!line.isEmpty())) {
602 | temp_doc = Jsoup.parse(line);
603 | temp_elements = temp_doc.select(temp_selector);
604 | if (temp_elements != null && (!temp_elements.isEmpty())) {
605 | elements.addAll(temp_elements);
606 | }
607 | }
608 | }
609 | }
610 | temp_list_element.clear();
611 | temp_list_element.addAll(elements);
612 |
613 | Iterator<Element> elementIterator = temp_list_element.iterator();
614 |
615 | temp_list_line.clear();
616 | while (elementIterator.hasNext()) {
617 | Element element = elementIterator.next();
618 | String paraText = element.toString();
619 | String aClass = element.attr("class");
620 | // guard hasNext so the lookahead cannot run past the last element
621 | while ("PreCaption".equals(aClass) && elementIterator.hasNext()) {
622 | element = elementIterator.next();
623 | aClass = element.attr("class");
624 | if ("ColumnValue".equals(aClass) || "".equals(aClass)) {
625 | paraText += element.text();
626 | }
627 | }
628 | temp_list_line.add(paraText);
629 | }
630 | }
631 | // final format filtering
632 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
633 |
634 | return temp_list_line;
635 | }
636 | }
--------------------------------------------------------------------------------