├── .github ├── my.JPG └── FUNDING.yml ├── src ├── main │ ├── resources │ │ ├── demo │ │ │ └── demo.xlsx │ │ └── conf.properties │ └── java │ │ └── com │ │ └── xyzj │ │ ├── crawler │ │ ├── utils │ │ │ ├── .DS_Store │ │ │ ├── parsehtmlstring │ │ │ │ ├── DataFormatStatus.java │ │ │ │ ├── JsoupSelectItemPojo.java │ │ │ │ ├── SystemAssert.java │ │ │ │ ├── StaticValue.java │ │ │ │ ├── UrlOperatorUtil.java │ │ │ │ ├── StringOperatorUtil.java │ │ │ │ ├── ParseTsUrls.java │ │ │ │ ├── DownloadTsFile.java │ │ │ │ ├── RegexUtil.java │ │ │ │ ├── RegexPaserUtil.java │ │ │ │ └── JsoupHtmlParser.java │ │ │ ├── gethtmlstring │ │ │ │ ├── BaseHttpCallBack.java │ │ │ │ ├── UrlUtil.java │ │ │ │ ├── SpiderUtil.java │ │ │ │ ├── M3u8HttpClientUtil.java │ │ │ │ └── HttpResponseUtil.java │ │ │ ├── proxyip │ │ │ │ ├── IPModel │ │ │ │ │ ├── IPMessage.java │ │ │ │ │ └── SerializeUtil.java │ │ │ │ ├── spider │ │ │ │ │ ├── doRule │ │ │ │ │ │ ├── ProxyFilterSpiderRule.java │ │ │ │ │ │ └── ProxyXcSpiderRule.java │ │ │ │ │ └── docrawler │ │ │ │ │ │ └── ProxyXcDoMain.java │ │ │ │ └── config │ │ │ │ │ ├── RedisUtil.java │ │ │ │ │ └── RedisConfig.java │ │ │ ├── packageutil │ │ │ │ ├── GetAllFiles.java │ │ │ │ ├── PackageGetClassUtil.java │ │ │ │ └── PackageGetJavaUtil.java │ │ │ ├── importfrom │ │ │ │ ├── FileCopyUtil.java │ │ │ │ ├── IOUtil.java │ │ │ │ └── ImportExcelUtil.java │ │ │ ├── savetomysql │ │ │ │ ├── SaveToMysql.java │ │ │ │ └── SaveToOracle.java │ │ │ └── authcode │ │ │ │ └── AuthcodeDistinguisher.java │ │ ├── framework │ │ │ ├── enums │ │ │ │ └── FactionEnum.java │ │ │ ├── interfaces │ │ │ │ └── ISpiderRule.java │ │ │ ├── abstracts │ │ │ │ └── SpiderRuleAbstract.java │ │ │ ├── entity │ │ │ │ ├── Goods.java │ │ │ │ └── Param.java │ │ │ ├── factory │ │ │ │ └── SpiderRuleFactory.java │ │ │ ├── runnable │ │ │ │ └── SpiderRunnable.java │ │ │ ├── defaults │ │ │ │ ├── DefaultM3u8SpiderRule.java │ │ │ │ └── DefaultSpiderRule.java │ │ │ └── handler │ │ │ │ └── SpiderRuleHandler.java │ │ └── spidertask │ │ │ ├── example │ │ │ ├── dorule │ │ │ │ ├── DoRule58.java │ │ │ │ └── DoRule51Cto.java │ │ │ └── docrawler │ │ │ │ └── DoCrawler58.java │ │ │ └── zlr │ │ │ ├── docrawler │ │ │ ├── SsqDoErrorMain.java │ │ │ └── SsqDoMain.java │ │ │ └── dorule │ │ │ └── SsqDetailSpiderRule.java │ │ └── bigdata │ │ └── in │ │ ├── DemoData.java │ │ ├── TestFileUtil.java │ │ ├── ReadTest.java │ │ └── DemoDataListener.java └── test │ └── java │ └── com │ └── xyzj │ └── crawler │ └── DefaultSpiderRuleTest.java ├── .gitignore ├── ddl └── goods.sql ├── README.md └── pom.xml /.github/my.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xyzj-dev/xyzj-crawler/HEAD/.github/my.JPG -------------------------------------------------------------------------------- /src/main/resources/demo/demo.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xyzj-dev/xyzj-crawler/HEAD/src/main/resources/demo/demo.xlsx -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xyzj-dev/xyzj-crawler/HEAD/src/main/java/com/xyzj/crawler/utils/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | 
!.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | 12 | ### IntelliJ IDEA ### 13 | .idea 14 | *.iws 15 | *.iml 16 | *.ipr 17 | 18 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/enums/FactionEnum.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.enums; 2 | 3 | /** 4 | * @author lyy 5 | * @since 2019-06-28 11:01 6 | */ 7 | public enum FactionEnum { 8 | getHtml, 9 | getHtmlWithJavaScript, 10 | getJson, 11 | ; 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/DataFormatStatus.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | /** 4 | * 对所要结果数据格式的枚举,现分为2种,纯文本和标签自身的全部内容,默认为纯文本 5 | * 6 | * @author zel 7 | * 8 | */ 9 | public enum DataFormatStatus { 10 | CleanTxt, TagAllContent 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/gethtmlstring/BaseHttpCallBack.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.gethtmlstring; 2 | 3 | import java.io.InputStream; 4 | 5 | /** 6 | * http回调 7 | * */ 8 | public interface BaseHttpCallBack { 9 | 10 | void httpCallBack(int responseCode, InputStream inputStream); 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/bigdata/in/DemoData.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.bigdata.in; 2 | 3 | import java.util.Date; 4 | import lombok.Data; 5 | 6 | /** 7 | * 基础数据类.这里的排序和excel里面的排序一致 8 | * 9 | * @author Jiaju Zhuang 10 | **/ 11 | @Data 12 | public class DemoData { 13 | private String string; 14 | private Date date; 15 | private Double doubleData; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/interfaces/ISpiderRule.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.interfaces; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | 5 | 6 | /** 7 | * 爬虫规则 接口 8 | * 9 | * @author liuyangyang 10 | */ 11 | public interface ISpiderRule { 12 | 13 | void runSpider(Param param,ISpiderRule spiderRule); 14 | 15 | void handlerGoods(Param param, String htmlSource); 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/abstracts/SpiderRuleAbstract.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.abstracts; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 5 | 6 | /** 7 | * 抽象类 8 | * @author liuyangyang 9 | * */ 10 | public abstract class SpiderRuleAbstract implements ISpiderRule { 11 | 12 | @Override 13 | public void runSpider(Param param, ISpiderRule spiderRule) { 14 | 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/JsoupSelectItemPojo.java: 
-------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | public class JsoupSelectItemPojo { 4 | private String selector; 5 | 6 | public JsoupSelectItemPojo(String selector, boolean have_remove_sel) { 7 | this.selector = selector; 8 | } 9 | 10 | public String getSelector() { 11 | return selector; 12 | } 13 | 14 | public void setSelector(String selector) { 15 | this.selector = selector; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/resources/conf.properties: -------------------------------------------------------------------------------- 1 | ########redis############ 2 | jedis.addr=192.168.34.251 3 | jedis.port=6379 4 | jedis.passwd= 5 | 6 | 7 | #########mysql####### 8 | mysql.url=jdbc:mysql://localhost/crawler?characterEncoding=utf8&useSSL=false 9 | mysql.username=root 10 | mysql.password=x5 11 | 12 | #mysql.url=jdbc:mysql://192.168.34.250/crawler?characterEncoding=utf8&useSSL=false&generateSimpleParameterMetadata=true 13 | #mysql.username=root 14 | #mysql.password=root 15 | 16 | 17 | 18 | #oracle 19 | oracle.url=jdbc:oracle:thin:@10.64.2.62:1521:testdb 20 | oracle.username=xdkf 21 | oracle.password=xdkf -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/proxyip/IPModel/IPMessage.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.proxyip.IPModel; 2 | 3 | import java.io.Serializable; 4 | import lombok.Data; 5 | 6 | 7 | @Data 8 | public class IPMessage implements Serializable { 9 | private static final long serialVersionUID = 1L; 10 | 11 | /** 12 | * ip地址 13 | */ 14 | private String ip; 15 | 16 | /** 17 | * 端口号 18 | */ 19 | private String port; 20 | 21 | 22 | /** 23 | * 类型 24 | */ 25 | private String type; 26 | 27 | /** 28 | * 延迟 29 | */ 30 | private String speed; 31 | 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/entity/Goods.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.entity; 2 | 3 | import java.io.Serializable; 4 | import lombok.Data; 5 | 6 | /** 7 | * @author liuyangyang 8 | * @since 2017-12-05 11:49 9 | */ 10 | @Data 11 | public class Goods implements Serializable { 12 | 13 | /** 主键 id*/ 14 | private Integer id; 15 | 16 | /** 类型 */ 17 | private String type; 18 | 19 | /**名称 详细内容*/ 20 | private String name; 21 | 22 | /**来源网站*/ 23 | private String webUrl; 24 | 25 | /**提供*/ 26 | private String provide; 27 | 28 | /** 排序列*/ 29 | private String orderNum; 30 | 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/test/java/com/xyzj/crawler/DefaultSpiderRuleTest.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory; 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 6 | import org.junit.Test; 7 | 8 | /** 9 | * @author lyy 10 | * @since 2018-10-27 14:06 11 | */ 12 | 13 | public class DefaultSpiderRuleTest { 14 | 15 | @Test 16 | public void runSpider(){ 17 | Param param = new Param(); 18 | param.setWebUrl("https://www.baidu.com"); 19 | ISpiderRule spiderRule = new SpiderRuleFactory().getInstance(); 20 | 
spiderRule.runSpider(param, spiderRule);
21 |     }
22 | }
23 | 
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with a single custom sponsorship URL
13 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/parsehtmlstring/SystemAssert.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.parsehtmlstring;
 2 | 
 3 | /**
 4 |  * Simple in-house assertion helper, so callers do not need a JUnit-style assert
 5 |  * 
 6 |  * @author zel
 7 |  * 
 8 |  */
 9 | public class SystemAssert {
10 | 	public static void assertNotNull(Object obj) {
11 | 		if (obj == null) {
12 | 			try {
13 | 				throw new Exception("object should not be null,please check");
14 | 			} catch (Exception e) {
15 | 				e.printStackTrace();
16 | 				System.exit(0);
17 | 			}
18 | 		}
19 | 	}
20 | 
21 | 	public static void assertTrue(boolean bool, String message) {
22 | 		if (!bool) { // fail when the asserted condition does not hold
23 | 			try {
24 | 				throw new Exception(message);
25 | 			} catch (Exception e) {
26 | 				e.printStackTrace();
27 | 				System.exit(0);
28 | 			}
29 | 		}
30 | 	}
31 | 
32 | }
33 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/framework/factory/SpiderRuleFactory.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.framework.factory;
 2 | 
 3 | import com.xyzj.crawler.framework.defaults.DefaultSpiderRule;
 4 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 5 | 
 6 | /**
 7 |  * @author lyy
 8 |  * @since 2018-10-27 13:57
 9 |  */
10 | public class SpiderRuleFactory {
11 | 
12 | 
13 |     //default implementation
14 |     private ISpiderRule spiderRule = new DefaultSpiderRule();
15 | 
16 | 
17 |     //no-arg constructor
18 |     public SpiderRuleFactory() {
19 | 
20 |     }
21 | 
22 |     //constructor taking a custom rule
23 |     public SpiderRuleFactory(ISpiderRule spiderRule) {
24 |         this.spiderRule = spiderRule;
25 |     }
26 | 
27 | 
28 |     //get the configured instance
29 |     public ISpiderRule getInstance() {
30 |         return spiderRule;
31 |     }
32 | 
33 | 
34 | 
35 | }
36 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/parsehtmlstring/StaticValue.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.parsehtmlstring;
 2 | 
 3 | /**
 4 |  * Shared static constants
 5 |  * 
 6 |  * @author zel
 7 |  * 
 8 |  */
 9 | public class StaticValue {
10 | 	public static String default_encoding = "UTF-8";
11 | 	public static String encoding_gbk = "gbk";
12 | 
13 | 	public static String separator_tab = "\t";
14 | 	public static String separator_vertical_line = "\\|";
15 | 	public static String separator_space = " ";
16 | 	public static String separator_next_line = "\n";
17 | 
18 | 	public static String NULL = null;
19 | 
20 | 	/**
21 | 	 * Regex for the HTML tags that should be stripped out
22 | 	 */
23 | 	public static String htmlTagRegex = "<[^>]+>"; // NOTE: assumed generic tag pattern; the original literal was lost
24 | 
25 | 	//block separator marker; kept identical to the spider side to avoid an extra dependency
26 | 	public static String split_block_index = "#block_index#";
27 | 
28 | }
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/framework/runnable/SpiderRunnable.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.framework.runnable;
 2 | 
 3 | import com.xyzj.crawler.framework.entity.Param;
 4 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 5 | import lombok.extern.slf4j.Slf4j;
 6 | 
 7 | 
 8 | /**
 9 |  * Runnable wrapper for one crawl task
10 |  *
11 |  * @author liuyangyang
12 |  */
13 | @Slf4j
14 | public class SpiderRunnable implements Runnable {
15 | 
16 |     //the wrapped parameters
17 |     private Param param;
18 | 
19 |     //the rule to run
20 |     private ISpiderRule spiderRule;
21 | 
22 |     //constructor
23 |     public SpiderRunnable(ISpiderRule spiderRule, Param param) {
24 |         super();
25 |         this.spiderRule = spiderRule;
26 |         this.param = param;
27 |     }
28 | 
29 |     @Override
30 |     public void run() {
31 |         spiderRule.runSpider(param, spiderRule);
32 | 
33 |     }
34 | }
35 | 
36 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/parsehtmlstring/UrlOperatorUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.parsehtmlstring;
 2 | 
 3 | import java.net.MalformedURLException;
 4 | import java.net.URL;
 5 | 
 6 | /**
 7 |  * URL helpers, e.g. extracting a URL's domain, keyword, etc.
 8 |  * 
 9 |  * @author zel
10 |  * 
11 |  */
12 | public class UrlOperatorUtil {
13 | 
14 | 	public static boolean isValidUrl(String url) {
15 | 		if (url == null || url.isEmpty()) {
16 | 			return false;
17 | 		}
18 | 		try {
19 | 			@SuppressWarnings("unused")
20 | 			URL urlObj = new URL(url);
21 | 			return true;
22 | 		} catch (MalformedURLException e) {
23 | 			// e.printStackTrace();
24 | 		}
25 | 		return false;
26 | 	}
27 | 
28 | 	public static void main(String[] args) throws Exception {
29 | 		@SuppressWarnings("unused")
30 | 		UrlOperatorUtil urlOperatorUtil = new UrlOperatorUtil();
31 | 
32 | 	}
33 | }
34 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/spidertask/example/dorule/DoRule58.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.spidertask.example.dorule;
 2 | 
 3 | import com.xyzj.crawler.framework.entity.Param;
 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 6 | 
 7 | /**
 8 |  * Single-page crawl
 9 |  *
10 |  * @author lyy
11 |  * @since 2018-10-27 14:22
12 |  */
13 | public class DoRule58 {
14 | 
15 |     public static void main(String[] args) {
16 | 
17 |         //get the default rule from the factory
18 |         ISpiderRule spiderRule = new SpiderRuleFactory().getInstance();
19 |         //build the parameters
20 |         Param param = new Param();
21 |         param.setWebUrl("https://cq.58.com/shouji/?PGTID=0d100000-0002-5d3a-b0c9-e83a870d03be&ClickID=3");
22 | 
23 |         //go
24 |         spiderRule.runSpider(param, spiderRule);
25 | 
26 | 
27 | 
28 |     }
29 | }
30 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/framework/defaults/DefaultM3u8SpiderRule.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.framework.defaults;
 2 | 
 3 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract;
 4 | import
com.xyzj.crawler.framework.entity.Param; 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 6 | import com.xyzj.crawler.utils.parsehtmlstring.ParseTsUrls; 7 | import lombok.extern.slf4j.Slf4j; 8 | 9 | /** 10 | * @author lyy 11 | * @since 2018-11-25 19:40 12 | */ 13 | @Slf4j 14 | public class DefaultM3u8SpiderRule extends SpiderRuleAbstract { 15 | @Override 16 | public void runSpider(Param param, ISpiderRule spiderRule) { 17 | //执行解析 18 | new ParseTsUrls(param.getWebUrl(), param.getHeaderInfos(), param.getFileFullName()).httpRequestForTsUrls(); 19 | log.info("文件生成成功......"); 20 | } 21 | 22 | @Override 23 | public void handlerGoods(Param param, String htmlSource) { 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/StringOperatorUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * String常用操作类 7 | * 8 | * @author zel 9 | * 10 | */ 11 | public class StringOperatorUtil { 12 | public static boolean isBlank(String str) { 13 | if (str == null || str.trim().length() == 0) { 14 | return true; 15 | } 16 | return false; 17 | } 18 | 19 | public static boolean isBlankCollection(List list) { 20 | if (list == null || list.isEmpty()) { 21 | return true; 22 | } 23 | return false; 24 | } 25 | 26 | public static boolean isNotBlank(String str) { 27 | if (str == null || str.trim().length() == 0) { 28 | return false; 29 | } 30 | return true; 31 | } 32 | 33 | public static boolean isNotBlankCollection(List list) { 34 | if (list == null || list.isEmpty()) { 35 | return false; 36 | } 37 | return true; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/bigdata/in/TestFileUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.bigdata.in; 2 | 3 | import java.io.File; 4 | import java.io.InputStream; 5 | 6 | public class TestFileUtil { 7 | 8 | public static InputStream getResourcesFileInputStream(String fileName) { 9 | return Thread.currentThread().getContextClassLoader().getResourceAsStream("" + fileName); 10 | } 11 | 12 | public static String getPath() { 13 | return TestFileUtil.class.getResource("/").getPath(); 14 | } 15 | 16 | public static File createNewFile(String pathName) { 17 | File file = new File(getPath() + pathName); 18 | if (file.exists()) { 19 | file.delete(); 20 | } else { 21 | if (!file.getParentFile().exists()) { 22 | file.getParentFile().mkdirs(); 23 | } 24 | } 25 | return file; 26 | } 27 | 28 | public static File readFile(String pathName) { 29 | return new File(getPath() + pathName); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/handler/SpiderRuleHandler.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.handler; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 5 | import com.xyzj.crawler.utils.gethtmlstring.HttpResponseUtil; 6 | import lombok.extern.slf4j.Slf4j; 7 | 8 | /** 9 | * @author lyy 10 | * @since 2018-10-27 13:08 11 | */ 12 | @Slf4j 13 | public class SpiderRuleHandler { 14 | public void handler(Param param, ISpiderRule spiderRule) { 15 | try { 16 | //第一步 拿到源码 17 | 
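// NOTE (assumption, the fetcher's source is not shown here): getHtmlSource()
// presumably picks the fetch strategy from param.getFactionEnum()
// (getHtml / getHtmlWithJavaScript / getJson)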
String htmlSource = HttpResponseUtil.getHtmlSource(param);
18 |             if (htmlSource == null) return;
19 |             //step 2: match out the wanted content and hand it on
20 |             spiderRule.handlerGoods(param, htmlSource);
21 |         } finally {
22 |             //step 3: count the latch down, if there is one
23 |             if (param.getCountDownLatch() != null) {
24 |                 param.getCountDownLatch().countDown();
25 |                 log.info("还有{}个任务等待中", param.getCountDownLatch().getCount());
26 |             }
27 |         }
28 |     }
29 | }
30 | 
31 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/packageutil/GetAllFiles.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.packageutil;
 2 | 
 3 | import java.io.File;
 4 | 
 5 | /**
 6 |  * Walk a directory tree
 7 |  */
 8 | public class GetAllFiles {
 9 | 
10 |     public static void main(String[] args) {
11 |         //put a real path here
12 |         String path="F:\\QQ文档";
13 |         //run it
14 |         getFiles(path);
15 |     }
16 | 
17 |     /**
18 |      * Recursively list every file and folder under a path
19 |      */
20 | 
21 |     public static void getFiles(String path) {
22 |         File file = new File(path);
23 |         // if the path is a directory
24 |         if (file.isDirectory()) {
25 |             // list everything under it
26 |             File[] files = file.listFiles();
27 |             for (int i = 0; i < files.length; i++) {
28 |                 // recurse into sub-directories
29 |                 if (files[i].isDirectory()) {
30 |                     System.out.println("目录:" + files[i].getPath());
31 |                     getFiles(files[i].getPath());
32 |                 } else {
33 |                     System.out.println("文件:" + files[i].getPath());
34 |                 }
35 | 
36 |             }
37 |         } else {
38 |             System.out.println("文件:" + file.getPath());
39 |         }
40 |     }
41 | }
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/bigdata/in/ReadTest.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.bigdata.in;
 2 | 
 3 | import com.alibaba.excel.EasyExcel;
 4 | import java.io.File;
 5 | import org.slf4j.Logger;
 6 | import org.slf4j.LoggerFactory;
 7 | 
 8 | /**
 9 |  * The usual way to read an excel file
10 |  *
11 |  * @author Jiaju Zhuang
12 |  */
13 | 
14 | public class ReadTest {
15 |     private static final Logger LOGGER = LoggerFactory.getLogger(ReadTest.class);
16 | 
17 |     private static String fileName ;
18 |     public void simpleRead() {
19 |         // Approach 1:
20 |         if (fileName == null) {
21 |             fileName = TestFileUtil.getPath() + "demo" + File.separator + "demo.xlsx";
22 |         }
23 |         // specify the class to map rows onto, then read the first sheet; the stream is closed automatically
24 |         LOGGER.info("文件路径是{}",fileName);
25 |         EasyExcel.read(fileName, DemoData.class, new DemoDataListener()).sheet().doRead();
26 |     }
27 | 
28 |     public static void main(String[] args) {
29 |         if (args.length!=0) {
30 |             fileName = args[0];
31 |         }
32 |         long l = System.currentTimeMillis();
33 |         ReadTest readTest = new ReadTest();
34 |         readTest.simpleRead();
35 |         long l1 = System.currentTimeMillis() - l;
36 |         LOGGER.info("总耗时{}毫秒 ",l1);
37 |     }
38 | 
39 | 
40 | }
41 | 
--------------------------------------------------------------------------------
/ddl/goods.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Navicat MySQL Data Transfer
 3 | 
 4 | Source Server         : 192.168.73.21
 5 | Source Server Version : 50719
 6 | Source Host           : 192.168.73.21:3306
 7 | Source Database       : crawler
 8 | 
 9 | Target Server Type    : MYSQL
10 | Target Server Version : 50719
11 | File Encoding         : 65001
12 | 
13 | Date: 2018-01-29 19:10:20
14 | */
15 | 
16 | SET FOREIGN_KEY_CHECKS=0;
17 | 
18 | -- ----------------------------
19 | -- Table structure for goods
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `goods`;
22 | CREATE TABLE `goods` (
23 |   `id` int(11) NOT NULL AUTO_INCREMENT,
24 |   `type` text COMMENT '类型',
25 |   `name` longtext COMMENT '名称',
26 |   `webUrl` text COMMENT '来源网站',
27 |   `provide` text COMMENT '提供方',
28 |   `orderNum` text COMMENT '排序列',
29 |   PRIMARY KEY (`id`)
30 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
31 | 
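-- NOTE: `ungoods` below mirrors `goods` and stores the URLs whose fetch failed;
-- SsqDoErrorMain reads it back and retries those records.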
32 | DROP TABLE IF EXISTS `ungoods`;
33 | CREATE TABLE `ungoods` (
34 |   `id` int(11) NOT NULL AUTO_INCREMENT,
35 |   `type` text COMMENT '类型',
36 |   `name` longtext COMMENT '名称',
37 |   `webUrl` text COMMENT '来源网站',
38 |   `provide` text COMMENT '提供方',
39 |   `orderNum` text COMMENT '排序列',
40 |   PRIMARY KEY (`id`)
41 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
42 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/importfrom/FileCopyUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.importfrom;
 2 | 
 3 | 
 4 | import java.io.File;
 5 | import java.nio.file.Files;
 6 | 
 7 | /**
 8 |  * Copy a folder entry (the source may contain files and sub-folders)
 9 |  */
10 | public class FileCopyUtil {
11 | 
12 | 
13 | 
14 |     //the copy method
15 |     public static void copy(String filePath, String srcPath, String targetPath) throws Exception {
16 |         //the source file
17 |         File srcFile=new File(srcPath + filePath);
18 | 
19 |         //the target file
20 |         File targetFile=new File(targetPath+filePath);
21 |         if(!targetFile.getParentFile().exists()){
22 |             targetFile.getParentFile().mkdirs();
23 |         }
24 |         //remove any stale target, then copy
25 |         targetFile.delete();
26 |         Files.copy(srcFile.toPath(), targetFile.toPath());
27 |     }
28 | 
29 | 
30 |     //entry point
31 |     public static void main(String[] args) throws Exception {
32 |         //copy one file
33 |         String filePath = "/src/main/java/com/xyzj/crawler/framework/defaults/DefaultM3u8SpiderRule.java";
34 |         String srcPath = "/Users/liuyangyang/workspace/xyzj/xyzj-crawler";
35 |         String targetPath = "/Users/liuyangyang/Downloads";
36 |         copy(filePath,srcPath,targetPath);
37 |         //done
38 |         System.out.println("文件拷贝完成!");
39 |     }
40 | }
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/IPModel/SerializeUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.proxyip.IPModel;
 2 | 
 3 | import java.io.ByteArrayInputStream;
 4 | import java.io.ByteArrayOutputStream;
 5 | import java.io.ObjectInputStream;
 6 | import java.io.ObjectOutputStream;
 7 | 
 8 | public class SerializeUtil {
 9 |     public static byte[] serialize(Object object) {
10 |         ObjectOutputStream oos;
11 |         ByteArrayOutputStream baos;
12 |         try {
13 |             // serialize
14 |             baos = new ByteArrayOutputStream();
15 |             oos = new ObjectOutputStream(baos);
16 |             oos.writeObject(object);
17 | 
18 |             byte[] bytes = baos.toByteArray();
19 | 
20 |             return bytes;
21 |         } catch (Exception e) {
22 |             e.printStackTrace();
23 |         }
24 |         return null;
25 |     }
26 | 
27 |     //deserialize
28 |     public static Object unSerialize(byte[] bytes) {
29 |         ByteArrayInputStream bais;
30 |         ObjectInputStream ois;
31 | 
32 |         try {
33 |             // deserialize
34 |             bais = new ByteArrayInputStream(bytes);
35 |             ois = new ObjectInputStream(bais);
36 |             return ois.readObject();
37 |         } catch (Exception e) {
38 |             e.printStackTrace();
39 |         }
40 | 
41 |         return null;
42 |     }
43 | }
44 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # xyzj-crawler
 2 | 
 3 | ## 1. Project introduction
 4 | 
 5 | A crawler? No, no, no.
 6 | It is simply a web page information collector.
 7 | 
 8 | Written in Java: if you know Java, it works out of the box.
 9 | 
10 | If you would like to study the implementation or talk tech, you are welcome to discuss it with my team.
11 | For exchange and learning,
12 | contact lyy, QQ: 719882551.
13 | 
14 | 
15 | 
16 | ### Tech exchange && business cooperation
17 | If you are interested, join the discussion group: add me on WeChat and mention "join group".
18 | Paid crawling of public resources is available: add me on WeChat and mention "cooperation".
19 | 
20 | ![](.github/my.JPG)
21 | 
22 | 
23 | 
24 | 
25 | ## 2. Usage
26 | 
27 | ### 2-1 Extracting the useful parts of a page
28 | 
29 | ```shell
30 | # Step 1: clone the project and import it into IDEA
31 | git clone https://github.com/xyzj-dev/xyzj-crawler
32 | 
33 | # Step 2: create the database and tables
34 | DDL script: goods.sql
35 | 
36 | # Step 3: adjust the database settings in conf.properties
37 | mysql.url=jdbc:mysql://localhost/crawler?characterEncoding=utf8&useSSL=false
38 | mysql.username=xxx
39 | mysql.password=xxx
40 | 
41 | 
42 | # Step 4: get familiar with the code
43 | -- default implementation
44 | 1) DefaultSpiderRuleTest
45 | 
46 | -- 58.com, single page
47 | 2) DoRule58
48 | 
49 | -- 58.com, paginated, crawled with a thread pool
50 | 3) DoCrawler58
51 | 
52 | ```
53 | 
54 | 
55 | 
56 | ### 2-2 Downloading m3u8 video
57 | 
58 | ```shell
59 | -- the m3u8 rule implementation
60 | 1) com.xyzj.crawler.framework.defaults.DefaultM3u8SpiderRule
61 | 
62 | -- an m3u8 download example
63 | 2) com.xyzj.crawler.spidertask.example.dorule.DoRule51Cto
64 | 
65 | ```
66 | 
67 | 
68 | 
69 | ## 3. Crawling the site you want
70 | 
71 | 
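72 | To point the framework at a new site, extend `SpiderRuleAbstract`, override `handlerGoods` with your own extraction logic, and hand the rule to `SpiderRuleFactory`. Below is a minimal sketch: the class name, target URL and `<title>` regex are made-up placeholders, while the wiring mirrors `DefaultSpiderRule` and `DoRule58`.
73 | 
74 | ```java
75 | package com.xyzj.crawler.spidertask.example.dorule;
76 | 
77 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract;
78 | import com.xyzj.crawler.framework.entity.Param;
79 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
80 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler;
81 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
82 | import com.xyzj.crawler.utils.parsehtmlstring.RegexUtil;
83 | 
84 | // hypothetical rule: prints the <title> of each fetched page
85 | public class MyTitleSpiderRule extends SpiderRuleAbstract {
86 | 
87 |     @Override
88 |     public void runSpider(Param param, ISpiderRule spiderRule) {
89 |         // same pattern as DefaultSpiderRule: the handler fetches the HTML,
90 |         // then calls back into handlerGoods below
91 |         new SpiderRuleHandler().handler(param, spiderRule);
92 |     }
93 | 
94 |     @Override
95 |     public void handlerGoods(Param param, String htmlSource) {
96 |         // swap this regex for whatever matches the fields on your target site
97 |         String title = RegexUtil.getSubUtilSimple(htmlSource, "<title>(.*?)</title>");
98 |         System.out.println(param.getWebUrl() + " -> " + title);
99 |     }
100 | 
101 |     public static void main(String[] args) {
102 |         Param param = new Param();
103 |         param.setWebUrl("https://www.example.com");
104 |         ISpiderRule rule = new SpiderRuleFactory(new MyTitleSpiderRule()).getInstance();
105 |         rule.runSpider(param, rule);
106 |     }
107 | }
108 | ```
109 | 
110 | For paginated targets, wrap the rule in `SpiderRunnable` and submit one task per page, the way `DoCrawler58` does.
111 | 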
112 | ## 4. Simulated login
113 | 
114 | ```shell
115 | # Already supported: put the login headers (for example a Cookie) into Param
116 | param.getHeaderInfos().put("Cookie", "your-session-cookie");
117 | ```
118 | 
119 | 
120 | 
121 | ## 5. IP proxies
122 | 
123 | ```shell
124 | # Implemented: see ProxyXcDoMain / ProxyFilterSpiderRule and Param.setProxyIp / setProxyPort.
125 | # A proper write-up will follow.
126 | ```
127 | 
128 | 
129 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/spider/doRule/ProxyFilterSpiderRule.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.proxyip.spider.doRule;
 2 | 
 3 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract;
 4 | import com.xyzj.crawler.framework.entity.Param;
 5 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler;
 6 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 7 | import com.xyzj.crawler.utils.proxyip.IPModel.IPMessage;
 8 | import com.xyzj.crawler.utils.proxyip.config.RedisUtil;
 9 | import lombok.extern.slf4j.Slf4j;
10 | 
11 | /**
12 |  * @author lyy
13 |  * @since 2018-10-27 13:08
14 |  */
15 | @Slf4j
16 | public class ProxyFilterSpiderRule extends SpiderRuleAbstract {
17 | 
18 |     @Override
19 |     public void runSpider(Param param, ISpiderRule spiderRule) {
20 |         SpiderRuleHandler spiderRuleHandler = new SpiderRuleHandler();
21 |         spiderRuleHandler.handler(param, spiderRule);
22 |     }
23 | 
24 |     @Override
25 |     public void handlerGoods(Param param, String htmlSource) {
26 |         IPMessage ipMessage = (IPMessage) param.getExtParamMap().get("ipMessage");
27 |         String ipType = ipMessage.getType();
28 |         String ipSpeed = ipMessage.getSpeed();
29 |         ipSpeed = ipSpeed.substring(0, ipSpeed.indexOf('秒'));
30 |         double speed = Double.parseDouble(ipSpeed);
31 |         if (ipType.equals("HTTPS") && speed <= 2.0) {
32 |             RedisUtil.setOneIp(ipMessage);
33 |         }
34 | 
35 |     }
36 | }
37 | 
38 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/spider/docrawler/ProxyXcDoMain.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.proxyip.spider.docrawler;
 2 | 
 3 | import com.xyzj.crawler.framework.entity.Param;
 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 6 | import com.xyzj.crawler.framework.runnable.SpiderRunnable;
 7 | import com.xyzj.crawler.utils.proxyip.spider.doRule.ProxyXcSpiderRule;
 8 | import lombok.extern.slf4j.Slf4j;
 9 | 
10 | import java.util.concurrent.ExecutorService;
11 | 
import java.util.concurrent.Executors; 12 | 13 | @Slf4j 14 | public class ProxyXcDoMain { 15 | 16 | 17 | public static void main(String[] args) throws Exception { 18 | log.info("开始采集有效代理"); 19 | String baseUrl = "https://www.xicidaili.com/nn/"; 20 | ExecutorService executorService = Executors.newFixedThreadPool(3); 21 | for (int i = 0; i <5; i++) { 22 | Param param = new Param(); 23 | param.setWebUrl(baseUrl+i); 24 | param.getExtParamMap().put("targetUrl", "https://www.baidu.com"); 25 | ISpiderRule spiderRule = new SpiderRuleFactory(new ProxyXcSpiderRule()).getInstance(); 26 | //spiderRule 参数 27 | SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, param); 28 | executorService.execute(spiderRunnable); 29 | } 30 | //等到任务执行完毕,关闭线程池。 31 | executorService.shutdown(); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/framework/entity/Param.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.framework.entity; 2 | 3 | import avro.shaded.com.google.common.collect.Maps; 4 | import com.xyzj.crawler.framework.enums.FactionEnum; 5 | import java.util.Map; 6 | import java.util.concurrent.CountDownLatch; 7 | import lombok.Data; 8 | 9 | /** 10 | * ================================================== 11 | *

12 | * FileName: Param 13 | * 14 | * @description: 15 | * @author: lyy 16 | * @create: 2019/6/28 17 | * @since: 1.0.0 18 | *

19 | * ================================================== 20 | */ 21 | @Data 22 | public class Param { 23 | 24 | /** 请求地址 */ 25 | private String webUrl; 26 | 27 | /** 网页编码 */ 28 | private String charset = "utf-8"; 29 | 30 | /** 请求头信息 模拟登陆*/ 31 | private Map headerInfos = Maps.newHashMap(); 32 | 33 | /** 请求体信息 post json参数*/ 34 | private Map bodyParams = Maps.newHashMap(); 35 | 36 | /** 指定源码获取方法 */ 37 | private FactionEnum factionEnum= FactionEnum.getHtml; 38 | 39 | /** 页面加载延迟时间单位 毫秒 */ 40 | private Integer delayTime; 41 | 42 | /** 计数器锁 */ 43 | private CountDownLatch countDownLatch; 44 | 45 | 46 | /** isProxy */ 47 | private Boolean isProxy = false; 48 | 49 | /** 代理ip */ 50 | private String proxyIp; 51 | 52 | /** 代理port */ 53 | private String proxyPort; 54 | 55 | /** 文件保存路径 */ 56 | private String fileFullName; 57 | 58 | /** 59 | * 其余定制参数 60 | */ 61 | private Map extParamMap = Maps.newHashMap(); 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/gethtmlstring/UrlUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.gethtmlstring; 2 | 3 | import java.io.UnsupportedEncodingException; 4 | /** 5 | * url转码、解码 6 | * 7 | */ 8 | public class UrlUtil { 9 | private final static String ENCODE = "utf-8"; 10 | /** 11 | * URL 解码 12 | * 13 | * @return String 14 | * @author lifq 15 | * @date 2015-3-17 下午04:09:51 16 | */ 17 | private static String getURLDecoderString(String str) { 18 | String result = ""; 19 | if (null == str) { 20 | return ""; 21 | } 22 | try { 23 | result = java.net.URLDecoder.decode(str, ENCODE); 24 | } catch (UnsupportedEncodingException e) { 25 | e.printStackTrace(); 26 | } 27 | return result; 28 | } 29 | /** 30 | * URL 转码 31 | * 32 | * @return String 33 | * @author lifq 34 | * @date 2015-3-17 下午04:10:28 35 | */ 36 | public static String getURLEncoderString(String str) { 37 | String result = ""; 38 | if (null == str) { 39 | return ""; 40 | } 41 | try { 42 | result = java.net.URLEncoder.encode(str, ENCODE); 43 | } catch (UnsupportedEncodingException e) { 44 | e.printStackTrace(); 45 | } 46 | return result; 47 | } 48 | 49 | /** 50 | * 51 | * @author lifq 52 | * @date 2015-3-17 下午04:09:16 53 | */ 54 | public static void main(String[] args) { 55 | String str = "测试1"; 56 | System.out.println(getURLEncoderString(str)); 57 | System.out.println(getURLDecoderString(str)); 58 | 59 | } 60 | 61 | } -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/gethtmlstring/SpiderUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.gethtmlstring; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.io.ByteArrayOutputStream; 7 | import java.io.InputStream; 8 | import java.net.HttpURLConnection; 9 | import java.net.URL; 10 | 11 | /** 12 | * 爬虫工具类 13 | * 14 | */ 15 | public final class SpiderUtil { 16 | private static final Logger THREAD_LOG = LoggerFactory.getLogger(SpiderUtil.class); 17 | 18 | private static final byte[] getImageFromNetByUrl(final String strUrl) { 19 | try { 20 | URL url = new URL("http:" + strUrl); 21 | if(strUrl.startsWith("http:")) { 22 | url = new URL(strUrl); 23 | } 24 | HttpURLConnection conn = (HttpURLConnection) url.openConnection(); 25 | conn.setRequestMethod("GET"); 26 | conn.setConnectTimeout(5 * 1000); 27 | InputStream inStream = conn.getInputStream();// 
通过输入流获取图片数据
28 |             byte[] btImg = readInputStream(inStream);// 得到图片的二进制数据
29 |             return btImg;
30 |         } catch (Exception e) {
31 |             THREAD_LOG.info("爬虫的getImageFromNetByUrl方法报错:{}", e.getMessage());
32 |         }
33 |         return null;
34 |     }
35 | 
36 |     public static final byte[] readInputStream(InputStream inStream) throws Exception {
37 |         ByteArrayOutputStream outStream = new ByteArrayOutputStream();
38 |         byte[] buffer = new byte[1024];
39 |         int len = 0;
40 |         while ((len = inStream.read(buffer)) != -1) {
41 |             outStream.write(buffer, 0, len);
42 |         }
43 |         inStream.close();
44 |         return outStream.toByteArray();
45 |     }
46 | 
47 | }
48 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/framework/defaults/DefaultSpiderRule.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.framework.defaults;
 2 | 
 3 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract;
 4 | import com.xyzj.crawler.framework.entity.Goods;
 5 | import com.xyzj.crawler.framework.entity.Param;
 6 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler;
 7 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 8 | import com.xyzj.crawler.utils.parsehtmlstring.RegexUtil;
 9 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql;
10 | import java.util.List;
11 | import lombok.extern.slf4j.Slf4j;
12 | import org.springframework.util.CollectionUtils;
13 | 
14 | /**
15 |  * @author lyy
16 |  * @since 2018-10-27 13:08
17 |  */
18 | @Slf4j
19 | public class DefaultSpiderRule extends SpiderRuleAbstract {
20 | 
21 |     public void runSpider(Param param, ISpiderRule spiderRule) {
22 |         SpiderRuleHandler spiderRuleHandler = new SpiderRuleHandler();
23 |         spiderRuleHandler.handler(param, spiderRule);
24 |     }
25 | 
26 |     public void handlerGoods(Param param, String htmlSource) {
27 |         String regexPattern = "([\\s\\S]*)";
28 |         List<String> stringList = RegexUtil.getSubUtil(htmlSource, regexPattern);
29 |         if (CollectionUtils.isEmpty(stringList)) {
30 |             log.info("没有匹配需要的内容......");
31 |             return;
32 |         }
33 |         Goods saveGoods = new Goods();
34 |         saveGoods.setWebUrl(param.getWebUrl());
35 |         SaveToMysql saveToMysql = new SaveToMysql();
36 |         for (int i = 0; i < stringList.size(); i++) {
37 |             saveGoods.setType(Integer.toString(i + 1));
38 |             saveGoods.setName(stringList.get(i));
39 |             saveToMysql.saveToMasql("goods", saveGoods);
40 |         }
41 |     }
42 | }
43 | 
44 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/config/RedisUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.proxyip.config;
 2 | 
 3 | 
 4 | import com.xyzj.crawler.utils.proxyip.IPModel.IPMessage;
 5 | import com.xyzj.crawler.utils.proxyip.IPModel.SerializeUtil;
 6 | import lombok.extern.slf4j.Slf4j;
 7 | import redis.clients.jedis.Jedis;
 8 | 
 9 | 
10 | @Slf4j
11 | public class RedisUtil {
12 |     public static Jedis jedis = RedisConfig.getJedis();
13 | 
14 |     /**
15 |      * ========================================
16 |      *
17 |      * @description: 保存ip信息到redis队列
18 |      * @author: lyy
19 |      * @param:
20 |      * @return:
21 |      * @exception:
22 |      * @create: 2019/7/4 10:10
23 |      *

24 | * ======================================== 25 | */ 26 | public static void setOneIp(IPMessage ipMessage) { 27 | //首先将ipMessage进行序列化 28 | byte[] bytes = SerializeUtil.serialize(ipMessage); 29 | jedis.rpush("IpPool".getBytes(), bytes); 30 | } 31 | 32 | /** 33 | * ======================================== 34 | * 35 | * @description: 从队列中取出ip信息 36 | * @author: lyy 37 | * @param: 38 | * @return: 39 | * @exception: 40 | * @create: 2019/7/4 10:11 41 | *

42 |      * ========================================
43 |      */
44 |     public static IPMessage getOneIp() {
45 |         byte[] bytes = jedis.lpop("IpPool".getBytes());
46 |         if (bytes != null) {
47 |             Object o = SerializeUtil.unSerialize(bytes);
48 |             if (o instanceof IPMessage) {
49 |                 return (IPMessage) o;
50 |             }
51 |         }
52 |         return null;
53 |     }
54 | 
55 |     public static void deleteKey(String key) {
56 |         jedis.del(key);
57 |     }
58 | 
59 |     public static void close() {
60 |         RedisConfig.close(jedis);
61 |     }
62 | }
63 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/packageutil/PackageGetClassUtil.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.utils.packageutil;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.FileOutputStream;
 6 | import java.io.IOException;
 7 | 
 8 | /**
 9 |  * @author lyy
10 |  * @since 2019-09-18 13:01
11 |  * Replaces the .java sources under SRC_PATH with the matching .class files
12 |  * taken from the compiler output directory TARGET_PATH
13 |  */
14 | public class PackageGetClassUtil {
15 |     private static final String SRC_PATH = "C:\\Users\\quling\\Desktop\\java";
16 |     private static final String TARGET_PATH = "D:\\IdeaProjects\\demo2\\out\\production\\classes";
17 | 
18 |     public static void main(String[] args) throws Exception {
19 |         try{
20 |             fileReplace(new File(SRC_PATH),SRC_PATH,TARGET_PATH);
21 |         }catch(Exception e){
22 |             e.printStackTrace();
23 |         }
24 |         System.out.println("类提取完成");
25 | 
26 |     }
27 | 
28 |     private static void fileReplace(File base, String sourcePath, String targetPath) throws IOException {
29 |         if (!base.exists() || base.getName().contains(".txt")){
30 |             return;
31 |         }
32 |         if (base.isDirectory()) {
33 |             File[] files = base.listFiles();
34 |             for (File file : files) {
35 |                 fileReplace(file, sourcePath, targetPath);
36 |             }
37 |         } else {
38 |             String path = base.getPath();
39 |             String tempPath = path = path.replace(".java", ".class");
40 |             System.out.println(base.getName());
41 |             base.delete();
42 |             path = targetPath + path.substring(sourcePath.length());
43 |             FileInputStream in = new FileInputStream(new File(path));
44 |             FileOutputStream out = new FileOutputStream(new File(tempPath));
45 |             int i;
46 |             while ((i = in.read()) != -1) {
47 |                 out.write(i);
48 |             }
49 |             in.close();
50 |             out.close();
51 |         }
52 |     }
53 | }
54 | 
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/bigdata/in/DemoDataListener.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.bigdata.in;
 2 | 
 3 | import com.alibaba.excel.context.AnalysisContext;
 4 | import com.alibaba.excel.event.AnalysisEventListener;
 5 | import com.xyzj.crawler.utils.savetomysql.SaveToOracle;
 6 | import java.util.ArrayList;
 7 | import java.util.List;
 8 | import org.slf4j.Logger;
 9 | import org.slf4j.LoggerFactory;
10 | 
11 | /**
12 |  * Listener used while reading the template
13 |  *
14 |  * @author Jiaju Zhuang
15 |  */
16 | public class DemoDataListener extends AnalysisEventListener<DemoData> {
17 |     private static final Logger LOGGER = LoggerFactory.getLogger(DemoDataListener.class);
18 |     /**
19 |      * Flush to the database every 5000 rows, then clear the list so it can be GC'd
20 |      */
21 |     private static final int BATCH_COUNT = 5000;
22 |     List<DemoData> list = new ArrayList<DemoData>();
23 | 
24 |     @Override
25 |     public void invoke(DemoData data, AnalysisContext context) {
26 |         list.add(data);
27 |         if (list.size() >= BATCH_COUNT) {
28 |             saveData();
29 |             list.clear();
30 |         }
31 |     }
32 | 
33 |     @Override
34 |     public void doAfterAllAnalysed(AnalysisContext context) {
35 |         saveData();
36 
| LOGGER.info("所有数据解析完成!"); 37 | } 38 | 39 | /** 40 | * 加上存储数据库 41 | */ 42 | private void saveData() { 43 | LOGGER.info("{}条数据,开始存储数据库!", list.size()); 44 | String sql = "INSERT INTO my_test " + 45 | "(name, birthday, age) VALUES (?, ?, ?)"; 46 | List param = new ArrayList<>(); 47 | 48 | for(int i=0;i headerInfos) { 18 | HttpURLConnection connection = null; 19 | try { 20 | // 创建远程url连接对象 21 | URL url = new URL(httpUrl); 22 | // 通过远程url连接对象打开一个连接,强转成httpURLConnection类 23 | connection = (HttpURLConnection) url.openConnection(); 24 | // 设置连接主机服务器的超时时间:15000毫秒 25 | int connectTimeout = 15000; 26 | // 设置读取远程返回的数据时间:60000毫秒 27 | int readTimeout = 60000; 28 | // 设置连接方式:get 29 | connection.setRequestMethod("GET"); 30 | connection.setConnectTimeout(connectTimeout); 31 | connection.setReadTimeout(readTimeout); 32 | // 遍历map 设置请求头信息 33 | if (!CollectionUtils.isEmpty(headerInfos)) { 34 | for (String key : headerInfos.keySet()) { 35 | connection.setRequestProperty(key, headerInfos.get(key)); 36 | } 37 | } 38 | // 发送请求 39 | connection.connect(); 40 | if (connection.getResponseCode() == 200) { 41 | baseHttpCallBack.httpCallBack(connection.getResponseCode(), connection.getInputStream()); 42 | } 43 | } catch (MalformedURLException e) { 44 | e.printStackTrace(); 45 | baseHttpCallBack.httpCallBack(-1, null); 46 | } catch (IOException e) { 47 | e.printStackTrace(); 48 | baseHttpCallBack.httpCallBack(-2, null); 49 | } finally { 50 | // 关闭远程连接 51 | if (connection != null) { 52 | connection.disconnect(); 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/spidertask/example/docrawler/DoCrawler58.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.spidertask.example.docrawler; 2 | 3 | import com.xyzj.crawler.framework.entity.Param; 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory; 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 6 | import com.xyzj.crawler.framework.runnable.SpiderRunnable; 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import java.util.concurrent.CountDownLatch; 10 | import java.util.concurrent.ExecutorService; 11 | import java.util.concurrent.Executors; 12 | import lombok.extern.slf4j.Slf4j; 13 | 14 | /** 15 | * 线程池抓取 16 | * 单个线程抓取某一个页面 17 | * 翻页的情况,开启线程池 18 | * 19 | * 58页面。 20 | * 21 | * @author lyy 22 | * @since 2018-10-27 18:14 23 | */ 24 | @Slf4j 25 | public class DoCrawler58 { 26 | 27 | private static final int THREAD_COUNT = 10; 28 | 29 | public static void main(String[] args) { 30 | //总记录数 31 | Integer totalCount =3199; 32 | //每页数 33 | Integer pageSize = 30; 34 | //目标数量 107 35 | Integer pageCount = totalCount / pageSize + 1; 36 | //开启一个线程池 37 | ExecutorService executorService = Executors.newFixedThreadPool(THREAD_COUNT); 38 | //计数器锁 39 | CountDownLatch countDownLatch = new CountDownLatch(pageCount); 40 | for(int i=1;i<=pageCount;i++) { 41 | Map params = new HashMap<>(); 42 | //目标url 43 | String webUrl = "https://cq.58.com/shouji/pn"+i+"/?PGTID=0d300024-0002-5274-9167-f56e706b72b9&ClickID=1"; 44 | Param param = new Param(); 45 | param.setWebUrl(webUrl); 46 | param.setCountDownLatch(countDownLatch); 47 | 48 | //spiderRule 规则 49 | ISpiderRule spiderRule = new SpiderRuleFactory().getInstance(); 50 | //spiderRule 参数 51 | SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule,param); 52 | executorService.execute(spiderRunnable); 53 | } 54 | //等到任务执行完毕,关闭线程池。 55 | executorService.shutdown(); 
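// NOTE: shutdown() only stops the pool from accepting new tasks; it does not wait.
// The latch await() below is what actually blocks until every page task has
// called countDown() (see the finally block in SpiderRuleHandler).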
56 | try { 57 | countDownLatch.await(); 58 | } catch (InterruptedException e) { 59 | log.error("出毛病了{}",e); 60 | } 61 | log.info("main --爬完了"); 62 | } 63 | 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/ParseTsUrls.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import com.xyzj.crawler.utils.gethtmlstring.BaseHttpCallBack; 4 | import com.xyzj.crawler.utils.gethtmlstring.M3u8HttpClientUtil; 5 | import java.io.BufferedReader; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.InputStreamReader; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import java.util.Map; 12 | import lombok.extern.slf4j.Slf4j; 13 | import org.apache.commons.collections.CollectionUtils; 14 | 15 | /** 16 | * 解析ts路径 17 | * @author liuyangyang 18 | * */ 19 | @Slf4j 20 | public class ParseTsUrls implements BaseHttpCallBack { 21 | 22 | private String httpUrl; 23 | 24 | private String fileName; 25 | 26 | private Map headerInfos; 27 | 28 | private List tsUrlList = new ArrayList(); 29 | 30 | public ParseTsUrls(String httpUrl, Map headerInfos,String fileName){ 31 | this.httpUrl = httpUrl; 32 | this.headerInfos = headerInfos; 33 | this.fileName = fileName; 34 | } 35 | 36 | public void httpRequestForTsUrls(){ 37 | log.info("正在发送请求:httpUrl={}",httpUrl); 38 | M3u8HttpClientUtil.doGet(httpUrl,this,headerInfos); 39 | if (CollectionUtils.isNotEmpty(tsUrlList)) { 40 | new DownloadTsFile(tsUrlList, headerInfos, fileName).download(); 41 | } else { 42 | log.info("没有拿到ts路径,请检查..."); 43 | } 44 | } 45 | 46 | @Override 47 | public void httpCallBack(int responseCode,InputStream inputStream) { 48 | log.info("开始解析TS路径....."); 49 | if(responseCode == 200){ 50 | try { 51 | // 封装输入流is,并指定字符集 52 | BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")); 53 | String lineStr = null; 54 | while ((lineStr = br.readLine()) != null) { 55 | if (lineStr.contains("http") && lineStr.contains(".ts")) { 56 | tsUrlList.add(lineStr); 57 | } 58 | } 59 | } catch (IOException e) { 60 | log.error("解析ts出错了 e={}",e); 61 | } 62 | 63 | log.info("解析TS路径完成....."); 64 | } 65 | } 66 | 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/proxyip/spider/doRule/ProxyXcSpiderRule.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.proxyip.spider.doRule; 2 | 3 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract; 4 | import com.xyzj.crawler.framework.entity.Param; 5 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory; 6 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler; 7 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 8 | import com.xyzj.crawler.utils.proxyip.IPModel.IPMessage; 9 | import lombok.extern.slf4j.Slf4j; 10 | import org.jsoup.Jsoup; 11 | import org.jsoup.nodes.Document; 12 | import org.jsoup.select.Elements; 13 | 14 | /** 15 | * @author lyy 16 | * @since 2018-10-27 13:08 17 | */ 18 | @Slf4j 19 | public class ProxyXcSpiderRule extends SpiderRuleAbstract { 20 | @Override 21 | public void runSpider(Param param, ISpiderRule spiderRule) { 22 | SpiderRuleHandler spiderRuleHandler = new SpiderRuleHandler(); 23 | spiderRuleHandler.handler(param, spiderRule); 24 | } 25 | 26 | @Override 27 | public void 
handlerGoods(Param param, String htmlSource) { 28 | Document document = Jsoup.parse(htmlSource); 29 | Elements trs = document.select("table[id=ip_list]").select("tbody").select("tr"); 30 | for (int i = 1; i < trs.size(); i++) { 31 | String newIp = trs.get(i).select("td").get(1).text(); 32 | String newPort = trs.get(i).select("td").get(2).text(); 33 | String newType = trs.get(i).select("td").get(5).text(); 34 | String newSpeed = trs.get(i).select("td").get(6).select("div[class=bar]").attr("title"); 35 | //取得单个ip 36 | IPMessage ipMessage = new IPMessage(); 37 | ipMessage.setIp(newIp); 38 | ipMessage.setPort(newPort); 39 | ipMessage.setType(newType); 40 | ipMessage.setSpeed(newSpeed); 41 | 42 | ISpiderRule spiderRule = new SpiderRuleFactory(new ProxyFilterSpiderRule()).getInstance(); 43 | Param newParam = new Param(); 44 | newParam.setProxyIp(newIp); 45 | newParam.setProxyPort(newPort); 46 | newParam.setWebUrl(String.valueOf(param.getExtParamMap().get("targetUrl"))); 47 | newParam.getExtParamMap().put("ipMessage",ipMessage); 48 | spiderRule.runSpider(newParam,spiderRule); 49 | } 50 | } 51 | } 52 | 53 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/DownloadTsFile.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import com.xyzj.crawler.utils.gethtmlstring.BaseHttpCallBack; 4 | import com.xyzj.crawler.utils.gethtmlstring.M3u8HttpClientUtil; 5 | import java.io.File; 6 | import java.io.FileNotFoundException; 7 | import java.io.FileOutputStream; 8 | import java.io.IOException; 9 | import java.io.InputStream; 10 | import java.util.List; 11 | import java.util.Map; 12 | import lombok.extern.slf4j.Slf4j; 13 | import org.springframework.util.StringUtils; 14 | 15 | /** 16 | * 17 | * @author liuyangyang 18 | * */ 19 | @Slf4j 20 | public class DownloadTsFile implements BaseHttpCallBack { 21 | private List tsUrlList; 22 | 23 | private Map headerInfos; 24 | 25 | private String fileName; 26 | 27 | private FileOutputStream fileOutputStream=null; 28 | 29 | public DownloadTsFile(List tsUrlList, Map headerInfos, String fileName){ 30 | this.tsUrlList=tsUrlList; 31 | this.headerInfos = headerInfos; 32 | this.fileName = fileName; 33 | } 34 | 35 | 36 | public void download(){ 37 | log.info("开始生成文件,请等待......"); 38 | if(!StringUtils.isEmpty(tsUrlList)){ 39 | try { 40 | fileOutputStream = new FileOutputStream(new File(fileName)); 41 | } catch (FileNotFoundException e) { 42 | log.info("输出流创建异常 e={}",e); 43 | } 44 | for (String url:tsUrlList) { 45 | M3u8HttpClientUtil.doGet(url,this,headerInfos); 46 | } 47 | if(fileOutputStream!=null){ 48 | try { 49 | fileOutputStream.close(); 50 | } catch (IOException e) { 51 | log.info("输出流关闭异常 e={}",e); 52 | } 53 | } 54 | } 55 | } 56 | 57 | @Override 58 | public void httpCallBack(int responseCode, InputStream inputStream) { 59 | if(responseCode == 200){ 60 | byte[] tempBytes = new byte[100]; 61 | int byteRead = 0; 62 | try { 63 | while ((byteRead = inputStream.read(tempBytes)) != -1) { 64 | fileOutputStream.write(tempBytes, 0, byteRead); 65 | } 66 | } catch (IOException e) { 67 | log.info("文件合并异常 e={}",e); 68 | } 69 | } 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/RegexUtil.java: -------------------------------------------------------------------------------- 1 | package 
com.xyzj.crawler.utils.parsehtmlstring;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | import java.util.regex.Matcher;
 6 | import java.util.regex.Pattern;
 7 | 
 8 | /**
 9 |  * Regex helpers for matching the content between two strings
10 |  * @author Administrator
11 |  *
12 |  */
13 | public class RegexUtil {
14 | 
15 | 	public static void main(String[] args) {
16 | 		String str = "w764e:\1.xml1单据w764开始处理...单据w764处理完毕!2017.09-记账凭证-1w1007e:\1.xml1单据w1007开始处理...单据w1007处理完毕!2017.10-记账凭证-1w516e:\1.xml1单据w516开始处理...单据w516处理完毕!2017.07-记账凭证-50";
17 | 		str = "[{ \\\"CretType\":\"name\"}]";
18 | 		System.out.println(str);
19 | 		String rgex = "CretType\":\"(.*?)\"";
20 | 
21 | 		System.out.println((RegexUtil.getSubUtil(str,rgex)));
22 | 		List<String> lists = RegexUtil.getSubUtil(str,rgex);
23 | 		for (String string : lists) {
24 | 			System.out.println(string);
25 | 		}
26 | 		System.out.println(RegexUtil.getSubUtilSimple(str, rgex));
27 | 	}
28 | 
29 | 	/**
30 | 	 * Returns every match of the pattern's first capture group
31 | 	 * @param soap
32 | 	 * @return
33 | 	 */
34 | 	public static List<String> getSubUtil(String soap,String rgex){
35 | 		List<String> list = new ArrayList<String>();
36 | 		Pattern pattern = Pattern.compile(rgex);// the pattern to match
37 | 		Matcher m = pattern.matcher(soap);
38 | 		while (m.find()) {
39 | 			list.add(m.group(1));
40 | 		}
41 | 		return list;
42 | 	}
43 | 
44 | 	/**
45 | 	 * Returns a single string: the first match if there are several; otherwise behaves like getSubUtil
46 | 	 * @param soap
47 | 	 * @param rgex
48 | 	 * @return
49 | 	 */
50 | 	public static String getSubUtilSimple(String soap,String rgex){
51 | 		Pattern pattern = Pattern.compile(rgex);// the pattern to match
52 | 		Matcher m = pattern.matcher(soap);
53 | 		while(m.find()){
54 | 			return m.group(1);
55 | 		}
56 | 		return "";
57 | 	}
58 | }
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/spidertask/zlr/docrawler/SsqDoErrorMain.java:
--------------------------------------------------------------------------------
 1 | package com.xyzj.crawler.spidertask.zlr.docrawler;
 2 | 
 3 | import com.xyzj.crawler.framework.entity.Param;
 4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
 5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
 6 | import com.xyzj.crawler.framework.runnable.SpiderRunnable;
 7 | import com.xyzj.crawler.spidertask.zlr.dorule.SsqDetailSpiderRule;
 8 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql;
 9 | 
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.concurrent.CountDownLatch;
13 | import java.util.concurrent.ExecutorService;
14 | import java.util.concurrent.Executors;
15 | import lombok.extern.slf4j.Slf4j;
16 | import org.springframework.util.CollectionUtils;
17 | import org.springframework.util.StringUtils;
18 | 
19 | @Slf4j
20 | public class SsqDoErrorMain {
21 |     public static void main(String[] args) throws Exception {
22 |         log.info("尝试重新拉取数据");
23 |         reTry();
24 |     }
25 | 
26 |     public static void reTry() {
27 |         SaveToMysql query = new SaveToMysql();
28 |         List<Map<String, Object>> mapList = query.queryBySql("select * from ungoods");
29 |         if (CollectionUtils.isEmpty(mapList)) {
30 |             log.info("无失败记录......");
31 |             return;
32 |         }
33 |         // clear the failure table before retrying
34 |         query.executeBySql("delete from ungoods");
35 |         //latch counting one task per failed record
36 |         CountDownLatch countDownLatch = new CountDownLatch(mapList.size());
37 |         ExecutorService executorService = Executors.newFixedThreadPool(3);
38 |         for (Map<String, Object> map : mapList) {
39 |             String url = String.valueOf(map.get("webUrl"));
40 |             if (!StringUtils.isEmpty(url)) {
41 |                 // a record to retry
42 |                 log.info("url={}", url);
43 |                 Param param = new Param();
44 |                 param.setWebUrl(url);
45 |                 param.setCountDownLatch(countDownLatch);
46 | 
47 |                 //the detail-page rule
48 |                 SsqDetailSpiderRule ssqDetailSpiderRule = new SsqDetailSpiderRule();
49 |                 ISpiderRule spiderRule = new SpiderRuleFactory(ssqDetailSpiderRule).getInstance();
50 | 
51 |                 //task = rule + params
52 |                 SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, param);
53 |                 executorService.execute(spiderRunnable);
54 |             }
55 |         }
56 |         //stop accepting new tasks, then wait on the latch
57 |         executorService.shutdown();
58 |         try {
59 |             countDownLatch.await();
60 |         } catch (InterruptedException e) {
61 |             log.info("出毛病......");
62 |         }
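// NOTE: this recursion only ends once ungoods stays empty; if some URLs keep
// failing, reTry() will recurse (and re-crawl) forever. A bounded retry count
// would be a safer loop.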
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/spidertask/zlr/docrawler/SsqDoErrorMain.java:
--------------------------------------------------------------------------------
1 | package com.xyzj.crawler.spidertask.zlr.docrawler;
2 | 
3 | import com.xyzj.crawler.framework.entity.Param;
4 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
5 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
6 | import com.xyzj.crawler.framework.runnable.SpiderRunnable;
7 | import com.xyzj.crawler.spidertask.zlr.dorule.SsqDetailSpiderRule;
8 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql;
9 | 
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.concurrent.CountDownLatch;
13 | import java.util.concurrent.ExecutorService;
14 | import java.util.concurrent.Executors;
15 | import lombok.extern.slf4j.Slf4j;
16 | import org.springframework.util.CollectionUtils;
17 | import org.springframework.util.StringUtils;
18 | 
19 | @Slf4j
20 | public class SsqDoErrorMain {
21 |     public static void main(String[] args) throws Exception {
22 |         log.info("Re-fetching previously failed records");
23 |         reTry();
24 |     }
25 | 
26 |     public static void reTry() {
27 |         SaveToMysql query = new SaveToMysql();
28 |         List<Map<String, Object>> mapList = query.queryBySql("select * from ungoods");
29 |         if (CollectionUtils.isEmpty(mapList)) {
30 |             log.info("no failed records left......");
31 |             return;
32 |         }
33 |         // clear the retry queue; anything that fails again is re-inserted by the crawl
34 |         query.executeBySql("delete from ungoods");
35 |         // countdown latch, one count per record to retry
36 |         CountDownLatch countDownLatch = new CountDownLatch(mapList.size());
37 |         ExecutorService executorService = Executors.newFixedThreadPool(3);
38 |         for (Map<String, Object> map : mapList) {
39 |             String url = String.valueOf(map.get("webUrl"));
40 |             if (!StringUtils.isEmpty(url)) {
41 |                 // found a record worth retrying
42 |                 log.info("url={}", url);
43 |                 Param param = new Param();
44 |                 param.setWebUrl(url);
45 |                 param.setCountDownLatch(countDownLatch);
46 | 
47 |                 // crawl rule
48 |                 SsqDetailSpiderRule ssqDetailSpiderRule = new SsqDetailSpiderRule();
49 |                 ISpiderRule spiderRule = new SpiderRuleFactory(ssqDetailSpiderRule).getInstance();
50 | 
51 |                 // wrap the rule and its parameters into a runnable and submit it
52 |                 SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, param);
53 |                 executorService.execute(spiderRunnable);
54 |             }
55 |         }
56 |         // stop accepting new tasks, then wait for the submitted ones to finish
57 |         executorService.shutdown();
58 |         try {
59 |             countDownLatch.await();
60 |         } catch (InterruptedException e) {
61 |             Thread.currentThread().interrupt();
62 |             log.error("interrupted while waiting for the retry batch", e);
63 |         }
64 |         // loop until the ungoods table stays empty
65 |         reTry();
66 |     }
67 | 
68 | }
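One detail worth calling out in reTry(): executorService.shutdown() only stops new submissions and returns immediately; it is countDownLatch.await() that blocks until every worker finishes, which assumes each SpiderRunnable counts down the latch it was handed, even on failure. A standalone sketch of the same pattern (runBatch and its tasks are illustrative, not part of the repo):

static void runBatch(List<Runnable> tasks) throws InterruptedException {
    ExecutorService pool = Executors.newFixedThreadPool(3);
    CountDownLatch done = new CountDownLatch(tasks.size());
    for (Runnable task : tasks) {
        pool.execute(() -> {
            try {
                task.run();
            } finally {
                done.countDown(); // always count down, or await() hangs forever
            }
        });
    }
    pool.shutdown(); // no new tasks; does not block
    done.await();    // the actual wait for the whole batch
}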
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/utils/proxyip/config/RedisConfig.java:
--------------------------------------------------------------------------------
1 | package com.xyzj.crawler.utils.proxyip.config;
2 | 
3 | import java.util.ResourceBundle;
4 | import lombok.extern.slf4j.Slf4j;
5 | import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
6 | import redis.clients.jedis.Jedis;
7 | import redis.clients.jedis.JedisPool;
8 | 
9 | @Slf4j
10 | public class RedisConfig {
11 | 
12 |     private static GenericObjectPoolConfig config = null;
13 |     private static String addr;
14 |     private static int port;
15 | 
16 |     private static JedisPool jedisPool;
17 | 
18 |     // load configuration (conf.properties on the classpath)
19 |     private static ResourceBundle resourceBundle = ResourceBundle.getBundle("conf");
20 | 
21 |     // initialise the pool configuration
22 |     static {
23 |         addr = resourceBundle.getString("jedis.addr");
24 |         port = Integer.parseInt(resourceBundle.getString("jedis.port"));
25 |         try {
26 |             // parameters for the jedis connection pool
27 |             config = new GenericObjectPoolConfig();
28 |             // block when the pool is exhausted (default true); the borrow then throws once the wait times out
29 |             config.setBlockWhenExhausted(true);
30 |             // eviction policy class applied when connections exceed the idle limits
31 |             config.setEvictionPolicyClassName("org.apache.commons.pool2.impl.DefaultEvictionPolicy");
32 |             // enable the pool's JMX management (default true)
33 |             config.setJmxEnabled(true);
34 |             // maximum idle Jedis instances kept in the pool (default 8)
35 |             config.setMaxIdle(8);
36 |             // maximum total connections
37 |             config.setMaxTotal(100);
38 |             // maximum wait when borrowing an instance; throws once exceeded
39 |             config.setMaxWaitMillis(1000 * 10);
40 |             // validate a connection (ping) every time one is borrowed
41 |             config.setTestOnBorrow(true);
42 |         } catch (Exception e) {
43 |             log.error("failed to initialise the jedis pool config", e);
44 |         }
45 |     }
46 | 
47 |     /**
48 |      * ========================================
49 |      *
50 |      * @description: obtain a Jedis instance (creates the pool lazily)
51 |      * @author: lyy
52 |      * @create: 2019/7/4 10:06
53 |      *
54 |      * ========================================
55 |      */
56 |     public synchronized static Jedis getJedis() {
57 |         if (jedisPool == null) {
58 |             jedisPool = new JedisPool(config, addr, port);
59 |         }
60 |         return jedisPool.getResource();
61 |     }
62 | 
63 |     /**
64 |      * ========================================
65 |      *
66 |      * @description: release a Jedis connection back to the pool
67 |      * @author: lyy
68 |      * @create: 2019/7/4 10:06
69 |      *
70 |      * ========================================
71 |      */
72 |     public static void close(final Jedis jedis) {
73 |         if (jedis != null) {
74 |             jedis.close();
75 |         }
76 |     }
77 | }
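A borrow/return sketch for RedisConfig (the key and value are illustrative):

Jedis jedis = null;
try {
    jedis = RedisConfig.getJedis();   // lazily creates the pool on first use
    jedis.set("proxy:last-check", "ok");
} finally {
    RedisConfig.close(jedis);         // hands the connection back to the pool
}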
--------------------------------------------------------------------------------
/src/main/java/com/xyzj/crawler/spidertask/zlr/docrawler/SsqDoMain.java:
--------------------------------------------------------------------------------
1 | package com.xyzj.crawler.spidertask.zlr.docrawler;
2 | 
3 | import avro.shaded.com.google.common.collect.Lists;
4 | import com.xyzj.crawler.framework.entity.Goods;
5 | import com.xyzj.crawler.framework.entity.Param;
6 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory;
7 | import com.xyzj.crawler.framework.interfaces.ISpiderRule;
8 | import com.xyzj.crawler.framework.runnable.SpiderRunnable;
9 | import com.xyzj.crawler.spidertask.zlr.dorule.SsqDetailSpiderRule;
10 | import com.xyzj.crawler.utils.gethtmlstring.HttpResponseUtil;
11 | import com.xyzj.crawler.utils.parsehtmlstring.RegexUtil;
12 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql;
13 | import java.util.ArrayList;
14 | import java.util.List;
15 | import java.util.concurrent.ExecutorService;
16 | import java.util.concurrent.Executors;
17 | import lombok.extern.slf4j.Slf4j;
18 | 
19 | @Slf4j
20 | public class SsqDoMain {
21 | 
22 |     public static void main(String[] args) throws Exception {
23 |         log.info("Start crawling the region data");
24 |         // step 1: fetch the page source
25 |         Param param = new Param();
26 |         String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
27 |         param.setWebUrl(baseUrl);
28 |         param.setCharset("gb2312");
29 |         String htmlSource = HttpResponseUtil.getHtmlSource(param);
30 |         if (htmlSource == null) return;
31 | 
32 |         // step 2: extract the region codes and names
33 |         // (the two link patterns below are a best-effort reconstruction; the original
34 |         //  angle-bracket literals were swallowed when this listing was rendered)
35 |         String numberRegexString = "<a href='(.*?).html'>";
36 |         String nameRegexString = ".html'>(.*?)</a>";
37 |         List<String> numUtil = RegexUtil.getSubUtil(htmlSource, numberRegexString);
38 |         List<String> nameUtil = RegexUtil.getSubUtil(htmlSource, nameRegexString);
39 | 
40 |         ArrayList<Goods> goodsList = Lists.newArrayList();
41 |         for (int i = 0; i < numUtil.size(); i++) {
42 |             Goods goods = new Goods();
43 |             goods.setName(nameUtil.get(i));
44 |             goods.setWebUrl(baseUrl + numUtil.get(i) + ".html");
45 |             goods.setOrderNum(numUtil.get(i));
46 |             // step 3: save to the database
47 |             SaveToMysql saveToMysql = new SaveToMysql();
48 |             saveToMysql.saveToMasql("goods", goods);
49 |             goodsList.add(goods);
50 |         }
51 | 
52 |         // step 4: crawl the detail records
53 |         ExecutorService executorService = Executors.newFixedThreadPool(3);
54 |         for (Goods goods : goodsList) {
55 |             Param detailParam = new Param();
56 |             detailParam.setWebUrl(goods.getWebUrl());
57 |             detailParam.setCharset("gb2312");
58 |             // crawl rule
59 |             SsqDetailSpiderRule ssqDetailSpiderRule = new SsqDetailSpiderRule();
60 |             ISpiderRule spiderRule = new SpiderRuleFactory(ssqDetailSpiderRule).getInstance();
61 |             // wrap the rule and its parameters into a runnable and submit it
62 |             SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, detailParam);
63 |             executorService.execute(spiderRunnable);
64 |         }
65 |         // stop accepting new tasks; the submitted ones keep running to completion
66 |         executorService.shutdown();
67 | 
68 |     }
69 | 
70 | }
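To make the two patterns above concrete: assuming the reconstructed regexes, a row such as the one below pairs code "11" with name "北京市", and the detail URL is rebuilt from the base URL (the sample markup is invented for illustration):

String row = "<tr class='provincetr'><td><a href='11.html'>北京市</a></td></tr>"; // made-up sample
String code = RegexUtil.getSubUtilSimple(row, "<a href='(.*?).html'>");  // -> "11"
String name = RegexUtil.getSubUtilSimple(row, ".html'>(.*?)</a>");       // -> "北京市"
String detailUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/" + code + ".html";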
"; 36 | List numUtil = RegexUtil.getSubUtil(htmlSource, numberRegexString); 37 | List nameUtil = RegexUtil.getSubUtil(htmlSource, nameRegexString); 38 | 39 | ArrayList goodsList = Lists.newArrayList(); 40 | for (int i = 0; i < numUtil.size(); i++) { 41 | Goods goods = new Goods(); 42 | goods.setName(nameUtil.get(i)); 43 | goods.setWebUrl(baseUrl + numUtil.get(i) + ".html"); 44 | goods.setOrderNum(numUtil.get(i)); 45 | //第三步 往数据库中存 46 | SaveToMysql saveToMysql = new SaveToMysql(); 47 | saveToMysql.saveToMasql("goods", goods); 48 | goodsList.add(goods); 49 | } 50 | 51 | //第四步 爬取明细记录 52 | ExecutorService executorService = Executors.newFixedThreadPool(3); 53 | for (Goods goods : goodsList) { 54 | Param detailParam = new Param(); 55 | detailParam.setWebUrl(goods.getWebUrl()); 56 | detailParam.setCharset("gb2312"); 57 | //抓取器 58 | SsqDetailSpiderRule ssqDetailSpiderRule = new SsqDetailSpiderRule(); 59 | ISpiderRule spiderRule = new SpiderRuleFactory(ssqDetailSpiderRule).getInstance(); 60 | //spiderRule 参数 61 | SpiderRunnable spiderRunnable = new SpiderRunnable(spiderRule, detailParam); 62 | executorService.execute(spiderRunnable); 63 | } 64 | //等到任务执行完毕,关闭线程池。 65 | executorService.shutdown(); 66 | 67 | 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/savetomysql/SaveToMysql.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.savetomysql; 2 | 3 | import java.lang.reflect.Field; 4 | import java.lang.reflect.Method; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.ResourceBundle; 8 | import lombok.extern.slf4j.Slf4j; 9 | import org.springframework.jdbc.core.JdbcTemplate; 10 | import org.springframework.jdbc.datasource.DriverManagerDataSource; 11 | 12 | @Slf4j 13 | public class SaveToMysql { 14 | private static JdbcTemplate jdbcTemplate = null; 15 | private final static String MYSQL_URL; 16 | private final static String MYSQL_USERNAME; 17 | private final static String MYSQL_PASSWORD; 18 | 19 | //加载配置文件 20 | private static ResourceBundle resourceBundle = ResourceBundle.getBundle("conf"); 21 | 22 | static { 23 | MYSQL_URL = resourceBundle.getString("mysql.url"); 24 | MYSQL_USERNAME = resourceBundle.getString("mysql.username"); 25 | MYSQL_PASSWORD = resourceBundle.getString("mysql.password"); 26 | 27 | DriverManagerDataSource dataSource = new DriverManagerDataSource(); 28 | dataSource.setDriverClassName("com.mysql.jdbc.Driver"); 29 | dataSource.setUrl(MYSQL_URL); 30 | dataSource.setUsername(MYSQL_USERNAME); 31 | dataSource.setPassword(MYSQL_PASSWORD); 32 | 33 | jdbcTemplate = new JdbcTemplate(dataSource); 34 | } 35 | 36 | public boolean saveToMasql(String tableName,Object object) { 37 | try { 38 | save(sqlBuilder(tableName,object), getValues(object)); 39 | } catch (Exception e) { 40 | log.error("error,exception: {}",e); 41 | } 42 | return true; 43 | } 44 | 45 | public List> queryBySql(String sql) { 46 | return jdbcTemplate.queryForList(sql); 47 | } 48 | 49 | public void executeBySql(String sql) { 50 | jdbcTemplate.execute(sql); 51 | } 52 | 53 | public void batchUpdate(String sql, List param) { 54 | jdbcTemplate.batchUpdate(sql, param); 55 | } 56 | 57 | 58 | 59 | // 取得要执行的sql语句 60 | private static String sqlBuilder(String tableName,Object object) { 61 | Class clazz = object.getClass(); 62 | Field[] fields = clazz.getDeclaredFields(); 63 | 64 | StringBuilder sql = new StringBuilder(); 65 | sql.append("INSERT INTO "); 
66 | sql.append("`"+tableName+"`"); 67 | sql.append("("); 68 | 69 | StringBuilder insertValues = new StringBuilder(); 70 | insertValues.append("values("); 71 | for(int i=1;i clazz = object.getClass(); 86 | Field[] fields = clazz.getDeclaredFields(); 87 | Object[] params = new Object[fields.length - 1]; 88 | for (int i = 1; i < fields.length; i++) { 89 | Method method = (Method) clazz.getMethod("get" + getMethodName(fields[i].getName())); 90 | Object value = method.invoke(object); 91 | params[i - 1] = value; 92 | } 93 | return params; 94 | } 95 | 96 | private static String getMethodName(String fieldName) { 97 | // 把一个字符串的第一个字母大写、效率是最高的 98 | byte[] items = fieldName.getBytes(); 99 | items[0] = (byte) ((char) items[0] - 'a' + 'A'); 100 | return new String(items); 101 | } 102 | 103 | private void save(String sql, Object[] params) { 104 | // 根据模板和sql语句执行数据库操作 105 | try { 106 | int count = jdbcTemplate.update(sql, params); 107 | log.info("插入数据成功......"); 108 | } catch (Exception e) { 109 | log.error("插入异常 exception: {}",e); 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/savetomysql/SaveToOracle.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.savetomysql; 2 | 3 | import java.lang.reflect.Field; 4 | import java.lang.reflect.Method; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.ResourceBundle; 8 | import lombok.extern.slf4j.Slf4j; 9 | import org.springframework.jdbc.core.JdbcTemplate; 10 | import org.springframework.jdbc.datasource.DriverManagerDataSource; 11 | 12 | @Slf4j 13 | public class SaveToOracle { 14 | private static JdbcTemplate jdbcTemplate = null; 15 | private final static String ORACLE_URL; 16 | private final static String ORACLE_USERNAME; 17 | private final static String ORACLE_PASSWORD; 18 | 19 | //加载配置文件 20 | private static ResourceBundle resourceBundle = ResourceBundle.getBundle("conf"); 21 | 22 | static { 23 | ORACLE_URL = resourceBundle.getString("oracle.url"); 24 | ORACLE_USERNAME = resourceBundle.getString("oracle.username"); 25 | ORACLE_PASSWORD = resourceBundle.getString("oracle.password"); 26 | 27 | DriverManagerDataSource dataSource = new DriverManagerDataSource(); 28 | dataSource.setDriverClassName("oracle.jdbc.driver.OracleDriver"); 29 | dataSource.setUrl(ORACLE_URL); 30 | dataSource.setUsername(ORACLE_USERNAME); 31 | dataSource.setPassword(ORACLE_PASSWORD); 32 | 33 | jdbcTemplate = new JdbcTemplate(dataSource); 34 | } 35 | 36 | public boolean saveToOracle(String tableName,Object object) { 37 | try { 38 | save(sqlBuilder(tableName,object), getValues(object)); 39 | } catch (Exception e) { 40 | log.error("error,exception: {}",e); 41 | } 42 | return true; 43 | } 44 | 45 | public List> queryBySql(String sql) { 46 | return jdbcTemplate.queryForList(sql); 47 | } 48 | 49 | public void executeBySql(String sql) { 50 | jdbcTemplate.execute(sql); 51 | } 52 | 53 | public void batchUpdate(String sql, List param) { 54 | jdbcTemplate.batchUpdate(sql, param); 55 | } 56 | 57 | 58 | 59 | // 取得要执行的sql语句 60 | private static String sqlBuilder(String tableName,Object object) { 61 | Class clazz = object.getClass(); 62 | Field[] fields = clazz.getDeclaredFields(); 63 | 64 | StringBuilder sql = new StringBuilder(); 65 | sql.append("INSERT INTO "); 66 | sql.append("`"+tableName+"`"); 67 | sql.append("("); 68 | 69 | StringBuilder insertValues = new StringBuilder(); 70 | 
insertValues.append("values("); 71 | for(int i=1;i clazz = object.getClass(); 86 | Field[] fields = clazz.getDeclaredFields(); 87 | Object[] params = new Object[fields.length - 1]; 88 | for (int i = 1; i < fields.length; i++) { 89 | Method method = (Method) clazz.getMethod("get" + getMethodName(fields[i].getName())); 90 | Object value = method.invoke(object); 91 | params[i - 1] = value; 92 | } 93 | return params; 94 | } 95 | 96 | private static String getMethodName(String fieldName) { 97 | // 把一个字符串的第一个字母大写、效率是最高的 98 | byte[] items = fieldName.getBytes(); 99 | items[0] = (byte) ((char) items[0] - 'a' + 'A'); 100 | return new String(items); 101 | } 102 | 103 | private void save(String sql, Object[] params) { 104 | // 根据模板和sql语句执行数据库操作 105 | try { 106 | int count = jdbcTemplate.update(sql, params); 107 | log.info("插入数据成功......"); 108 | } catch (Exception e) { 109 | log.error("插入异常 exception: {}",e); 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/spidertask/zlr/dorule/SsqDetailSpiderRule.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.spidertask.zlr.dorule; 2 | 3 | import avro.shaded.com.google.common.collect.Lists; 4 | import com.xyzj.crawler.framework.abstracts.SpiderRuleAbstract; 5 | import com.xyzj.crawler.framework.entity.Goods; 6 | import com.xyzj.crawler.framework.entity.Param; 7 | import com.xyzj.crawler.framework.factory.SpiderRuleFactory; 8 | import com.xyzj.crawler.framework.handler.SpiderRuleHandler; 9 | import com.xyzj.crawler.framework.interfaces.ISpiderRule; 10 | import com.xyzj.crawler.utils.parsehtmlstring.JsoupHtmlParser; 11 | import com.xyzj.crawler.utils.parsehtmlstring.RegexUtil; 12 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql; 13 | import java.util.ArrayList; 14 | import java.util.Arrays; 15 | import java.util.List; 16 | import lombok.extern.slf4j.Slf4j; 17 | import org.springframework.util.CollectionUtils; 18 | 19 | 20 | @Slf4j 21 | public class SsqDetailSpiderRule extends SpiderRuleAbstract { 22 | 23 | @Override 24 | public void runSpider(Param param, ISpiderRule spiderRule) { 25 | SpiderRuleHandler spiderRuleHandler = new SpiderRuleHandler(); 26 | spiderRuleHandler.handler(param, spiderRule); 27 | } 28 | 29 | @Override 30 | public void handlerGoods(Param param, String htmlSource) { 31 | List citytr = JsoupHtmlParser.getNodeContentBySelector(htmlSource, Arrays.asList("tr.citytr > td"), false); 32 | List countytr = JsoupHtmlParser.getNodeContentBySelector(htmlSource, Arrays.asList("tr.countytr > td"), false); 33 | List towntr = JsoupHtmlParser.getNodeContentBySelector(htmlSource, Arrays.asList("tr.towntr > td"), false); 34 | List villagetr = JsoupHtmlParser.getNodeContentBySelector(htmlSource, Arrays.asList("tr.villagetr > td"), false); 35 | ArrayList allList = Lists.newArrayList(); 36 | if (citytr != null) { 37 | allList.addAll(citytr); 38 | } 39 | if (countytr != null) { 40 | allList.addAll(countytr); 41 | } 42 | if (towntr != null) { 43 | allList.addAll(towntr); 44 | } 45 | if (villagetr != null) { 46 | allList.addAll(villagetr); 47 | } 48 | log.info("allList========"+allList); 49 | if (!CollectionUtils.isEmpty(allList)) { 50 | //判断是否包含城乡分类代码 51 | if (htmlSource.contains("城乡分类代码")) { 52 | for (int i = 0; i < allList.size() ; i = i + 3) { 53 | Goods goods = new Goods(); 54 | goods.setWebUrl(String.valueOf(param.getWebUrl())); 55 | goods.setOrderNum(allList.get(i)); 56 | 
goods.setName(allList.get(i + 2));
57 |                     // step 3: save to the database
58 |                     SaveToMysql saveToMysql = new SaveToMysql();
59 |                     saveToMysql.saveToMasql("goods", goods);
60 |                 }
61 |             } else {
62 |                 for (int i = 0; i < allList.size(); i = i + 2) {
63 |                     Goods goods = new Goods();
64 |                     goods.setWebUrl(String.valueOf(param.getWebUrl()));
65 |                     goods.setOrderNum(allList.get(i));
66 |                     goods.setName(allList.get(i + 1));
67 |                     // step 3: save to the database
68 |                     SaveToMysql saveToMysql = new SaveToMysql();
69 |                     saveToMysql.saveToMasql("goods", goods);
70 |                 }
71 |             }
72 | 
73 | 
74 |         }
75 | 
76 |         // collect the child-page links (the <td><a ...> part of the pattern below is reconstructed; the original literal was partly lost in rendering)
77 |         String urlRegexString = "tr'><td><a href='(.*?).html'>
"; 78 | List urlUtil = RegexUtil.getSubUtil(htmlSource, urlRegexString); 79 | log.info("urlUtil========"+urlUtil); 80 | 81 | //http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/14/01/140108.html 82 | //http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/14/01/08/140108001.html 83 | for (int i = 0; i < urlUtil.size(); i++) { 84 | //设置提供方 85 | String oldWebUrl = String.valueOf(param.getWebUrl()); 86 | String newWebUrl = oldWebUrl.substring(0, oldWebUrl.lastIndexOf("/")+1); 87 | Param newParam = new Param(); 88 | newParam.setWebUrl(newWebUrl+urlUtil.get(i)+".html"); 89 | ISpiderRule spiderRule = new SpiderRuleFactory(new SsqDetailSpiderRule()).getInstance(); 90 | runSpider(newParam,spiderRule); 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/authcode/AuthcodeDistinguisher.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.authcode; 2 | 3 | import net.sourceforge.tess4j.Tesseract; 4 | import net.sourceforge.tess4j.TesseractException; 5 | import org.apache.http.HttpEntity; 6 | import org.apache.http.HttpResponse; 7 | import org.apache.http.client.HttpClient; 8 | import org.apache.http.client.methods.HttpGet; 9 | import org.apache.http.impl.client.DefaultHttpClient; 10 | import org.apache.http.protocol.BasicHttpContext; 11 | import sun.misc.BASE64Decoder; 12 | 13 | import java.io.File; 14 | import java.io.FileOutputStream; 15 | import java.io.InputStream; 16 | import java.io.OutputStream; 17 | 18 | /** 19 | * 20 | * 验证码图片识别工具 21 | * 使用方法如下: 22 | * 23 | *

  • 将本项目下的docs/authcode的tessdata.zip解压至任意目录如z:/tessddta
  • 24 | *
  • 自己生成验证码图片,或者使用docs/authcode/**.png图片,复制至任意目录,如z:/abcd.png
  • 25 | *
  • 设置tesseract的datapath为步骤1中的目录
  • 26 | *
  • 修改main()中的tesseract的doOCR()方法的参数为步骤2中的图片路径
  • 27 | *
  • 运行即可
  • 28 | * 29 | * 30 | * @author liulei@bshf360.com 31 | * @since 2017-07-19 14:26 32 | */ 33 | public class AuthcodeDistinguisher { 34 | 35 | private static void downloadImage() throws Exception { 36 | HttpClient httpClient = new DefaultHttpClient(); 37 | for (int i = 0; i < 10; i++) { 38 | String url = "http://beijing.qd8.com.cn/jobs/ajax/showphone.ashx?v=2JG4ozQd13oYUdXFs0YrOQ%3d%3d"; 39 | HttpGet getMethod = new HttpGet(url); 40 | try { 41 | HttpResponse response = httpClient.execute(getMethod, new BasicHttpContext()); 42 | HttpEntity entity = response.getEntity(); 43 | InputStream instream = entity.getContent(); 44 | OutputStream outstream = new FileOutputStream(new File("d:/", i + ".gif")); 45 | int l = -1; 46 | byte[] tmp = new byte[2048]; 47 | while ((l = instream.read(tmp)) != -1) { 48 | outstream.write(tmp); 49 | } 50 | outstream.close(); 51 | } finally { 52 | getMethod.releaseConnection(); 53 | } 54 | } 55 | 56 | System.out.println("下载验证码完毕!"); 57 | } 58 | 59 | public static String getString(String url) { 60 | Tesseract tesseract = new Tesseract(); 61 | tesseract.setDatapath("D:\\java\\workspace\\learn\\crawler\\tessdata"); 62 | try { 63 | //"http://beijing.qd8.com.cn/jobs/ajax/showphone.ashx?v=2JG4ozQd13oYUdXFs0YrOQ%3d%3d" 64 | File file = new File(url); 65 | String result = tesseract.doOCR(file); 66 | System.out.println(result); 67 | return result; 68 | } catch (TesseractException e) { 69 | System.err.println(e.getMessage()); 70 | } 71 | return null; 72 | } 73 | 74 | 75 | public static void main(String[] a) throws Exception { 76 | downloadImage(); 77 | 78 | /* Tesseract tesseract = new Tesseract(); 79 | tesseract.setDatapath("D:\\java\\workspace\\learn\\crawler\\tessdata"); 80 | try { 81 | //String result = tesseract.doOCR(new File("D:\\java\\workspace\\learn\\crawler\\yzm\\showphone.gif")); 82 | URL url = new URL("http://beijing.qd8.com.cn/jobs/ajax/showphone.ashx?v=2JG4ozQd13oYUdXFs0YrOQ%3d%3d"); 83 | 84 | 85 | String file = url.getFile(); 86 | String result = tesseract.doOCR(file); 87 | System.out.println(result); 88 | } catch (Exception e) { 89 | System.err.println(e.getMessage()); 90 | }*/ 91 | } 92 | 93 | /** 94 | * 将图片的base64字符串生成到指定位置的文件中 95 | * @param imgStr 图片的base64编码字符串 96 | * @param imgFilePath 目标文件位置 97 | */ 98 | public static boolean generateImage(String imgStr, String imgFilePath) {// 对字节数组字符串进行Base64解码并生成图片 99 | if (imgStr == null){ // 图像数据为空 100 | return false; 101 | } 102 | BASE64Decoder decoder = new BASE64Decoder(); 103 | try { 104 | // Base64解码 105 | byte[] bytes = decoder.decodeBuffer(imgStr); 106 | for (int i = 0; i < bytes.length; ++i) { 107 | if (bytes[i] < 0) {// 调整异常数据 108 | bytes[i] += 256; 109 | } 110 | } 111 | // 生成jpeg图片 112 | OutputStream out = new FileOutputStream(imgFilePath); 113 | out.write(bytes); 114 | out.flush(); 115 | out.close(); 116 | return true; 117 | } catch (Exception e) { 118 | return false; 119 | } 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/importfrom/IOUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.importfrom; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.io.StringReader; 10 | import java.util.ArrayList; 11 | import java.util.HashSet; 12 | 13 | /** 14 | * 读取字典时的I/O工具类 15 | 
* 16 | * @author zel 17 | * 18 | */ 19 | public class IOUtil { 20 | public static String readDirOrFile(String filePath, String fileEncoding) { 21 | File f = new File(filePath); 22 | StringBuilder sb = new StringBuilder(); 23 | if (f.isDirectory()) { 24 | File[] files = f.listFiles(); 25 | for (File temp_file : files) { 26 | sb.append(readDirOrFile(temp_file.getAbsolutePath(), 27 | fileEncoding)); 28 | } 29 | return sb.toString(); 30 | } 31 | return readFile(filePath, fileEncoding); 32 | } 33 | 34 | public static ArrayList readDirOrFileToList(String filePath, 35 | String fileEncoding, ArrayList linkList) { 36 | File f = new File(filePath); 37 | if (f.isDirectory()) { 38 | File[] files = f.listFiles(); 39 | for (File temp_file : files) { 40 | linkList.add(readFileWithRegexFilter(temp_file 41 | .getAbsolutePath(), fileEncoding)); 42 | } 43 | return linkList; 44 | } else { 45 | linkList.add(readFileWithRegexFilter(filePath, fileEncoding)); 46 | } 47 | return linkList; 48 | } 49 | 50 | /** 51 | * fileEncoding若为null,则采用系统默认编码 52 | * 53 | * @param filePath 54 | * @param fileEncoding 55 | * @return 56 | */ 57 | public static String readFile(String filePath, String fileEncoding) { 58 | if (fileEncoding == null) { 59 | fileEncoding = System.getProperty("file.encoding"); 60 | } 61 | File file = new File(filePath); 62 | BufferedReader br = null; 63 | 64 | String line = null; 65 | 66 | StringBuilder sb = new StringBuilder(); 67 | 68 | try { 69 | br = new BufferedReader(new InputStreamReader(new FileInputStream( 70 | file), fileEncoding)); 71 | while ((line = br.readLine()) != null) { 72 | // if(line.startsWith("中")) { 73 | sb.append(line + "\n"); 74 | // } 75 | } 76 | // System.out.println("line---"+line); 77 | return sb.toString(); 78 | } catch (Exception e) { 79 | e.printStackTrace(); 80 | } finally { 81 | if (br != null) { 82 | try { 83 | br.close(); 84 | } catch (IOException e) { 85 | e.printStackTrace(); 86 | } 87 | } 88 | } 89 | return null; 90 | } 91 | 92 | public static String readFileWithRegexFilter(String filePath, 93 | String fileEncoding) { 94 | if (fileEncoding == null) { 95 | fileEncoding = System.getProperty("file.encoding"); 96 | } 97 | File file = new File(filePath); 98 | BufferedReader br = null; 99 | 100 | String line = null; 101 | 102 | StringBuilder sb = new StringBuilder(); 103 | 104 | try { 105 | br = new BufferedReader(new InputStreamReader(new FileInputStream( 106 | file), fileEncoding)); 107 | while ((line = br.readLine()) != null) { 108 | sb.append(line + "\n"); 109 | } 110 | return sb.toString(); 111 | } catch (Exception e) { 112 | e.printStackTrace(); 113 | } finally { 114 | if (br != null) { 115 | try { 116 | br.close(); 117 | } catch (IOException e) { 118 | e.printStackTrace(); 119 | } 120 | } 121 | } 122 | return null; 123 | } 124 | 125 | /** 126 | * 将一个字符串写入到一个文件 127 | * 128 | * @param path 129 | * 储存的文件路径 130 | * @param value 131 | * 储存的文件内容 132 | * @throws IOException 133 | */ 134 | public static synchronized void writeFile(String path, String value) { 135 | File f = new File(path); 136 | FileOutputStream fos = null; 137 | try { 138 | fos = new FileOutputStream(f); 139 | fos.write(value.getBytes()); 140 | fos.close(); 141 | } catch (Exception e) { 142 | e.printStackTrace(); 143 | } finally { 144 | if (fos != null) { 145 | try { 146 | fos.close(); 147 | } catch (IOException e) { 148 | e.printStackTrace(); 149 | } 150 | } 151 | } 152 | 153 | } 154 | 155 | public static void main(String[] args) throws Exception { 156 | // String 
source=readFile("resource/library.dic",null); 157 | // String source = readFile(ReadConfigUtil.getValue("dic.path"), null); 158 | // String source = readDirOrFile("d://temp", "gbk"); 159 | // System.out.println(source); 160 | 161 | String source_string = IOUtil.readFile("d:\\test\\new_words2.txt", 162 | "utf-8"); 163 | StringReader sr = new StringReader(source_string); 164 | BufferedReader br = new BufferedReader(sr); 165 | String temp = null; 166 | StringBuilder sb = new StringBuilder(); 167 | 168 | HashSet hashSet = new HashSet(); 169 | 170 | while ((temp = br.readLine()) != null) { 171 | if (temp.trim().length() > 1 && temp.trim().length() <= 4) { 172 | if (!hashSet.contains(temp)) { 173 | sb.append(temp + "\n"); 174 | } else { 175 | hashSet.add(temp); 176 | } 177 | } 178 | } 179 | IOUtil.writeFile("d:\\test\\new_words3.txt", sb.toString()); 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/RegexPaserUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | /** 10 | * 正则表达式处理工具类,字符串的匹配截取中 11 | * 12 | * @author zel 13 | * 14 | */ 15 | public class RegexPaserUtil { 16 | 17 | private String beginRegex; 18 | 19 | private String endRegex; 20 | 21 | private Matcher matcher; 22 | 23 | public final static String TEXTTEGEX = ".*?"; 24 | 25 | public final static String W = "\\W*?"; 26 | 27 | public final static String N = ""; 28 | 29 | public final static String TEXTEGEXANDNRT = "[\\s\\S]*?"; 30 | public final static String zel_all_chars = "[\\s\\S]*"; 31 | 32 | private List filterRegexList = new ArrayList(); 33 | 34 | public RegexPaserUtil(String beginRegex, String endRegex, String content, 35 | String textRegex) { 36 | 37 | this.beginRegex = beginRegex; 38 | 39 | this.endRegex = endRegex; 40 | 41 | StringBuilder sb = new StringBuilder(); 42 | 43 | sb.append(beginRegex); 44 | 45 | sb.append(textRegex); 46 | 47 | sb.append(endRegex); 48 | matcher = Pattern.compile(sb.toString()).matcher(content); 49 | } 50 | 51 | // 此处的content变量暂未用 52 | public RegexPaserUtil(String beginRegex, String textRegex, String endRegex, 53 | String content, String flag) { 54 | this.beginRegex = beginRegex; 55 | 56 | this.endRegex = endRegex; 57 | 58 | StringBuilder sb = new StringBuilder(); 59 | 60 | sb.append(beginRegex); 61 | 62 | sb.append(textRegex); 63 | 64 | sb.append(endRegex); 65 | // System.out.println("sb--------------" + sb); 66 | matcher = Pattern.compile(sb.toString()).matcher(content); 67 | } 68 | 69 | public RegexPaserUtil(String beginRegex, String endRegex, String textRegex) { 70 | 71 | this.beginRegex = beginRegex; 72 | 73 | this.endRegex = endRegex; 74 | 75 | StringBuilder sb = new StringBuilder(); 76 | 77 | sb.append(beginRegex); 78 | 79 | sb.append(textRegex); 80 | 81 | sb.append(endRegex); 82 | matcher = Pattern.compile(sb.toString()).matcher(N); 83 | } 84 | 85 | public RegexPaserUtil(String beginRegex, String endRegex) { 86 | 87 | this.beginRegex = beginRegex; 88 | 89 | this.endRegex = endRegex; 90 | 91 | StringBuilder sb = new StringBuilder(); 92 | 93 | sb.append(beginRegex); 94 | 95 | sb.append(TEXTTEGEX); 96 | 97 | sb.append(endRegex); 98 | 99 | matcher = Pattern.compile(sb.toString()).matcher(N); 100 | } 101 | 102 | public String 
getSimpleText() { 103 | if (matcher.find()) { 104 | String str = matcher.group().trim(); 105 | return str; 106 | } 107 | return null; 108 | } 109 | 110 | public String getText() { 111 | if (matcher.find()) { 112 | String str = matcher.group().trim().replaceFirst(beginRegex, N) 113 | .replaceAll(endRegex, N); 114 | Iterator it = filterRegexList.iterator(); 115 | while (it.hasNext()) { 116 | str = str.replaceAll(it.next(), N); 117 | } 118 | return str; 119 | } 120 | return null; 121 | } 122 | 123 | public String getLastText() { 124 | String str = null; 125 | while (matcher.find()) { 126 | str = matcher.group().trim().replaceFirst(beginRegex, N) 127 | .replaceAll(endRegex, N); 128 | } 129 | return str; 130 | } 131 | 132 | public String getNext() { 133 | return matcher.group(); 134 | } 135 | 136 | public String getNextTxt() { 137 | String str = matcher.group().trim().replaceFirst(beginRegex, N) 138 | .replaceAll(endRegex, N); 139 | Iterator it = filterRegexList.iterator(); 140 | while (it.hasNext()) { 141 | str = str.replaceAll(it.next(), N); 142 | } 143 | return str; 144 | } 145 | 146 | /** 147 | * 是指过滤了相关标签 148 | * 149 | * @return 150 | */ 151 | public String getNextAddFilter() { 152 | String str = matcher.group(); 153 | Iterator it = filterRegexList.iterator(); 154 | while (it.hasNext()) { 155 | str = str.replaceAll(it.next(), N); 156 | } 157 | return str; 158 | } 159 | 160 | /** 161 | * 循环遍历时,得到真正的txt,而不是匹配全部 162 | * 163 | * @return 164 | */ 165 | public String getNextText() { 166 | String str = matcher.group(); 167 | str = str.replaceFirst(beginRegex, N).replaceAll(endRegex, N); 168 | return str; 169 | } 170 | 171 | public boolean hasNext() { 172 | return matcher.find(); 173 | } 174 | 175 | public RegexPaserUtil reset(String content) { 176 | this.matcher.reset(content); 177 | return this; 178 | } 179 | 180 | public RegexPaserUtil addFilterRegex(String filterRegex) { 181 | filterRegexList.add(filterRegex); 182 | return this; 183 | } 184 | 185 | public String getTextList() { 186 | String str = ""; 187 | int count = 0; 188 | while (matcher.find()) { 189 | if (count == 0) { 190 | str = matcher.group().trim().replaceFirst(beginRegex, N) 191 | .replaceAll(endRegex, N); 192 | } else { 193 | str += ("#" + matcher.group().trim() 194 | .replaceFirst(beginRegex, N).replaceAll(endRegex, N)); 195 | } 196 | count++; 197 | } 198 | return str; 199 | } 200 | 201 | public static void main(String[] args) { 202 | String beginRegex = ""; 203 | String endRegex = ""; 204 | String text = "
    1//@23 4"; 205 | RegexPaserUtil ansjSayUrl = new RegexPaserUtil(beginRegex, endRegex, 206 | text, RegexPaserUtil.TEXTEGEXANDNRT); 207 | 208 | System.out.println(ansjSayUrl.getText()); 209 | 210 | } 211 | 212 | } 213 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/importfrom/ImportExcelUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.importfrom; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.InputStream; 6 | import java.text.DecimalFormat; 7 | import java.text.SimpleDateFormat; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.Set; 14 | import org.apache.poi.hssf.usermodel.HSSFWorkbook; 15 | import org.apache.poi.ss.usermodel.Cell; 16 | import org.apache.poi.ss.usermodel.Row; 17 | import org.apache.poi.ss.usermodel.Sheet; 18 | import org.apache.poi.ss.usermodel.Workbook; 19 | import org.apache.poi.xssf.usermodel.XSSFWorkbook; 20 | 21 | ; 22 | 23 | /** 24 | * Excel文件流 --> List >对象 25 | * 想直接转成java bean的朋友可以使用fastjson将List>转成bean对象 26 | * 27 | */ 28 | public class ImportExcelUtil { 29 | private final static String excel2003L = ".xls"; // 2003- 版本的excel 30 | private final static String excel2007U = ".xlsx"; // 2007+ 版本的excel 31 | /** 32 | * 将流中的Excel数据转成List 33 | * @param in 输入流 34 | * @param fileName 文件名(判断Excel版本) 35 | * @param mapping 字段名称映射 36 | * @return 37 | * @throws Exception 38 | */ 39 | public static List> parseExcel(InputStream in, String fileName, Map mapping) throws Exception { 40 | // 根据文件名来创建Excel工作薄 41 | Workbook work = getWorkbook(in, fileName); 42 | if (null == work) { 43 | throw new Exception("创建Excel工作薄为空!"); 44 | } 45 | Sheet sheet = null; 46 | Row row = null; 47 | Cell cell = null; 48 | // 返回数据 49 | List> ls = new ArrayList>(); 50 | 51 | // 遍历Excel中所有的sheet 52 | for (int i = 0; i < work.getNumberOfSheets(); i++) { 53 | sheet = work.getSheetAt(i); 54 | if (sheet == null){ 55 | continue; 56 | } 57 | // 取第一行标题 58 | row = sheet.getRow(0); 59 | String title[] = null; 60 | if (row != null) { 61 | title = new String[row.getLastCellNum()]; 62 | for (int y = row.getFirstCellNum(); y < row.getLastCellNum(); y++) { 63 | cell = row.getCell(y); 64 | title[y] = (String) getCellValue(cell); 65 | } 66 | } else{ 67 | continue; 68 | } 69 | // 遍历当前sheet中的所有行 70 | for (int j = 1; j < sheet.getLastRowNum() + 1; j++) { 71 | row = sheet.getRow(j); 72 | Map m = new HashMap(); 73 | // 遍历所有的列 74 | for (int y = row.getFirstCellNum(); y < row.getLastCellNum(); y++) { 75 | cell = row.getCell(y); 76 | String key = title[y]; 77 | m.put(mapping.get(key), getCellValue(cell)); 78 | } 79 | ls.add(m); 80 | } 81 | } 82 | work.close(); 83 | return ls; 84 | } 85 | 86 | /** 87 | * 描述:根据文件后缀,自适应上传文件的版本 88 | * @param inStr,fileName 89 | * @return 90 | * @throws Exception 91 | */ 92 | public static Workbook getWorkbook(InputStream inStr, String fileName) throws Exception { 93 | Workbook wb = null; 94 | String fileType = fileName.substring(fileName.lastIndexOf(".")); 95 | if (excel2003L.equals(fileType)) { 96 | wb = new HSSFWorkbook(inStr); // 2003- 97 | } else if (excel2007U.equals(fileType)) { 98 | wb = new XSSFWorkbook(inStr); // 2007+ 99 | } else { 100 | throw new Exception("解析的文件格式有误!"); 101 | } 102 | return wb; 103 | } 104 | 105 | /** 106 | * 描述:对表格中数值进行格式化 107 | * 108 | * @param cell 109 | * @return 
110 | */ 111 | public static Object getCellValue(Cell cell) { 112 | Object value = null; 113 | // 格式化number String字符 114 | DecimalFormat df = new DecimalFormat("0"); 115 | // 日期格式化 116 | SimpleDateFormat sdf = new SimpleDateFormat("yyy-MM-dd"); 117 | // 格式化数字 118 | DecimalFormat df2 = new DecimalFormat("0"); 119 | switch (cell.getCellType()) { 120 | case Cell.CELL_TYPE_STRING: 121 | value = cell.getRichStringCellValue().getString(); 122 | break; 123 | case Cell.CELL_TYPE_NUMERIC: 124 | if ("General".equals(cell.getCellStyle().getDataFormatString())) { 125 | value = df.format(cell.getNumericCellValue()); 126 | } else if ("m/d/yy".equals(cell.getCellStyle().getDataFormatString())) { 127 | value = sdf.format(cell.getDateCellValue()); 128 | } else { 129 | value = df2.format(cell.getNumericCellValue()); 130 | } 131 | break; 132 | case Cell.CELL_TYPE_BOOLEAN: 133 | value = cell.getBooleanCellValue(); 134 | break; 135 | case Cell.CELL_TYPE_BLANK: 136 | value = ""; 137 | break; 138 | default: 139 | break; 140 | } 141 | return value; 142 | } 143 | 144 | /** 主方法测试*/ 145 | public static void main(String[] args) throws Exception { 146 | File file = new File("D:\\词条目录卫青.xlsx"); 147 | FileInputStream fis = new FileInputStream(file); 148 | Map m = new HashMap(); 149 | m.put("药品名称", "name"); 150 | m.put("序号", "id"); 151 | List> ls = parseExcel(fis, file.getName(), m); 152 | Set resultSet = new HashSet<>(); 153 | for (int i = 0; i headerInfosMap = new HashMap<>(16); 26 | headerInfosMap.put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"); 27 | //模拟登陆必须内容 28 | headerInfosMap.put("Cookie", "acw_tc=276aedd215431325834491762e1d45b45d7060c7dec9c77742b0f52b0eeb32; _csrf=91d7e25d637d436027e7819c6d5e9f7fb1efc29328e1b286a12c71c562b3c9ffa%3A2%3A%7Bi%3A0%3Bs%3A5%3A%22_csrf%22%3Bi%3A1%3Bs%3A32%3A%22%87y%F2%9EN%D2%C1%F0%B6%DD%00%2Bi_%ABS%87C%18%AB%0A%EE%D1_%E1%D4e%09%E9%DB%97%A3%22%3B%7D; looyu_id=2ca1e50b89c720f6c644646887121255_20000923%3A1; 51ctologToken=8eee7adfa2139ad0eb3881e882b9350a; _ourplusFirstTime=118-11-25-15-56-25; _ga=GA1.2.1084733366.1543132586; _gid=GA1.2.1681094532.1543132586; _t99_chat=1; www51cto=70F4649BA2D0E3E01B91BF4FB0771B95EGZG; pub_cookietime=0; bdshare_firstime=1543132639813; 13240332=2018/11/25; EDUACCOUNT=59bc35e120ecde375071702d5a847a51a5e15544efa309f8d5b1da3e62ce7e56a%3A2%3A%7Bi%3A0%3Bs%3A10%3A%22EDUACCOUNT%22%3Bi%3A1%3Bs%3A32%3A%2295cd024928664c2dbef35fb538aeca8c%22%3B%7D; _ourplusReturnCount=5; _ourplusReturnTime=118-11-25-16-1-48; __utma=1.1084733366.1543132586.1543132909.1543132909.1; __utmc=1; __utmz=1.1543132909.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lvt_f77ea1ecd95cb2a1bc65cbcb3aaba7d4=1543132909; Hm_lpvt_f77ea1ecd95cb2a1bc65cbcb3aaba7d4=1543132909; playTime202687=1337.723899; playTime202689=94; playDEF=hd; 
_51ctologStr=data%3D%257Bvisitorid%3A%25228eee7adfa2139ad0eb3881e882b9350a%2522%2CuserAagent%3A%2522Mozilla/5.0%2520%28Macintosh%3B%2520Intel%2520Mac%2520OS%2520X%252010_13_2%29%2520AppleWebKit/537.36%2520%28KHTML%2520%2520like%2520Gecko%29%2520Chrome/70.0.3538.102%2520Safari/537.36%2522%2Ctoken%3A%25228eee7adfa2139ad0eb3881e882b9350a%2522%2Cuid%3A%2522%2522%2Cuuid%3A%252254f10ca8-1a3c-c5dd-t903-o22928bc36ce%2522%2Ctype%3A%2522close%2522%2Cdom%3A%2522%2522%2CdomId%3A%2522%2522%2CdomInnerTxt%3A%2522%2522%2Cprice%3A%2522%2522%2Cstudents_count%3A%2522%2522%2Cfavourite%3A%2522%2522%2Cvote%3A%2522%2522%2Cscrolling%3A%25220%2525%2522%2Cscreensize%3A%25221280X800%2522%2Curl%3A%2522http%25253A%25252F%25252Fhome.51cto.com%25252Findex%25252F%25253Freback%25253Dhttp%2525253A%2525252F%2525252Fedu.51cto.com%2525252Fcenter%2525252Fuser%2525252Findex%2525252Flogin-success%2525253Fsign%2525253D1e4dAVQIA1FRBAEJAQFUVFVUWgEEC1tRUwECVgpYEE0UXxweAVxGT1QFUk1eS1ZZWUsABw9MBhRMAFgRQEMBFggAQEILVhwID1BUQQ4MUQsGVFFUVwBSA1cH%25252526client%2525253Dweb%2522%2Cref%3A%2522http%25253A%25252F%25252Fhome.51cto.com%25252Findex%25252F%25253Freback%25253Dhttp%2525253A%2525252F%2525252Fedu.51cto.com%2525252Fcenter%2525252Fuser%2525252Findex%2525252Flogin-success%2525253Fsign%2525253D1e4dAVQIA1FRBAEJAQFUVFVUWgEEC1tRUwECVgpYEE0UXxweAVxGT1QFUk1eS1ZZWUsABw9MBhRMAFgRQEMBFggAQEILVhwID1BUQQ4MUQsGVFFUVwBSA1cH%25252526client%2525253Dweb%2522%2Cfrom%3A%2522home%2522%2Cduration%3A%252221180%2522%2Ctime%3A%25221543139532503%2522%257D; pub_sauth1=tKGEuF1dD1Q6BQEDVwlVBwY6BVFSAQQCW1YEVg; pub_sauth2=0c84c9bfca688ef4b37d29441f5474ef; PHPSESSID=22c8d16uam5ugjdtjnc4or0pm7; logserveruid=13240332; Hm_lvt_8c8abdb71d78d33dfdb885e0bc71dae0=1543132627,1543139006,1543139032,1543139535; Cto_lvt_=1543132627,1543139006,1543139032,1543139535; _gat_gtag_UA_118863081_1=1; _51ctologStr=data%3D%257Bvisitorid%3A%25228eee7adfa2139ad0eb3881e882b9350a%2522%2CuserAagent%3A%2522Mozilla/5.0%2520%28Macintosh%3B%2520Intel%2520Mac%2520OS%2520X%252010_13_2%29%2520AppleWebKit/537.36%2520%28KHTML%2520%2520like%2520Gecko%29%2520Chrome/70.0.3538.102%2520Safari/537.36%2522%2Ctoken%3A%25228eee7adfa2139ad0eb3881e882b9350a%2522%2Cuid%3A%252213240332%2522%2Cuuid%3A%252258e302b0-1b4c-c155-t443-o6e8309fd1b4%2522%2Ctype%3A%2522close%2522%2Cdom%3A%2522%2522%2CdomId%3A%2522%2522%2CdomInnerTxt%3A%2522%2522%2Cprice%3A%2522%2522%2Cstudents_count%3A%2522%2522%2Cfavourite%3A%2522%2522%2Cvote%3A%2522%2522%2Cscrolling%3A%25220%2525%2522%2Cscreensize%3A%25221280X800%2522%2Curl%3A%2522http%25253A%25252F%25252Fedu.51cto.com%25252Fcenter%25252Fcourse%25252Flesson%25252Findex%25253Fid%25253D202668%2522%2Cref%3A%2522http%25253A%25252F%25252Fhome.51cto.com%25252Findex%25252F%25253Freback%25253Dhttp%2525253A%2525252F%2525252Fedu.51cto.com%2525252Fcenter%2525252Fuser%2525252Findex%2525252Flogin-success%2525253Fsign%2525253D1e4dAVQIA1FRBAEJAQFUVFVUWgEEC1tRUwECVgpYEE0UXxweAVxGT1QFUk1eS1ZZWUsABw9MBhRMAFgRQEMBFggAQEILVhwID1BUQQ4MUQsGVFFUVwBSA1cH%25252526client%2525253Dweb%2522%2Cfrom%3A%2522edu%2522%2Cduration%3A%25222499%2522%2Ctime%3A%25221543139537068%2522%257D; playTime202668=58; Hm_lpvt_8c8abdb71d78d33dfdb885e0bc71dae0=1543139538; Cto_lpvt_=1543139538; playTime202662=536.436775"); 29 | headerInfosMap.put("Accept", "*/*"); 30 | headerInfosMap.put("Accept-Encoding", "gzip, deflate"); 31 | headerInfosMap.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); 32 | headerInfosMap.put("Cache-Control", "no-cache"); 33 | headerInfosMap.put("Connection", "keep-alive"); 34 | headerInfosMap.put("Host", 
"edu.51cto.com"); 35 | headerInfosMap.put("Pragma", "no-cache"); 36 | headerInfosMap.put("Referer", "http://edu.51cto.com/center/course/lesson/index?id=202687"); 37 | 38 | 39 | //工厂取得M3u8SpiderRule实例 40 | ISpiderRule spiderRule = new SpiderRuleFactory(new DefaultM3u8SpiderRule()).getInstance(); 41 | 42 | //封装参数 43 | Param param = new Param(); 44 | param.setWebUrl(httpUrl); 45 | param.setFileFullName(fileName); 46 | param.setHeaderInfos(headerInfosMap); 47 | 48 | //走你 49 | spiderRule.runSpider(param,spiderRule); 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/packageutil/PackageGetJavaUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.packageutil; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.InputStream; 6 | import java.nio.file.Files; 7 | import java.text.DecimalFormat; 8 | import java.text.SimpleDateFormat; 9 | import java.util.ArrayList; 10 | import java.util.HashMap; 11 | import java.util.HashSet; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.Set; 15 | import org.apache.poi.hssf.usermodel.HSSFWorkbook; 16 | import org.apache.poi.ss.usermodel.Cell; 17 | import org.apache.poi.ss.usermodel.Row; 18 | import org.apache.poi.ss.usermodel.Sheet; 19 | import org.apache.poi.ss.usermodel.Workbook; 20 | import org.apache.poi.xssf.usermodel.XSSFWorkbook; 21 | 22 | /** 23 | * @author lyy 24 | * @since 2019-09-18 13:01 25 | * 读取excel里面的文件到指定目录 26 | */ 27 | public class PackageGetJavaUtil { 28 | private static final String SRC_PATH = "/Users/liuyangyang/workspace/xyzj/xyzj-crawler"; 29 | private static final String TARGET_PATH = "/Users/liuyangyang/Downloads"; 30 | private final static String EXCEL_2003L = ".xls"; // 2003- 版本的excel 31 | private final static String EXCEL_2007U = ".xlsx"; // 2007+ 版本的excel 32 | 33 | /** 34 | * 将流中的Excel数据转成List 35 | * @param in 输入流 36 | * @param fileName 文件名(判断Excel版本) 37 | * @param mapping 字段名称映射 38 | * @return 39 | * @throws Exception 40 | */ 41 | public static List> parseExcel(InputStream in, String fileName, Map mapping) throws Exception { 42 | // 根据文件名来创建Excel工作薄 43 | Workbook work = getWorkbook(in, fileName); 44 | if (null == work) { 45 | throw new Exception("创建Excel工作薄为空!"); 46 | } 47 | Sheet sheet = null; 48 | Row row = null; 49 | Cell cell = null; 50 | // 返回数据 51 | List> ls = new ArrayList>(); 52 | 53 | // 遍历Excel中所有的sheet 54 | for (int i = 0; i < work.getNumberOfSheets(); i++) { 55 | sheet = work.getSheetAt(i); 56 | if (sheet == null){ 57 | continue; 58 | } 59 | // 取第一行标题 60 | row = sheet.getRow(0); 61 | String title[] = null; 62 | if (row != null) { 63 | title = new String[row.getLastCellNum()]; 64 | for (int y = row.getFirstCellNum(); y < row.getLastCellNum(); y++) { 65 | cell = row.getCell(y); 66 | title[y] = (String) getCellValue(cell); 67 | } 68 | } else{ 69 | continue; 70 | } 71 | // 遍历当前sheet中的所有行 72 | for (int j = 1; j < sheet.getLastRowNum() + 1; j++) { 73 | row = sheet.getRow(j); 74 | Map m = new HashMap(); 75 | // 遍历所有的列 76 | for (int y = row.getFirstCellNum(); y < row.getLastCellNum(); y++) { 77 | cell = row.getCell(y); 78 | String key = title[y]; 79 | m.put(mapping.get(key), getCellValue(cell)); 80 | } 81 | ls.add(m); 82 | } 83 | } 84 | work.close(); 85 | return ls; 86 | } 87 | 88 | //复制方法 89 | public static void copy(String filePath, String srcPath, String targetPath) throws Exception { 90 | //初始化文件复制 91 | File 
srcFile=new File(srcPath + filePath); 92 | 93 | //初始化文件目标 94 | File targetFile=new File(targetPath+filePath); 95 | if(!targetFile.getParentFile().exists()){ 96 | targetFile.getParentFile().mkdirs(); 97 | } 98 | //调用文件拷贝的方法 99 | targetFile.delete(); 100 | Files.copy(srcFile.toPath(), targetFile.toPath()); 101 | } 102 | 103 | 104 | /** 105 | * 描述:根据文件后缀,自适应上传文件的版本 106 | * @param inStr,fileName 107 | * @return 108 | * @throws Exception 109 | */ 110 | public static Workbook getWorkbook(InputStream inStr, String fileName) throws Exception { 111 | Workbook wb = null; 112 | String fileType = fileName.substring(fileName.lastIndexOf(".")); 113 | if (EXCEL_2003L.equals(fileType)) { 114 | wb = new HSSFWorkbook(inStr); // 2003- 115 | } else if (EXCEL_2007U.equals(fileType)) { 116 | wb = new XSSFWorkbook(inStr); // 2007+ 117 | } else { 118 | throw new Exception("解析的文件格式有误!"); 119 | } 120 | return wb; 121 | } 122 | 123 | /** 124 | * 描述:对表格中数值进行格式化 125 | * 126 | * @param cell 127 | * @return 128 | */ 129 | public static Object getCellValue(Cell cell) { 130 | Object value = null; 131 | // 格式化number String字符 132 | DecimalFormat df = new DecimalFormat("0"); 133 | // 日期格式化 134 | SimpleDateFormat sdf = new SimpleDateFormat("yyy-MM-dd"); 135 | // 格式化数字 136 | DecimalFormat df2 = new DecimalFormat("0"); 137 | switch (cell.getCellType()) { 138 | case Cell.CELL_TYPE_STRING: 139 | value = cell.getRichStringCellValue().getString(); 140 | break; 141 | case Cell.CELL_TYPE_NUMERIC: 142 | if ("General".equals(cell.getCellStyle().getDataFormatString())) { 143 | value = df.format(cell.getNumericCellValue()); 144 | } else if ("m/d/yy".equals(cell.getCellStyle().getDataFormatString())) { 145 | value = sdf.format(cell.getDateCellValue()); 146 | } else { 147 | value = df2.format(cell.getNumericCellValue()); 148 | } 149 | break; 150 | case Cell.CELL_TYPE_BOOLEAN: 151 | value = cell.getBooleanCellValue(); 152 | break; 153 | case Cell.CELL_TYPE_BLANK: 154 | value = ""; 155 | break; 156 | default: 157 | break; 158 | } 159 | return value; 160 | } 161 | 162 | 163 | public static void main(String[] args) throws Exception { 164 | //取得类路径 165 | File file = new File("/Users/liuyangyang/Downloads/工作簿1.xlsx"); 166 | FileInputStream fis = new FileInputStream(file); 167 | Map m = new HashMap(); 168 | m.put("路径", "path"); 169 | List> ls = parseExcel(fis, file.getName(), m); 170 | Set resultSet = new HashSet<>(); 171 | for (int i = 0; i 3 | 4.0.0 4 | com.xyzj.crawler 5 | xyzj-crawler 6 | 0.0.1-SNAPSHOT 7 | crawler 8 | 9 | 10 | 1.1.2 11 | 0.1.4 12 | 1.7.20 13 | 1.7.12 14 | 1.7.21 15 | 1.2 16 | 4.3.8.RELEASE 17 | 5.1.38 18 | 3.14 19 | 20 | 21 | 3.6.1 22 | 3.0.1 23 | 2.10.4 24 | 25 | 1.8 26 | UTF-8 27 | 28 | 29 | 30 | 31 | commons-httpclient 32 | commons-httpclient 33 | 3.1 34 | 35 | 36 | 37 | com.alibaba 38 | fastjson 39 | 1.2.31 40 | 41 | 42 | org.jsoup 43 | jsoup 44 | 1.7.2 45 | 46 | 47 | org.apache.httpcomponents 48 | httpclient 49 | 4.5.3 50 | 51 | 52 | org.apache.httpcomponents 53 | httpmime 54 | 4.5.3 55 | 56 | 57 | 58 | ch.qos.logback 59 | logback-classic 60 | ${logback.version} 61 | 62 | 63 | ch.qos.logback 64 | logback-access 65 | ${logback.version} 66 | 67 | 68 | org.logback-extensions 69 | logback-ext-spring 70 | ${logback-ext-spring.version} 71 | 72 | 73 | org.slf4j 74 | log4j-over-slf4j 75 | ${log4j-over-slf4j.version} 76 | 77 | 78 | org.slf4j 79 | jcl-over-slf4j 80 | ${jcl-over-slf4j.version} 81 | 82 | 83 | org.springframework 84 | spring-jdbc 85 | ${spring.version} 86 | 87 | 88 | mysql 89 | mysql-connector-java 90 | 
${mysql.version} 91 | 92 | 93 | log4j 94 | log4j 95 | 1.2.16 96 | 97 | 98 | org.springframework 99 | spring-aop 100 | ${spring.version} 101 | 102 | 103 | commons-lang 104 | commons-lang 105 | 2.6 106 | 107 | 108 | commons-io 109 | commons-io 110 | 2.6 111 | 112 | 113 | junit 114 | junit 115 | 4.8.2 116 | test 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | us.codecraft 140 | webmagic-core 141 | 0.7.3 142 | 143 | 144 | us.codecraft 145 | webmagic-extension 146 | 0.7.3 147 | 148 | 149 | org.slf4j 150 | slf4j-log4j12 151 | 152 | 153 | 154 | 155 | 156 | net.java.dev.jna 157 | jna 158 | 4.2.1 159 | 160 | 161 | 162 | redis.clients 163 | jedis 164 | 2.9.0 165 | 166 | 167 | 168 | net.sourceforge.tess4j 169 | tess4j 170 | 2.0.1 171 | 172 | 173 | com.sun.jna 174 | jna 175 | 176 | 177 | 178 | 179 | 180 | 181 | cn.edu.hfut.dmic.webcollector 182 | WebCollector 183 | 2.71 184 | 185 | 186 | 187 | net.sourceforge.htmlunit 188 | htmlunit 189 | 2.28 190 | 191 | 192 | 193 | org.projectlombok 194 | lombok 195 | 1.16.10 196 | 197 | 198 | com.alibaba 199 | easyexcel 200 | 2.0.5 201 | 202 | 203 | 204 | com.oracle 205 | ojdbc6 206 | 12.1.0.1-atlassian-hosted 207 | 208 | 209 | 210 | 211 | 212 | 213 | ${project.name} 214 | 215 | 216 | org.apache.maven.plugins 217 | maven-compiler-plugin 218 | ${compiler.version} 219 | 220 | ${jdk.version} 221 | ${jdk.version} 222 | ${project.build.sourceEncoding} 223 | 224 | 225 | 226 | org.springframework.boot 227 | spring-boot-maven-plugin 228 | 229 | com.xyzj.bigdata.in.ReadTest 230 | 231 | 232 | 233 | 234 | repackage 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/gethtmlstring/HttpResponseUtil.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.gethtmlstring; 2 | 3 | 4 | import com.alibaba.fastjson.JSONObject; 5 | import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; 6 | import com.gargoylesoftware.htmlunit.ProxyConfig; 7 | import com.gargoylesoftware.htmlunit.WebClient; 8 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 9 | import com.xyzj.crawler.framework.entity.Goods; 10 | import com.xyzj.crawler.framework.entity.Param; 11 | import com.xyzj.crawler.framework.enums.FactionEnum; 12 | import com.xyzj.crawler.utils.proxyip.IPModel.IPMessage; 13 | import com.xyzj.crawler.utils.proxyip.config.RedisUtil; 14 | import com.xyzj.crawler.utils.savetomysql.SaveToMysql; 15 | import lombok.extern.slf4j.Slf4j; 16 | import org.apache.http.HttpHost; 17 | import org.apache.http.HttpResponse; 18 | import org.apache.http.client.config.RequestConfig; 19 | import org.apache.http.client.methods.CloseableHttpResponse; 20 | import org.apache.http.client.methods.HttpGet; 21 | import org.apache.http.client.methods.HttpPost; 22 | import org.apache.http.entity.StringEntity; 23 | import org.apache.http.impl.client.CloseableHttpClient; 24 | import org.apache.http.impl.client.HttpClients; 25 | import org.apache.http.util.EntityUtils; 26 | import org.springframework.util.CollectionUtils; 27 | 28 | /** 29 | * 30 | */ 31 | 32 | @Slf4j 33 | public class HttpResponseUtil { 34 | 35 | public static String getHtmlSource(Param param) { 36 | //获取html源文件 37 | param.getHeaderInfos().put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 
Safari/537.36"); 38 | String htmlSource = ""; 39 | FactionEnum factionEnum = param.getFactionEnum(); 40 | switch (factionEnum) { 41 | case getHtml: 42 | log.info("走 getHtml"); 43 | htmlSource = HttpResponseUtil.getHtml(param); 44 | break; 45 | case getHtmlWithJavaScript: 46 | log.info("走 getHtmlWithJavaScript"); 47 | htmlSource = HttpResponseUtil.getHtmlWithJavaScript(param); 48 | break; 49 | case getJson: 50 | log.info("走 getJson"); 51 | htmlSource = HttpResponseUtil.getJson(param); 52 | break; 53 | } 54 | if (org.springframework.util.StringUtils.isEmpty(htmlSource) || htmlSource.contains("Not Found") || htmlSource.contains("无法访问此网站") || htmlSource.contains("你所访问的页面就如那些遇害的同道") || htmlSource.contains("药品不存在!")) { 55 | log.info("本次爬取目标失败 webUrl={}", param.getWebUrl()); 56 | //没拿到数据 存入ungoods表 57 | Goods unableGoods = new Goods(); 58 | unableGoods.setWebUrl(param.getWebUrl()); 59 | SaveToMysql saveToMysql = new SaveToMysql(); 60 | saveToMysql.saveToMasql("ungoods", unableGoods); 61 | return null; 62 | } 63 | log.info("本次爬取目标 webUrl={}", param.getWebUrl()); 64 | log.info(htmlSource); 65 | return htmlSource; 66 | } 67 | 68 | /** 69 | * ======================================== 70 | * 71 | * @description: 取得网页html信息 72 | * @author: lyy 73 | * @param: 74 | * @return: 75 | * @exception: 76 | * @create: 2019/6/28 11:54 77 | *
    78 | * ======================================== 79 | */ 80 | public static String getHtml(Param param) { 81 | String entity = null; 82 | CloseableHttpClient httpClient = HttpClients.createDefault(); 83 | //设置代理 84 | RequestConfig config = null; 85 | if (param.getIsProxy()) { 86 | IPMessage ipMessage = RedisUtil.getOneIp(); 87 | config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(new HttpHost(ipMessage.getIp(), Integer.parseInt(ipMessage.getPort()))).build(); 88 | } else { 89 | config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).build(); 90 | } 91 | HttpGet httpGet = new HttpGet(param.getWebUrl()); 92 | httpGet.setConfig(config); 93 | // 遍历map 设置请求头信息 94 | if (!CollectionUtils.isEmpty(param.getHeaderInfos())) { 95 | for (String key : param.getHeaderInfos().keySet()) { 96 | httpGet.setHeader(key, param.getHeaderInfos().get(key)); 97 | } 98 | } 99 | try { 100 | //客户端执行httpGet方法,返回响应 101 | CloseableHttpResponse httpResponse = httpClient.execute(httpGet); 102 | //得到服务响应状态码 103 | if (httpResponse.getStatusLine().getStatusCode() == 200) { 104 | entity = EntityUtils.toString(httpResponse.getEntity(), param.getCharset()); 105 | } 106 | httpResponse.close(); 107 | httpClient.close(); 108 | } catch (Exception e) { 109 | log.error("getHtml exception:{}", e); 110 | } 111 | return entity; 112 | } 113 | 114 | /** 115 | * ======================================== 116 | * 117 | * @description: 取得执行javascript后的页面信息 118 | * @author: lyy 119 | * @param: 120 | * @return: 121 | * @exception: 122 | * @create: 2019/6/28 11:45 123 | *
    124 | * ======================================== 125 | */ 126 | public static String getHtmlWithJavaScript(Param param) { 127 | try { 128 | //HtmlUnit请求web页面 129 | WebClient wc = new WebClient(); 130 | //启用JS解释器,默认为true 131 | wc.getOptions().setJavaScriptEnabled(true); 132 | //js运行错误时,是否抛出异常 133 | wc.getOptions().setThrowExceptionOnScriptError(false); 134 | //禁用css支持 135 | wc.getOptions().setActiveXNative(false); 136 | wc.getOptions().setCssEnabled(false); 137 | //设置支持AJAX 138 | wc.setAjaxController(new NicelyResynchronizingAjaxController()); 139 | if (param.getIsProxy()) { 140 | IPMessage ipMessage = RedisUtil.getOneIp(); 141 | wc.getOptions().setProxyConfig(new ProxyConfig(ipMessage.getIp(), Integer.parseInt(ipMessage.getPort()))); 142 | } 143 | if (param.getDelayTime() != null) { 144 | Thread.sleep(param.getDelayTime()); 145 | } 146 | HtmlPage page = wc.getPage(param.getWebUrl()); 147 | //以xml的形式获取响应文本 148 | return page.asXml(); 149 | } catch (Exception e) { 150 | //异常 151 | log.info("没有抓到数据......"); 152 | } 153 | return null; 154 | } 155 | 156 | /** 157 | * ======================================== 158 | * 159 | * @description: 取得json数据 160 | * @author: lyy 161 | * @param: 162 | * @return: 163 | * @exception: 164 | * @create: 2019/6/28 11:46 165 | *
    166 | * ======================================== 167 | */ 168 | public static String getJson(Param param) { 169 | CloseableHttpClient httpClient = HttpClients.createDefault(); 170 | //设置代理 171 | RequestConfig config = null; 172 | if (param.getIsProxy()) { 173 | IPMessage ipMessage = RedisUtil.getOneIp(); 174 | config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(new HttpHost(ipMessage.getIp(), Integer.parseInt(ipMessage.getPort()))).build(); 175 | } else { 176 | config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).build(); 177 | } 178 | HttpPost httpPost = new HttpPost(param.getWebUrl()); 179 | httpPost.setConfig(config); 180 | // 遍历map 设置请求头信息 181 | if (!CollectionUtils.isEmpty(param.getHeaderInfos())) { 182 | for (String key : param.getHeaderInfos().keySet()) { 183 | httpPost.setHeader(key, param.getHeaderInfos().get(key)); 184 | } 185 | } 186 | //遍历BodyParams 187 | if (!CollectionUtils.isEmpty(param.getBodyParams())) { 188 | JSONObject jsonParam = new JSONObject(); 189 | for (String key : param.getBodyParams().keySet()) { 190 | jsonParam.put(key, param.getBodyParams().get(key)); 191 | } 192 | //解决中文乱码问题 193 | StringEntity entity = new StringEntity(jsonParam.toString(), param.getCharset()); 194 | entity.setContentEncoding("UTF-8"); 195 | entity.setContentType("application/json"); 196 | httpPost.setEntity(entity); 197 | } 198 | String httpResponseString = ""; 199 | try { 200 | //客户端执行httpPost方法,返回响应 201 | HttpResponse httpResponse = httpClient.execute(httpPost); 202 | //得到服务响应状态码 203 | if (httpResponse.getStatusLine().getStatusCode() == 200) { 204 | httpResponseString = EntityUtils.toString(httpResponse.getEntity(), param.getCharset()); 205 | } 206 | httpClient.close(); 207 | } catch (Exception e) { 208 | log.error("Exception:{}", e); 209 | } 210 | return httpResponseString; 211 | } 212 | 213 | } 214 | -------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/JsoupHtmlParser.java: -------------------------------------------------------------------------------- 1 | package com.xyzj.crawler.utils.parsehtmlstring; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.safety.Whitelist; 7 | import org.jsoup.select.Elements; 8 | 9 | import java.util.Iterator; 10 | import java.util.LinkedList; 11 | import java.util.List; 12 | 13 | /** 14 | * 对Jsoup的再次封装,更加简明扼要 15 | * 16 | * @author zel 17 | * 18 | */ 19 | public class JsoupHtmlParser { 20 | /** 21 | * 得到指定文档的纯文档 22 | * 23 | * @param htmlSource 24 | * @return 25 | */ 26 | public static String getCleanTxt(String htmlSource) { 27 | if (htmlSource == null || htmlSource.isEmpty()) { 28 | return StaticValue.NULL; 29 | } 30 | return Jsoup.clean(htmlSource, Whitelist.none()); 31 | } 32 | 33 | /** 34 | * 沒有DataFormatStatus參數的情況 35 | * 36 | * @param htmlSource 37 | * @param tagName 38 | * @return 39 | */ 40 | public static String getTagContent(String htmlSource, String tagName) { 41 | return getTagContent(htmlSource, tagName, DataFormatStatus.CleanTxt); 42 | } 43 | 44 | /** 45 | * 得到指定tag标签的的内容,包括纯文本和标签全部内容两种格式 46 | * 47 | * @param htmlSource 48 | * @param tagName 49 | * @param dataFormatStatus 50 | * @return 51 | */ 52 | public static String getTagContent(String htmlSource, String tagName, 53 | DataFormatStatus dataFormatStatus) { 54 | if (htmlSource == null || htmlSource.isEmpty()) { 55 | return StaticValue.NULL; 56 | } 57 | 
-------------------------------------------------------------------------------- /src/main/java/com/xyzj/crawler/utils/parsehtmlstring/JsoupHtmlParser.java: --------------------------------------------------------------------------------
1 | package com.xyzj.crawler.utils.parsehtmlstring;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.jsoup.nodes.Element;
6 | import org.jsoup.safety.Whitelist;
7 | import org.jsoup.select.Elements;
8 |
9 | import java.util.Iterator;
10 | import java.util.LinkedList;
11 | import java.util.List;
12 |
13 | /**
14 | * A thin wrapper around Jsoup with a terser, more focused API
15 | *
16 | * @author zel
17 | *
18 | */
19 | public class JsoupHtmlParser {
20 | /**
21 | * Strip all markup and return the plain text of the given document
22 | *
23 | * @param htmlSource
24 | * @return
25 | */
26 | public static String getCleanTxt(String htmlSource) {
27 | if (htmlSource == null || htmlSource.isEmpty()) {
28 | return StaticValue.NULL;
29 | }
30 | return Jsoup.clean(htmlSource, Whitelist.none());
31 | }
32 |
33 | /**
34 | * Overload without the DataFormatStatus parameter (defaults to CleanTxt)
35 | *
36 | * @param htmlSource
37 | * @param tagName
38 | * @return
39 | */
40 | public static String getTagContent(String htmlSource, String tagName) {
41 | return getTagContent(htmlSource, tagName, DataFormatStatus.CleanTxt);
42 | }
43 |
44 | /**
45 | * Get the content of the given tag, either as plain text or as the tag's full markup
46 | *
47 | * @param htmlSource
48 | * @param tagName
49 | * @param dataFormatStatus
50 | * @return
51 | */
52 | public static String getTagContent(String htmlSource, String tagName, DataFormatStatus dataFormatStatus) {
53 | if (htmlSource == null || htmlSource.isEmpty()) {
54 | return StaticValue.NULL;
55 | }
56 | SystemAssert.assertNotNull(dataFormatStatus);
57 |
58 | StringBuilder sb = new StringBuilder();
59 | Document doc = Jsoup.parse(htmlSource);
60 | Elements elements = doc.getElementsByTag(tagName);
61 |
62 | Iterator<Element> iterator = elements.iterator();
63 | Element element = null;
64 |
65 | if (dataFormatStatus == DataFormatStatus.CleanTxt) {
66 | while (iterator.hasNext()) {
67 | element = iterator.next();
68 | sb.append(getCleanTxt(element.toString()));
69 | sb.append(StaticValue.separator_next_line);
70 | }
71 | } else {
72 | while (iterator.hasNext()) {
73 | element = iterator.next();
74 | sb.append(element.toString());
75 | sb.append(StaticValue.separator_next_line);
76 | }
77 | }
78 |
79 | return sb.toString();
80 | }
81 |
82 | /**
83 | * Overload of getNestTagContent without a DataFormatStatus parameter
84 | *
85 | * @param htmlSource
86 | * @param tagList
87 | * @return
88 | */
89 | public static List<String> getNestTagContent(String htmlSource, List<String> tagList, boolean isFilter) {
90 | return getNestTagContent(htmlSource, tagList, DataFormatStatus.CleanTxt, isFilter);
91 | }
92 |
93 | /**
94 | * Walk a list of nested tags and return the innermost matches, as plain text or full markup
95 | *
96 | * @param htmlSource
97 | * @param tagList
98 | * @param dataFormatStatus
99 | * @return
100 | */
101 | public static List<String> getNestTagContent(String htmlSource, List<String> tagList, DataFormatStatus dataFormatStatus, boolean isFilter) {
102 | if (htmlSource == null || htmlSource.isEmpty() || tagList == null || tagList.isEmpty()) {
103 | return null;
104 | }
105 |
106 | SystemAssert.assertNotNull(dataFormatStatus);
107 |
108 | Document doc = Jsoup.parse(htmlSource); // pre-parse
109 | Iterator<String> tagIterator = tagList.iterator();
110 | String temp_tag = null;
111 |
112 | List<Element> temp_list_element = new LinkedList<>();
113 | Elements elements = null;
114 | Elements temp_elements = null;
115 | // holds the results of the previous round
116 | List<String> temp_list_line = new LinkedList<>();
117 | Document temp_doc = null; // holds the re-parsed doc
118 | boolean isFirst = true; // true until the first tag has been processed
119 | while (tagIterator.hasNext()) {
120 | temp_tag = tagIterator.next();
121 | if (isFirst) {
122 | elements = doc.getElementsByTag(temp_tag);
123 | isFirst = false;
124 | } else {
125 | // narrow the search to the previous round's results
126 | elements.clear();
127 | for (String line : temp_list_line) {
128 | if (line != null && (!line.isEmpty())) {
129 | temp_doc = Jsoup.parse(line);
130 | temp_elements = temp_doc.getElementsByTag(temp_tag);
131 | if (temp_elements != null && (!temp_elements.isEmpty())) {
132 | elements.addAll(temp_elements);
133 | }
134 | }
135 | }
136 | }
137 |
138 | temp_list_element.clear();
139 | temp_list_element.addAll(elements);
140 |
141 | Iterator<Element> elementIterator = temp_list_element.iterator();
142 |
143 | temp_list_line.clear();
144 | while (elementIterator.hasNext()) {
145 | Element element = elementIterator.next();
146 | temp_list_line.add(getTagContent(element.toString(), temp_tag, DataFormatStatus.TagAllContent));
147 | }
148 | }
149 |
150 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
151 |
152 | return temp_list_line;
153 | }
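// A small sketch of the nested-tag walk above: each tag in the list is searched only inside
// the previous tag's matches, so ("div", "a") returns the anchor inside the div but not the
// one inside the p. The HTML snippet is made up for illustration.
import java.util.Arrays;
import java.util.List;

public class NestTagDemo {
    public static void main(String[] args) {
        String html = "<div><a href='/x'>one</a></div><p><a href='/y'>two</a></p>";
        List<String> tags = Arrays.asList("div", "a");
        // TagAllContent keeps the anchor markup; CleanTxt would reduce it to "one"
        List<String> hits = JsoupHtmlParser.getNestTagContent(html, tags, DataFormatStatus.TagAllContent, false);
        System.out.println(hits); // only the <a href="/x">one</a> block
    }
}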
154 |
155 | /**
156 | * Default entry point for the selector-based lookup
157 | *
158 | * @param htmlSource
159 | * @param selectorList
160 | * @param isFilter
161 | * @return
162 | */
163 | public static List<String> getNodeContentBySelector(String htmlSource, List<String> selectorList, boolean isFilter) {
164 | return getNodeContentBySelector(htmlSource, selectorList, DataFormatStatus.CleanTxt, isFilter);
165 | }
166 |
167 | /**
168 | * Retrieve node data via CSS selectors, passed directly as a list; each selector is applied within the previous selector's matches
169 | *
170 | * @param htmlSource
171 | * @param selectorList
172 | * @return
173 | */
174 | public static List<String> getNodeContentBySelector(String htmlSource, List<String> selectorList, DataFormatStatus dataFormatStatus, boolean isFilter) {
175 | if (htmlSource == null || htmlSource.isEmpty() || selectorList == null || selectorList.isEmpty()) {
176 | return null;
177 | }
178 | SystemAssert.assertNotNull(dataFormatStatus);
179 |
180 | Document doc = Jsoup.parse(htmlSource); // pre-parse
181 |
182 | Iterator<String> selectorIterator = selectorList.iterator();
183 | String temp_selector = null;
184 | List<Element> temp_list_element = new LinkedList<>();
185 | Elements elements = null;
186 | Elements temp_elements = null;
187 | // holds the results of the previous round
188 | List<String> temp_list_line = new LinkedList<>();
189 | Document temp_doc = null; // holds the re-parsed doc
190 | boolean isFirst = true; // true until the first selector has been processed
191 | while (selectorIterator.hasNext()) {
192 | temp_selector = selectorIterator.next();
193 | if (isFirst) {
194 | elements = doc.select(temp_selector);
195 | isFirst = false;
196 | } else {
197 | // narrow the search to the previous round's results
198 | elements.clear();
199 | for (String line : temp_list_line) {
200 | if (line != null && (!line.isEmpty())) {
201 | temp_doc = Jsoup.parse(line);
202 | temp_elements = temp_doc.select(temp_selector);
203 | if (temp_elements != null && (!temp_elements.isEmpty())) {
204 | elements.addAll(temp_elements);
205 | }
206 | }
207 | }
208 | }
209 | temp_list_element.clear();
210 | temp_list_element.addAll(elements);
211 |
212 | Iterator<Element> elementIterator = temp_list_element.iterator();
213 |
214 | temp_list_line.clear();
215 | while (elementIterator.hasNext()) {
216 | Element element = elementIterator.next();
217 | temp_list_line.add(element.toString());
218 | }
219 | }
220 | // final format filtering
221 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
222 |
223 | return temp_list_line;
224 | }
225 |
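// A usage sketch for the selector chain above: the first selector narrows the document to the
// list items, the second picks the title spans inside them. HTML and selectors are illustrative.
import java.util.Arrays;
import java.util.List;

public class SelectorChainDemo {
    public static void main(String[] args) {
        String html = "<ul class='goods'><li><span class='title'>A</span></li>"
                + "<li><span class='title'>B</span></li></ul>";
        List<String> selectors = Arrays.asList("ul.goods li", "span.title");
        List<String> titles = JsoupHtmlParser.getNodeContentBySelector(html, selectors, true);
        System.out.println(titles); // [A, B] once CleanTxt strips the markup
    }
}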
226 | /**
227 | * Second version: when several identical content blocks match a selector, pick only the block at the given index; block positions are 1-based
228 | *
229 | * @param htmlSource
230 | * @param selectorList
231 | * @param dataFormatStatus
232 | * @param isFilter
233 | * @return
234 | */
235 | public static List<String> getNodeContentBySelector4MultiSameBlock(String htmlSource, List<String> selectorList, DataFormatStatus dataFormatStatus, boolean isFilter) {
236 | if (htmlSource == null || htmlSource.isEmpty() || selectorList == null || selectorList.isEmpty()) {
237 | return null;
238 | }
239 | SystemAssert.assertNotNull(dataFormatStatus);
240 |
241 | Document doc = Jsoup.parse(htmlSource); // pre-parse
242 |
243 | Iterator<String> selectorIterator = selectorList.iterator();
244 | String temp_selector = null;
245 | List<Element> temp_list_element = new LinkedList<>();
246 | Elements elements = null;
247 | Elements temp_elements = null;
248 | // holds the results of the previous round
249 | List<String> temp_list_line = new LinkedList<>();
250 | Document temp_doc = null; // holds the re-parsed doc
251 | boolean isFirst = true; // true until the first selector has been processed
252 | String[] split_array = null;
253 | boolean find_block_index = false; // whether a block index was supplied
254 | int block_index = 0; // the parsed block index, if present
255 | while (selectorIterator.hasNext()) {
256 | temp_selector = selectorIterator.next();
257 | // check whether the selector carries a #index# suffix
258 | split_array = temp_selector.split(StaticValue.split_block_index);
259 | temp_selector = split_array[0];
260 | if (split_array.length == 1) {
261 | find_block_index = false;
262 | } else if (split_array.length == 2) {
263 | try {
264 | block_index = Integer.parseInt(split_array[1]);
265 | if (block_index <= 0) {
266 | throw new Exception("block index value <= 0 is wrong; note that the base value is 1");
267 | }
268 | find_block_index = true;
269 | } catch (Exception e) {
270 | find_block_index = false;
271 | e.printStackTrace();
272 | }
273 | } else {
274 | try {
275 | throw new Exception("Malformed jsoup rule: at most one " + StaticValue.split_block_index + " is allowed per rule line, and it must sit at the end;\nto write several rules, separate them with #split_big#");
276 | } catch (Exception e) {
277 | e.printStackTrace();
278 | }
279 | }
280 |
281 | if (isFirst) {
282 | elements = doc.select(temp_selector);
283 | isFirst = false;
284 | } else {
285 | // narrow the search to the previous round's results
286 | elements.clear();
287 | for (String line : temp_list_line) {
288 | if (line != null && (!line.isEmpty())) {
289 | temp_doc = Jsoup.parse(line);
290 | temp_elements = temp_doc.select(temp_selector);
291 | if (temp_elements != null && (!temp_elements.isEmpty())) {
292 | elements.addAll(temp_elements);
293 | }
294 | }
295 | }
296 | }
297 |
298 | // keep only the block at block_index, if one was requested
299 | if (find_block_index && StringOperatorUtil.isNotBlankCollection(elements)) {
300 | int size = elements.size();
301 | if (size >= block_index) {
302 | List<Element> eleList = new LinkedList<>();
303 | eleList.add(elements.get(block_index - 1));
304 | elements = new Elements(eleList);
305 | }
306 | }
307 |
308 | temp_list_element.clear();
309 | temp_list_element.addAll(elements);
310 |
311 | Iterator<Element> elementIterator = temp_list_element.iterator();
312 |
313 | temp_list_line.clear();
314 | while (elementIterator.hasNext()) {
315 | Element element = elementIterator.next();
316 | temp_list_line.add(element.toString());
317 | }
318 | }
319 | // final format filtering
320 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
321 |
322 | return temp_list_line;
323 | }
324 |
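// A sketch of the block-index rule syntax handled above. The literal separator is
// StaticValue.split_block_index, whose value is defined elsewhere in this package; "#index#"
// is assumed here purely for illustration, matching the method's comments.
import java.util.Arrays;
import java.util.List;

public class BlockIndexDemo {
    public static void main(String[] args) {
        // the page repeats an identical block; keep only the second match (indexes are 1-based)
        String html = "<div class='item'>first</div><div class='item'>second</div>";
        List<String> rules = Arrays.asList("div.item#index#2");
        List<String> picked = JsoupHtmlParser.getNodeContentBySelector4MultiSameBlock(
                html, rules, DataFormatStatus.CleanTxt, false);
        System.out.println(picked); // [second]
    }
}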
325 | /**
326 | * Return the value of attribute attr inside the string source
327 | *
328 | * @param source
329 | * @param attr
330 | * @return
331 | */
332 | public static String getAttributeValue(String source, String attr) {
333 | if (source == null || attr == null) {
334 | return null;
335 | }
336 | Document doc = Jsoup.parse(source);
337 | Elements elements = doc.select("[" + attr + "]");
338 | String temp = null;
339 | if (elements != null) {
340 | for (Element ele : elements) {
341 | temp = ele.attr(attr);
342 | }
343 | }
344 | return temp;
345 | }
346 |
347 | /**
348 | * List variant: for each string in the collection, collect the value of attribute attr
349 | *
350 | * @param sourceList
351 | * @param attr
352 | * @return
353 | */
354 | public static List<String> getAttributeValueList(List<String> sourceList, String attr) {
355 | if (sourceList == null || attr == null) {
356 | return null;
357 | }
358 | List<String> resultList = new LinkedList<>();
359 | String selString = "[" + attr + "]";
360 | for (String tempLine : sourceList) {
361 | Document doc = Jsoup.parse(tempLine);
362 | Elements elements = doc.select(selString);
363 | if (elements != null) {
364 | for (Element ele : elements) {
365 | resultList.add(ele.attr(attr));
366 | }
367 | }
368 | }
369 | return resultList;
370 | }
371 |
372 | /**
373 | * Final string filtering; this step is transparent to the caller
374 | *
375 | * @param temp_list_line
376 | * @param dataFormatStatus
377 | * @return
378 | */
379 | private static List<String> doListFilter(List<String> temp_list_line, DataFormatStatus dataFormatStatus, boolean isFilter) {
380 | if (temp_list_line == null || temp_list_line.isEmpty()) {
381 | return null;
382 | }
383 |
384 | SystemAssert.assertNotNull(dataFormatStatus);
385 |
386 | // reduce the final result set to the requested data format
387 | if (dataFormatStatus == DataFormatStatus.CleanTxt) {
388 | List<String> cleanResultList = new LinkedList<>();
389 | String temp_clean = null;
390 | for (String item : temp_list_line) {
391 | if (isFilter) {
392 | item = item.replaceAll(StaticValue.htmlTagRegex, "");
393 | }
394 | if ((temp_clean = getCleanTxt(item)) != null && (!temp_clean.isEmpty())) {
395 | cleanResultList.add(temp_clean);
396 | }
397 | }
398 | return cleanResultList;
399 | }
400 | return temp_list_line;
401 | }
402 |
403 | /**
404 | * URL extraction, without any filter conditions
405 | * @param htmlSource
406 | * @return
407 | */
408 | public static List<String> getAllHref(String htmlSource) {
409 | try {
410 | Document doc = Jsoup.parse(htmlSource); // pre-parse
411 | Elements links = doc.getElementsByTag("a");
412 | String linkHref = null;
413 | List<String> urlList = new LinkedList<>();
414 | for (Element link : links) {
415 | linkHref = link.attr("href");
416 | if (UrlOperatorUtil.isValidUrl(linkHref) && linkHref.startsWith("http:")) {
417 | urlList.add(linkHref.trim());
418 | }
419 | }
420 | return urlList;
421 | } catch (Exception e) {
422 | e.printStackTrace();
423 | }
424 | return null;
425 | }
426 |
427 | /**
428 | * URL extraction, prefixing relative links with the given host
429 | *
430 | * @param fromUrl
431 | * @param host
432 | * @param htmlSource
433 | * @return
434 | */
435 | public static List<String> getAllHref4AddHost(String fromUrl, String host, String htmlSource) {
436 | try {
437 | Document doc = Jsoup.parse(htmlSource); // pre-parse
438 | Elements links = doc.getElementsByTag("a");
439 | String linkHref = null;
440 | List<String> urlList = new LinkedList<>();
441 | for (Element link : links) {
442 | linkHref = link.attr("href");
443 | if (linkHref.startsWith("http://")) {
444 | // already absolute: leave as-is
445 | } else if (linkHref.startsWith("/")) {
446 | // root-relative: prefix with the host
447 | linkHref = "http://" + host + linkHref;
448 | } else {
449 | // relative: resolve against the directory of fromUrl
450 | int last_pos = fromUrl.lastIndexOf("/");
451 | String relative_path = fromUrl.substring(0, last_pos + 1);
452 | linkHref = relative_path + linkHref;
453 | }
454 | if (UrlOperatorUtil.isValidUrl(linkHref) && linkHref.startsWith("http:")) {
455 | urlList.add(linkHref.trim());
456 | }
457 | }
458 | return urlList;
459 | } catch (Exception e) {
460 | e.printStackTrace();
461 | }
462 | return null;
463 | }
464 |
465 | /**
466 | * Remove selected parts of a tag's content; a tentative first version
467 | *
468 | * @param htmlSource
469 | * @param selector
470 | * @param removeSelector
471 | * @return
472 | */
473 | public static String removeInnerContent(String htmlSource, String selector, List<String> removeSelector) {
474 | if (selector == null || StringOperatorUtil.isBlankCollection(removeSelector)) {
475 | return htmlSource;
476 | }
477 | try {
478 | Document doc = Jsoup.parse(htmlSource); // pre-parse
479 | Elements elements = doc.select(selector);
480 | String result = null;
481 | if (elements != null) {
482 | for (Element ele : elements) {
483 | for (String sel : removeSelector) {
484 | ele.select(sel).remove();
485 | }
486 | result = JsoupHtmlParser.getCleanTxt(ele.toString());
487 | break;
488 | }
489 | }
490 | return result;
491 | } catch (Exception e) {
492 | e.printStackTrace();
493 | }
494 | return null;
495 | }
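// A sketch of the three-way link resolution in getAllHref4AddHost above: absolute links are
// kept, root-relative links are prefixed with the host, and relative links are resolved against
// fromUrl's directory. UrlOperatorUtil.isValidUrl is assumed to accept these URLs; the page
// content and host are made up.
import java.util.List;

public class HrefResolveDemo {
    public static void main(String[] args) {
        String page = "<a href='http://other.com/a'>1</a>"
                + "<a href='/b'>2</a>"
                + "<a href='c.html'>3</a>";
        List<String> urls = JsoupHtmlParser.getAllHref4AddHost(
                "http://example.com/news/index.html", "example.com", page);
        // expected: http://other.com/a, http://example.com/b, http://example.com/news/c.html
        System.out.println(urls);
    }
}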
496 |
497 | /**
498 | * Re-crawl data for Baidu Baike entries: consecutive sibling nodes whose class is "para" are merged into one text block
499 | *
500 | * @param htmlSource
501 | * @param selectorList
502 | * @return
503 | */
504 | public static List<String> getContentBySelector(String htmlSource, List<String> selectorList, DataFormatStatus dataFormatStatus, boolean isFilter) {
505 | if (htmlSource == null || htmlSource.isEmpty() || selectorList == null || selectorList.isEmpty()) {
506 | return null;
507 | }
508 | SystemAssert.assertNotNull(dataFormatStatus);
509 |
510 | Document doc = Jsoup.parse(htmlSource); // pre-parse
511 |
512 | Iterator<String> selectorIterator = selectorList.iterator();
513 | String temp_selector = null;
514 | List<Element> temp_list_element = new LinkedList<>();
515 | Elements elements = null;
516 | Elements temp_elements = null;
517 | // holds the results of the previous round
518 | List<String> temp_list_line = new LinkedList<>();
519 | Document temp_doc = null; // holds the re-parsed doc
520 | boolean isFirst = true; // true until the first selector has been processed
521 | while (selectorIterator.hasNext()) {
522 | temp_selector = selectorIterator.next();
523 | if (isFirst) {
524 | elements = doc.select(temp_selector);
525 | isFirst = false;
526 | } else {
527 | // narrow the search to the previous round's results
528 | elements.clear();
529 | for (String line : temp_list_line) {
530 | if (line != null && (!line.isEmpty())) {
531 | temp_doc = Jsoup.parse(line);
532 | temp_elements = temp_doc.select(temp_selector);
533 | if (temp_elements != null && (!temp_elements.isEmpty())) {
534 | elements.addAll(temp_elements);
535 | }
536 | }
537 | }
538 | }
539 | temp_list_element.clear();
540 | temp_list_element.addAll(elements);
541 |
542 | Iterator<Element> elementIterator = temp_list_element.iterator();
543 | temp_list_line.clear();
544 | while (elementIterator.hasNext()) {
545 | Element element = elementIterator.next();
546 | String aClass = element.attr("class");
547 | String paraText = null;
548 | if ("para".equals(aClass)) {
549 | paraText = element.text();
550 | // append the text of every directly following "para" sibling; the null check avoids an NPE at the last sibling
551 | Element element2 = element.nextElementSibling();
552 | while (element2 != null && "para".equals(element2.attr("class"))) {
553 | paraText += element2.text();
554 | element2 = element2.nextElementSibling();
555 | }
556 | } else {
557 | paraText = element.toString();
558 | }
559 | temp_list_line.add(paraText);
560 | }
561 | }
562 | // final format filtering
563 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
564 |
565 | return temp_list_line;
566 | }
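// A sketch of the "para" merge above, using made-up Baike-style markup. Note that the trailing
// sibling is still visited by the outer loop, so it also appears on its own in the result.
import java.util.Arrays;
import java.util.List;

public class ParaMergeDemo {
    public static void main(String[] args) {
        String html = "<div class='para'>First.</div><div class='para'>Second.</div>";
        List<String> merged = JsoupHtmlParser.getContentBySelector(
                html, Arrays.asList("div.para"), DataFormatStatus.CleanTxt, false);
        System.out.println(merged); // [First.Second., Second.]
    }
}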
567 |
568 | /**
569 | * Data crawling for the 中国军医网 site: a "PreCaption" node absorbs the text of the following "ColumnValue" (or unclassed) nodes
570 | *
571 | * @param htmlSource
572 | * @param selectorList
573 | * @return
574 | */
575 | public static List<String> getContentByChinese(String htmlSource, List<String> selectorList, DataFormatStatus dataFormatStatus, boolean isFilter) {
576 | if (htmlSource == null || htmlSource.isEmpty() || selectorList == null || selectorList.isEmpty()) {
577 | return null;
578 | }
579 | SystemAssert.assertNotNull(dataFormatStatus);
580 |
581 | Document doc = Jsoup.parse(htmlSource); // pre-parse
582 |
583 | Iterator<String> selectorIterator = selectorList.iterator();
584 | String temp_selector = null;
585 | List<Element> temp_list_element = new LinkedList<>();
586 | Elements elements = null;
587 | Elements temp_elements = null;
588 | // holds the results of the previous round
589 | List<String> temp_list_line = new LinkedList<>();
590 | Document temp_doc = null; // holds the re-parsed doc
591 | boolean isFirst = true; // true until the first selector has been processed
592 | while (selectorIterator.hasNext()) {
593 | temp_selector = selectorIterator.next();
594 | if (isFirst) {
595 | elements = doc.select(temp_selector);
596 | isFirst = false;
597 | } else {
598 | // narrow the search to the previous round's results
599 | elements.clear();
600 | for (String line : temp_list_line) {
601 | if (line != null && (!line.isEmpty())) {
602 | temp_doc = Jsoup.parse(line);
603 | temp_elements = temp_doc.select(temp_selector);
604 | if (temp_elements != null && (!temp_elements.isEmpty())) {
605 | elements.addAll(temp_elements);
606 | }
607 | }
608 | }
609 | }
610 | temp_list_element.clear();
611 | temp_list_element.addAll(elements);
612 |
613 | Iterator<Element> elementIterator = temp_list_element.iterator();
614 |
615 | temp_list_line.clear();
616 | while (elementIterator.hasNext()) {
617 | Element element = elementIterator.next();
618 | String paraText = element.toString();
619 | String aClass = element.attr("class");
620 | // guard hasNext so the lookahead cannot run past the last element
621 | while ("PreCaption".equals(aClass) && elementIterator.hasNext()) {
622 | element = elementIterator.next();
623 | aClass = element.attr("class");
624 | if ("ColumnValue".equals(aClass) || "".equals(aClass)) {
625 | paraText += element.text();
626 | }
627 | }
628 | temp_list_line.add(paraText);
629 | }
630 | }
631 | // final format filtering
632 | temp_list_line = doListFilter(temp_list_line, dataFormatStatus, isFilter);
633 |
634 | return temp_list_line;
635 | }
636 | }
--------------------------------------------------------------------------------