├── .gitignore ├── LICENSE ├── README.md ├── pom.xml └── src └── main ├── java └── com │ └── martin │ └── product │ ├── SpiderApplication.java │ ├── advice │ └── ExceptionAdvice.java │ ├── constants │ └── WebConstants.java │ ├── controller │ └── IndexController.java │ ├── listener │ └── InitListener.java │ ├── response │ └── BaseResponse.java │ ├── spider │ └── TaoBaoSpider.java │ ├── tuple │ └── Tuple2.java │ └── util │ ├── FileUtil.java │ ├── HtmlUtil.java │ ├── HttpUtil.java │ ├── LogUtil.java │ └── UserAgents.java └── resources ├── application.yml ├── logback.xml ├── static └── index.html └── user_agent └── user_agent.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ project files 2 | .idea 3 | 4 | # Compiled class files 5 | target/ 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 戴小明 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TBSpider 2 | 基于Jsoup实现的爬虫demo项目,支持对淘宝商品页面进行抓取分析 3 | 4 | ## 框架和技术 5 | 6 | - JDK-1.8 7 | - SpringBoot-2.4.5 8 | - Jsoup-1.13.1 9 | - poi-5.0.0 10 | - bootstrap-3.3.5 bootstrap-fileinput-4.3.1 11 | 12 | ## 功能介绍 13 | 启动SpiderApplication后,页面访问地址为[http://localhost:8888](),选择需要分析的淘宝链接excel进行上传,后台收到文件会解析Excel并逐行对链接分析商品当前是否是出售中状态,分析完成后支持下载出售中的商品链接excel。 14 | 15 | 因为本项目是单机版,所以文件都存储在临时文件夹,进度也是保存在内存中,重启以后会丢失所有数据,需要分布式或者持久化的话可以自行改造。 16 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | com.martin.product 7 | tbspider 8 | 1.0 9 | 10 | org.springframework.boot 11 | spring-boot-starter-parent 12 | 2.4.5 13 | 14 | 15 | 16 | 17 | UTF-8 18 | UTF-8 19 | 1.8 20 | 21 | 1.4 22 | 2.7 23 | 3.7 24 | 1.2.76 25 | 30.1.1-jre 26 | 2.10.0 27 | 1.13.1 28 | 5.0.0 29 | 1.4.01 30 | 31 | 32 | 33 | 34 | 35 | commons-io 36 | commons-io 37 | ${commons.io.version} 38 | 39 | 40 | 41 | org.apache.commons 42 | commons-lang3 43 | ${commons.lang3.version} 44 | 45 | 46 | 47 | com.google.guava 48 | guava 49 | ${guava.version} 50 | 51 | 52 | 53 | com.fasterxml.jackson.core 54 | jackson-core 55 | ${jackson.version} 56 | 57 | 58 | com.fasterxml.jackson.core 59 | jackson-databind 60 | ${jackson.version} 61 | 62 | 63 | 64 | 65 | commons-fileupload 66 | commons-fileupload 67 | ${commons.fileupload.version} 68 | 69 | 70 | 71 | 72 | com.alibaba 73 | fastjson 74 | ${fastjson.version} 75 | 76 | 77 | 78 | 79 | org.jsoup 80 | 
jsoup 81 | ${jsoup.version} 82 | 83 | 84 | 85 | 86 | org.apache.poi 87 | poi 88 | ${poi.version} 89 | 90 | 91 | org.apache.poi 92 | poi-ooxml 93 | ${poi.version} 94 | 95 | 96 | xml-apis 97 | xml-apis 98 | ${xmlapis.version} 99 | 100 | 101 | 102 | 103 | 104 | 105 | org.springframework.boot 106 | spring-boot-starter-web 107 | 108 | 109 | 110 | org.springframework.boot 111 | spring-boot-starter-logging 112 | 113 | 114 | 115 | javax.servlet 116 | javax.servlet-api 117 | 118 | 119 | 120 | commons-io 121 | commons-io 122 | 123 | 124 | 125 | org.apache.commons 126 | commons-lang3 127 | 128 | 129 | 130 | com.google.guava 131 | guava 132 | 133 | 134 | 135 | commons-fileupload 136 | commons-fileupload 137 | 138 | 139 | 140 | com.alibaba 141 | fastjson 142 | 143 | 144 | 145 | org.jsoup 146 | jsoup 147 | 148 | 149 | 150 | org.apache.poi 151 | poi 152 | 153 | 154 | org.apache.poi 155 | poi-ooxml 156 | 157 | 158 | 159 | 160 | 161 | sonatype-nexus-snapshots 162 | https://oss.sonatype.org/content/repositories/snapshots 163 | 164 | true 165 | 166 | 167 | true 168 | 169 | 170 | 171 | 172 | 173 | tbspider 174 | 175 | 176 | org.apache.maven.plugins 177 | maven-compiler-plugin 178 | 3.5.1 179 | 180 | ${java.version} 181 | ${java.version} 182 | 183 | 184 | 185 | 186 | org.springframework.boot 187 | spring-boot-maven-plugin 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/SpiderApplication.java: -------------------------------------------------------------------------------- 1 | package com.martin.product; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | import org.springframework.boot.web.servlet.ServletComponentScan; 6 | 7 | @SpringBootApplication(scanBasePackages = "com.martin.product") 8 | @ServletComponentScan(basePackages = "com.martin.product.listener") 9 | public class SpiderApplication { 10 | 11 | public 
static void main(String[] args) { 12 | SpringApplication.run(SpiderApplication.class, args); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/advice/ExceptionAdvice.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.advice; 2 | 3 | import com.martin.product.response.BaseResponse; 4 | import com.martin.product.util.LogUtil; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.web.HttpRequestMethodNotSupportedException; 8 | import org.springframework.web.bind.annotation.ControllerAdvice; 9 | import org.springframework.web.bind.annotation.ExceptionHandler; 10 | import org.springframework.web.bind.annotation.ResponseBody; 11 | 12 | import javax.servlet.http.HttpServletRequest; 13 | 14 | /** 15 | * 统一处理异常 16 | */ 17 | @ControllerAdvice 18 | public class ExceptionAdvice { 19 | 20 | private static final Logger logger = LoggerFactory.getLogger(ExceptionAdvice.class); 21 | 22 | @ExceptionHandler(value = Exception.class) 23 | @ResponseBody 24 | public BaseResponse handleException(HttpServletRequest request, Exception e) { 25 | BaseResponse response = new BaseResponse<>(); 26 | if (e instanceof IllegalArgumentException) { 27 | BaseResponse.fail(e.getMessage()); 28 | } else if (e instanceof HttpRequestMethodNotSupportedException) { 29 | BaseResponse.fail("不支持的请求方式"); 30 | } else { 31 | logger.error(LogUtil.buildLog("请求出现异常", request.getRequestURI(), request.getParameterMap()), e); 32 | BaseResponse.fail("服务器未知异常"); 33 | } 34 | 35 | return response; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/constants/WebConstants.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.constants; 2 | 3 | public class WebConstants { 4 | 5 | public 
static String ROOT_PATH = null; 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/controller/IndexController.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.controller; 2 | 3 | import com.google.common.collect.Maps; 4 | import com.martin.product.response.BaseResponse; 5 | import com.martin.product.spider.TaoBaoSpider; 6 | import com.martin.product.tuple.Tuple2; 7 | import com.martin.product.util.FileUtil; 8 | import org.apache.commons.lang3.StringUtils; 9 | import org.apache.poi.hssf.usermodel.HSSFWorkbook; 10 | import org.apache.poi.ss.usermodel.*; 11 | import org.apache.poi.xssf.usermodel.XSSFWorkbook; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | import org.springframework.util.Assert; 15 | import org.springframework.web.bind.annotation.*; 16 | import org.springframework.web.multipart.MultipartFile; 17 | 18 | import javax.servlet.http.HttpServletResponse; 19 | import java.io.*; 20 | import java.util.Map; 21 | 22 | @RestController 23 | public class IndexController { 24 | 25 | private static final Logger logger = LoggerFactory.getLogger(IndexController.class); 26 | 27 | private static final Map> PROCESS_MAP = Maps.newHashMap(); 28 | 29 | /** 30 | * 上传excel文件 31 | */ 32 | @PostMapping(value = "/upload") 33 | public BaseResponse> upload(@RequestParam MultipartFile file) { 34 | Assert.notNull(file, "请上传要分析的文件"); 35 | String fileName = file.getOriginalFilename(); 36 | String fileSuffix = fileName.substring(fileName.lastIndexOf(".")); 37 | if (!".xls".equals(fileSuffix) && !".xlsx".equals(fileSuffix)) { 38 | throw new IllegalArgumentException("不支持的文件格式"); 39 | } 40 | 41 | final String randomFileName = String.valueOf(System.currentTimeMillis()); 42 | FileUtil.saveTempFile(file, randomFileName + fileSuffix); 43 | PROCESS_MAP.put(fileName, new Tuple2<>(0, 0)); 44 | new Thread(() -> 
analyzeExcel(randomFileName, fileSuffix)).start(); 45 | Map result = Maps.newHashMap(); 46 | result.put("key", randomFileName); 47 | return BaseResponse.success(result); 48 | } 49 | 50 | /** 51 | * 获取进度 52 | */ 53 | @GetMapping(value = "/getProgress/{fileKey}") 54 | public BaseResponse> getProgress(@PathVariable("fileKey") String fileKey) { 55 | Map result = Maps.newHashMap(); 56 | if (PROCESS_MAP.containsKey(fileKey)) { 57 | Tuple2 progressTuple = PROCESS_MAP.get(fileKey); 58 | result.put("status", "processing"); 59 | result.put("percent", progressTuple.getP1() * 100 / progressTuple.getP2()); 60 | } else { 61 | result.put("status", "OK"); 62 | } 63 | return BaseResponse.success(result); 64 | } 65 | 66 | /** 67 | * 下载分析结果 68 | */ 69 | @GetMapping(value = "/download/{fileKey}") 70 | public void download(@PathVariable("fileKey") String fileKey, HttpServletResponse response) throws IOException { 71 | String path = FileUtil.getResultPath(); 72 | File file = new File(path + File.separator + fileKey + ".xls"); 73 | Assert.isTrue(file.exists(), "文件不存在"); 74 | 75 | try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file)); 76 | BufferedOutputStream bos = new BufferedOutputStream(response.getOutputStream())) { 77 | response.setCharacterEncoding("UTF-8"); 78 | response.setContentType("application/octet-stream"); 79 | response.setHeader("Content-Disposition", "attachment;filename=" + fileKey + ".xls"); 80 | byte[] b = new byte[1024]; 81 | int length; 82 | while ((length = bis.read(b)) > 0) { 83 | bos.write(b, 0, length); 84 | } 85 | bos.flush(); 86 | } 87 | } 88 | 89 | private static void analyzeExcel(String fileName, String suffix) { 90 | File file = new File(FileUtil.getTmpPath() + File.separator + fileName + suffix); 91 | InputStream is; 92 | Workbook readWB; 93 | Workbook writeWB; 94 | try { 95 | is = new FileInputStream(file); 96 | switch (suffix) { 97 | case ".xls": 98 | readWB = new HSSFWorkbook(is); 99 | break; 100 | case ".xlsx": 101 | readWB 
= new XSSFWorkbook(is); 102 | break; 103 | default: 104 | return; 105 | } 106 | Sheet sheet = readWB.getSheetAt(0); 107 | int total = sheet.getPhysicalNumberOfRows(); 108 | writeWB = new HSSFWorkbook(); 109 | writeWB.createCellStyle(); 110 | CellStyle style = writeWB.createCellStyle(); // 样式对象 111 | Sheet writeSheet = writeWB.createSheet(); 112 | int writeRowNum = 0; 113 | Row readRow; 114 | Row writeRow; 115 | Cell cell; 116 | String tbUrl; 117 | boolean isOnSale; 118 | for (int i = 0; i < total; i++) { 119 | readRow = sheet.getRow(i); 120 | cell = readRow.getCell(0); 121 | tbUrl = cell.getStringCellValue(); 122 | updateProgress(fileName, i + 1, total); 123 | if (StringUtils.isBlank(tbUrl)) { 124 | break; 125 | } 126 | isOnSale = TaoBaoSpider.checkItemIsOnSale(tbUrl); 127 | if (isOnSale) { 128 | writeRow = writeSheet.createRow(writeRowNum); 129 | createAndFillCell(writeSheet, writeRow, style, 0, tbUrl); 130 | writeRowNum++; 131 | } 132 | try { 133 | Thread.sleep(500); 134 | } catch (InterruptedException ignored) { 135 | } 136 | } 137 | } catch (Exception e) { 138 | logger.error("分析Excel异常", e); 139 | return; 140 | } finally { 141 | PROCESS_MAP.remove(fileName); 142 | } 143 | 144 | // 写入文件 145 | FileOutputStream fileOut = null; 146 | try { 147 | File folder = new File(FileUtil.getResultPath()); 148 | if (!folder.exists()) { 149 | folder.mkdirs(); 150 | } 151 | File resultFile = new File(folder, fileName + ".xls"); 152 | resultFile.deleteOnExit(); 153 | resultFile.createNewFile(); 154 | fileOut = new FileOutputStream(resultFile); 155 | writeWB.write(fileOut); 156 | fileOut.flush(); 157 | } catch (Exception ignored) { 158 | } finally { 159 | if (fileOut != null) { 160 | try { 161 | fileOut.close(); 162 | } catch (IOException ignored) { 163 | } 164 | } 165 | } 166 | } 167 | 168 | /** 169 | * 更新进度 170 | */ 171 | private static void updateProgress(String key, int current, int total) { 172 | Tuple2 processTuple; 173 | if (PROCESS_MAP.containsKey(key)) { 174 | 
processTuple = PROCESS_MAP.get(key); 175 | processTuple.setP1(current); 176 | processTuple.setP2(total); 177 | } else { 178 | processTuple = new Tuple2<>(current, total); 179 | PROCESS_MAP.put(key, processTuple); 180 | } 181 | } 182 | 183 | private static void createAndFillCell(Sheet sheet, Row row, CellStyle style, int colIdx, String value) { 184 | Cell cell = row.createCell(colIdx); 185 | cell.setCellStyle(style); 186 | cell.setCellValue(value); 187 | sheet.autoSizeColumn(colIdx, true); 188 | if (value != null) { 189 | int width = value.getBytes().length * 256; 190 | sheet.setColumnWidth(colIdx, width); 191 | } 192 | } 193 | 194 | } 195 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/listener/InitListener.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.listener; 2 | 3 | 4 | import com.martin.product.constants.WebConstants; 5 | 6 | import javax.servlet.ServletContext; 7 | import javax.servlet.ServletContextEvent; 8 | import javax.servlet.ServletContextListener; 9 | import javax.servlet.annotation.WebListener; 10 | import java.io.File; 11 | 12 | /** 13 | * 初始化监听 14 | */ 15 | @WebListener 16 | public class InitListener implements ServletContextListener { 17 | 18 | @Override 19 | public void contextInitialized(ServletContextEvent sce) { 20 | ServletContext servletContext = sce.getServletContext(); 21 | WebConstants.ROOT_PATH = servletContext.getRealPath(File.separator); 22 | } 23 | 24 | @Override 25 | public void contextDestroyed(ServletContextEvent sce) { 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/response/BaseResponse.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.response; 2 | 3 | 4 | /** 5 | * 基础响应结果类 6 | */ 7 | public class BaseResponse { 8 | 9 | /** 10 | * 错误信息 11 | 
*/ 12 | private String message; 13 | /** 14 | * 是否响应成功 15 | */ 16 | private boolean success; 17 | /** 18 | * 响应数据 19 | */ 20 | private T data; 21 | 22 | public BaseResponse() { 23 | } 24 | 25 | public BaseResponse(T data) { 26 | success = true; 27 | this.data = data; 28 | } 29 | 30 | public static BaseResponse success() { 31 | BaseResponse response = new BaseResponse<>(); 32 | response.success = true; 33 | return response; 34 | } 35 | 36 | public static BaseResponse success(T t) { 37 | return new BaseResponse<>(t); 38 | } 39 | 40 | public static BaseResponse fail(String message) { 41 | BaseResponse response = new BaseResponse<>(); 42 | response.message = message; 43 | return response; 44 | } 45 | 46 | public String getMessage() { 47 | return message; 48 | } 49 | 50 | public void setMessage(String message) { 51 | this.message = message; 52 | } 53 | 54 | public boolean isSuccess() { 55 | return success; 56 | } 57 | 58 | public void setSuccess(boolean success) { 59 | this.success = success; 60 | } 61 | 62 | public T getData() { 63 | return data; 64 | } 65 | 66 | public void setData(T data) { 67 | this.data = data; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/spider/TaoBaoSpider.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.spider; 2 | 3 | import com.martin.product.util.HttpUtil; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | 7 | /** 8 | * 淘宝爬虫 9 | * Created by Martin on 2016/3/12. 
10 | */ 11 | public class TaoBaoSpider { 12 | 13 | /** 14 | * 检查宝贝是否在售 15 | */ 16 | public static boolean checkItemIsOnSale(String url) { 17 | Document document = HttpUtil.buildHtmlDocument(url); 18 | Element element = document.getElementById("J_LinkBuy"); 19 | return element!= null; 20 | } 21 | 22 | public static void main(String[] args) { 23 | String url = "https://detail.tmall.com/item.htm?spm=a230r.1.14.13.514e2499vNWsK9&id=561872607703&cm_id=140105335569ed55e27b&abbucket=7"; 24 | System.out.println(checkItemIsOnSale(url)); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/tuple/Tuple2.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.tuple; 2 | 3 | /** 4 | * 包含两个元素的元组 5 | */ 6 | public class Tuple2 { 7 | 8 | private P1 p1; 9 | private P2 p2; 10 | 11 | public Tuple2(P1 p1, P2 p2) { 12 | this.p1 = p1; 13 | this.p2 = p2; 14 | } 15 | 16 | public P1 getP1() { 17 | return p1; 18 | } 19 | 20 | public void setP1(P1 p1) { 21 | this.p1 = p1; 22 | } 23 | 24 | public P2 getP2() { 25 | return p2; 26 | } 27 | 28 | public void setP2(P2 p2) { 29 | this.p2 = p2; 30 | } 31 | 32 | @Override 33 | public String toString() { 34 | return "Tuple2{" + 35 | "p1=" + p1 + 36 | ", p2=" + p2 + 37 | '}'; 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/util/FileUtil.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.util; 2 | 3 | import com.martin.product.constants.WebConstants; 4 | import org.springframework.web.multipart.MultipartFile; 5 | 6 | import java.io.File; 7 | import java.io.IOException; 8 | 9 | /** 10 | * 文件工具类 11 | */ 12 | public class FileUtil { 13 | 14 | /** 15 | * 获取临时文件夹目录 16 | */ 17 | public static String getTmpPath() { 18 | return WebConstants.ROOT_PATH + "tmp"; 19 
/**
 * String helpers for scraped HTML/JavaScript text and for URLs.
 */
public class HtmlUtil {

    /** Matches a leading URL scheme such as {@code http://} or {@code https://}. */
    private static final Pattern SCHEME_PREFIX = Pattern.compile("^[A-Za-z][A-Za-z0-9+.-]*://");

    /** Characters accepted in a returned host name. */
    private static final Pattern HOST_NAME = Pattern.compile("[A-Za-z0-9._-]+");

    /**
     * Extracts the value assigned in the first {@code var <varName> = ...;} statement found in
     * {@code s}.
     *
     * <p>The value is everything between {@code =} and the last {@code ;} on the same line,
     * trimmed. Whitespace around {@code =} is optional, and the variable name is matched
     * literally, so names containing regex metacharacters such as {@code $} work.
     *
     * @param s       text to search; may be null
     * @param varName JavaScript variable name; may be null
     * @return the trimmed value, or an empty string if there is no match or an argument is
     *         null/empty
     */
    public static String getJsVarValue(String s, String varName) {
        if (s == null || varName == null || varName.isEmpty()) {
            return "";
        }
        Pattern pattern = Pattern.compile("var\\s+" + Pattern.quote(varName) + "\\s*=(.*);");
        Matcher matcher = pattern.matcher(s);
        return matcher.find() ? matcher.group(1).trim() : "";
    }

    /**
     * Extracts the host name from a URL.
     *
     * <p>The scheme, user info, port, path, query and fragment are dropped. URLs without a
     * scheme (e.g. {@code www.taobao.com/item}) are accepted. Unlike a substring search for
     * {@code "www"} / {@code ".com"}, this works for any top-level domain and is not confused
     * by those strings appearing in the path or query.
     *
     * @param url URL to inspect; may be null
     * @return the host, or an empty string if none can be determined
     */
    public static String getServerHost(String url) {
        if (url == null) {
            return "";
        }
        String rest = url.trim();
        Matcher scheme = SCHEME_PREFIX.matcher(rest);
        if (scheme.lookingAt()) {
            rest = rest.substring(scheme.end());
        } else if (rest.startsWith("//")) {
            rest = rest.substring(2);
        }

        // The authority ends at the first path, query or fragment delimiter.
        int end = 0;
        while (end < rest.length() && "/?#".indexOf(rest.charAt(end)) < 0) {
            end++;
        }
        String authority = rest.substring(0, end);
        // Drop "user:password@" if present (lastIndexOf returns -1 -> substring(0)).
        authority = authority.substring(authority.lastIndexOf('@') + 1);

        if (authority.startsWith("[")) {
            // Bracketed IPv6 literal: keep the brackets, drop any port after them.
            int close = authority.indexOf(']');
            return close < 0 ? "" : authority.substring(0, close + 1);
        }
        int colon = authority.indexOf(':');
        String host = colon < 0 ? authority : authority.substring(0, colon);
        return HOST_NAME.matcher(host).matches() ? host : "";
    }

}
-------------------------------------------------------------------------------- 1 | package com.martin.product.util; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | import org.jsoup.Connection; 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * http访问工具类 12 | */ 13 | public class HttpUtil { 14 | 15 | /** 16 | * 根据url构建html页面 17 | */ 18 | public static Document buildHtmlDocument(String url) { 19 | return buildHtmlDocument(false, url); 20 | } 21 | 22 | /** 23 | * 根据url构建html页面 24 | */ 25 | public static Document buildHtmlDocument(boolean isUseProxy, String url) { 26 | return buildHtmlDocument(isUseProxy, url, null); 27 | } 28 | 29 | /** 30 | * 根据url和cookie构建html页面 31 | */ 32 | public static Document buildHtmlDocument(String url, String cookie) { 33 | String UA = ""; 34 | try { 35 | UA = UserAgents.getRandomUserAgent(); 36 | } catch (Exception ignored) { 37 | } 38 | return buildHtmlDocument(false, url, cookie, UA); 39 | } 40 | 41 | /** 42 | * 根据url和cookie构建html页面 43 | */ 44 | public static Document buildHtmlDocument(boolean isUseProxy, String url, String cookie) { 45 | String UA = ""; 46 | try { 47 | UA = UserAgents.getRandomUserAgent(); 48 | } catch (Exception ignored) { 49 | } 50 | return buildHtmlDocument(isUseProxy, url, cookie, UA); 51 | } 52 | 53 | /** 54 | * 根据url、cookie和UA构建html页面 55 | */ 56 | public static Document buildHtmlDocument(String url, String cookie, String UA) { 57 | return buildHtmlDocument(false, url, cookie, UA, null); 58 | } 59 | 60 | /** 61 | * 根据url、cookie和UA构建html页面 62 | */ 63 | public static Document buildHtmlDocument(boolean isUseProxy, String url, String cookie, String UA) { 64 | return buildHtmlDocument(isUseProxy, url, cookie, UA, null); 65 | } 66 | 67 | /** 68 | * 根据url、cookie、UA和referer构建html页面 69 | */ 70 | public static Document buildHtmlDocument(String url, String cookie, String UA, String referer) { 71 | return buildHtmlDocument(false, url, cookie, UA, 
referer, null); 72 | } 73 | 74 | /** 75 | * 根据url、cookie、UA和referer构建html页面 76 | */ 77 | public static Document buildHtmlDocument(boolean isUseProxy, String url, String cookie, String UA, String referer) { 78 | return buildHtmlDocument(isUseProxy, url, cookie, UA, referer, null); 79 | } 80 | 81 | /** 82 | * 根据url、cookie、UA、referer和host构建html页面 83 | */ 84 | public static Document buildHtmlDocument(boolean isUseProxy, String url, String cookie, String UA, String referer, String host) { 85 | Connection connection = createGetDocumentConnection(isUseProxy, url, cookie, UA, referer, host); 86 | Connection.Response response = executeConnection(connection); 87 | if (response != null) { 88 | try { 89 | return response.parse(); 90 | } catch (IOException ignored) { 91 | } 92 | } 93 | return null; 94 | } 95 | 96 | /** 97 | * 构建一个同步抓取连接 98 | */ 99 | public static Connection getConnection(String url) { 100 | return getConnection(false, url, null); 101 | } 102 | 103 | /** 104 | * 构建一个同步抓取连接 105 | */ 106 | public static Connection getConnection(boolean isUseProxy, String url) { 107 | return getConnection(isUseProxy, url, null); 108 | } 109 | 110 | /** 111 | * 构建一个同步抓取连接 112 | */ 113 | public static Connection getConnection(String url, String cookie) { 114 | String UA = ""; 115 | try { 116 | UA = UserAgents.getRandomUserAgent(); 117 | } catch (Exception ignored) { 118 | } 119 | return getConnection(url, cookie, UA, null, null); 120 | } 121 | 122 | /** 123 | * 构建一个同步抓取连接 124 | */ 125 | public static Connection getConnection(boolean isUseProxy, String url, String cookie) { 126 | String UA = ""; 127 | try { 128 | UA = UserAgents.getRandomUserAgent(); 129 | } catch (Exception ignored) { 130 | } 131 | return getConnection(isUseProxy, url, cookie, UA, null, null); 132 | } 133 | 134 | /** 135 | * 构建一个同步抓取连接 136 | */ 137 | public static Connection getConnection(String url, String cookie, String UA, String referer, String host) { 138 | Connection connection = 
Jsoup.connect(url).timeout(60000).ignoreContentType(true).ignoreHttpErrors(true); 139 | connection.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 140 | connection.header("Accept-Encoding", "gzip,deflate,sdch"); 141 | connection.header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4"); 142 | connection.header("Cache-Control", "no-cache"); 143 | connection.header("Connection", "keep-alive"); 144 | if (!StringUtils.isEmpty(UA)) { 145 | connection.header("User-Agent", UA); 146 | } 147 | if (!StringUtils.isEmpty(referer)) { 148 | connection.header("Referer", referer); 149 | } 150 | if (!StringUtils.isEmpty(cookie)) { 151 | connection.header("Cookie", cookie); 152 | } 153 | 154 | if (StringUtils.isEmpty(host)) { 155 | host = HtmlUtil.getServerHost(url); 156 | } 157 | if (StringUtils.isNotEmpty(host)) { 158 | connection.header("Host", host); 159 | } 160 | 161 | return connection; 162 | } 163 | 164 | /** 165 | * 构建一个同步抓取连接 166 | */ 167 | public static Connection getConnection(boolean isUseProxy, String url, String cookie, String UA, String referer, String host) { 168 | return getConnection(url, cookie, UA, referer, host); 169 | } 170 | 171 | /** 172 | * 构建一个请求类型为GET,返回类型为document的抓取连接 173 | */ 174 | public static Connection createGetDocumentConnection(boolean isUseProxy, String url, String cookie) { 175 | Connection connection = getConnection(url, cookie); 176 | connection.method(Connection.Method.GET); 177 | return connection; 178 | } 179 | 180 | /** 181 | * 构建一个请求类型为GET,返回类型为document的抓取连接 182 | */ 183 | public static Connection createGetDocumentConnection(boolean isUseProxy, String url, String cookie, String UA, String referer, String host) { 184 | Connection connection = getConnection(url, cookie, UA, referer, host); 185 | connection.method(Connection.Method.GET); 186 | return connection; 187 | } 188 | 189 | /** 190 | * 构建一个请求类型为POST,返回类型为document的抓取连接 191 | */ 192 | public static Connection 
createPostDocumentConnection(boolean isUseProxy, String url, String cookie, String UA, String referer, String host) { 193 | Connection connection = getConnection(url, cookie, UA, referer, host); 194 | connection.method(Connection.Method.POST); 195 | return connection; 196 | } 197 | 198 | /** 199 | * 构建一个返回类型为String的抓取连接 200 | */ 201 | public static Connection createStringConnection(boolean isUseProxy, String url, String cookie, String UA, String referer, String host) { 202 | Connection connection = getConnection(url, cookie, UA, referer, host); 203 | connection.header("Accept", "*/*"); 204 | return connection.ignoreContentType(true); 205 | } 206 | 207 | /** 208 | * 构建一个请求类型为GET,返回类型为String的抓取连接 209 | */ 210 | public static Connection createGetStringConnection(String url, String cookie) { 211 | return createGetStringConnection(false, url, cookie); 212 | } 213 | 214 | /** 215 | * 构建一个请求类型为GET,返回类型为String的抓取连接 216 | */ 217 | public static Connection createGetStringConnection(boolean isUseProxy, String url, String cookie) { 218 | return createGetStringConnection(isUseProxy, url, cookie, null, null, null); 219 | } 220 | 221 | /** 222 | * 构建一个请求类型为GET,返回类型为String的抓取连接 223 | */ 224 | public static Connection createGetStringConnection(boolean isUseProxy, String url, String cookie, String UA, String referer, String host) { 225 | Connection connection = createStringConnection(isUseProxy, url, cookie, UA, referer, host); 226 | connection.method(Connection.Method.GET); 227 | return connection; 228 | } 229 | 230 | /** 231 | * 构建一个请求类型为POST,返回类型为String的抓取连接 232 | */ 233 | public static Connection createPostStringConnection(boolean isUseProxy, String url, String cookie, String UA, String referer, String host, 234 | String... 
postData) { 235 | Connection connection = createStringConnection(isUseProxy, url, cookie, UA, referer, host); 236 | connection.method(Connection.Method.POST); 237 | connection.data(postData); 238 | return connection; 239 | } 240 | 241 | public static Connection.Response executeConnection(Connection connection) { 242 | for (int i = 0; i < 3; i++) { 243 | try { 244 | return connection.execute(); 245 | } catch (Exception e) { 246 | try { 247 | Thread.sleep(1000); 248 | } catch (Exception ev) { 249 | Thread.interrupted(); 250 | } 251 | } 252 | } 253 | return null; 254 | } 255 | 256 | /** 257 | * 根据url构建byte数组返回数据,默认不使用代理 258 | */ 259 | public static byte[] buildGetBytes(String url) { 260 | return buildGetBytes(false, url); 261 | } 262 | 263 | /** 264 | * 根据url构建byte数组返回数据,根据参数选择使用代理 265 | */ 266 | public static byte[] buildGetBytes(boolean isUseProxy, String url) { 267 | return buildGetBytes(isUseProxy, url, null); 268 | } 269 | 270 | /** 271 | * 根据url和cookie构建byte数组返回数据,默认不使用代理 272 | */ 273 | public static byte[] buildGetBytes(String url, String cookie) { 274 | return buildGetBytes(false, url, cookie); 275 | } 276 | 277 | /** 278 | * 根据url和cookie构建byte数组返回数据,根据参数选择使用代理 279 | */ 280 | public static byte[] buildGetBytes(boolean isUseProxy, String url, String cookie) { 281 | try { 282 | Connection connection = createGetStringConnection(isUseProxy, url, cookie, UserAgents.getRandomUserAgent(), null, 283 | HtmlUtil.getServerHost(url)); 284 | Connection.Response response = executeConnection(connection); 285 | if (response != null) { 286 | return response.bodyAsBytes(); 287 | } 288 | } catch (Exception ignored) { 289 | } 290 | 291 | return null; 292 | } 293 | 294 | /** 295 | * 根据url构建String类型返回数据 296 | */ 297 | public static String buildGetString(String url) { 298 | return buildGetString(false, url); 299 | } 300 | 301 | /** 302 | * 根据url构建String类型返回数据 303 | */ 304 | public static String buildGetString(boolean isUseProxy, String url) { 305 | return buildGetString(isUseProxy, 
url, null); 306 | } 307 | 308 | /** 309 | * 根据url和cookie构建String类型返回数据 310 | */ 311 | public static String buildGetString(String url, String cookie) { 312 | return buildGetString(false, url, cookie); 313 | } 314 | 315 | /** 316 | * 根据url和cookie构建String类型返回数据 317 | */ 318 | public static String buildGetString(boolean isUseProxy, String url, String cookie) { 319 | String UA = ""; 320 | try { 321 | UA = UserAgents.getRandomUserAgent(); 322 | } catch (Exception ignored) { 323 | } 324 | return buildGetString(isUseProxy, url, cookie, UA); 325 | } 326 | 327 | /** 328 | * 根据url、feferer和cookie构建String类型返回数据 329 | */ 330 | public static String buildGetString(boolean isUseProxy, String url, String cookie, String UA) { 331 | return buildGetString(isUseProxy, url, cookie, UA, null); 332 | } 333 | 334 | /** 335 | * 根据url、feferer和cookie构建String类型返回数据 336 | */ 337 | public static String buildGetString(boolean isUseProxy, String url, String cookie, String UA, String referer) { 338 | try { 339 | Connection connection = createGetStringConnection(isUseProxy, url, cookie, UA, referer, HtmlUtil.getServerHost(url)); 340 | Connection.Response response = executeConnection(connection); 341 | if (response != null) { 342 | return response.body(); 343 | } 344 | } catch (Exception ignored) { 345 | } 346 | 347 | return null; 348 | } 349 | 350 | /** 351 | * 根据url、cookie和stringData构建String类型返回数据 352 | */ 353 | public static String buildPostString(String url, String cookie, String stringData) { 354 | return buildPostString(false, url, cookie, stringData); 355 | } 356 | 357 | // /** 358 | // * 根据url、cookie和stringData构建String类型返回数据 359 | // */ 360 | // public static String buildPostString(boolean isUseProxy, String url, String cookie, String stringData) { 361 | // try { 362 | // Connection connection = createPostStringConnection(isUseProxy, url, cookie, UserAgents.getRandomUserAgent(), null, 363 | // HtmlUtils.getServerHost(url)); 364 | // connection.header("Content-Type", 
"application/json;charset=UTF-8"); 365 | // connection.stringData(stringData); 366 | // Connection.Response response = executeConnection(connection); 367 | // if (response != null) { 368 | // return response.body(); 369 | // } 370 | // } catch (Exception ignored) { 371 | // } 372 | // 373 | // return null; 374 | // } 375 | 376 | /** 377 | * 根据url、cookie和postData构建String类型返回数据 378 | */ 379 | public static String buildPostString(String url, String cookie, String... postData) { 380 | return buildPostString(false, url, cookie, postData); 381 | } 382 | 383 | /** 384 | * 根据url、cookie和postData构建String类型返回数据 385 | */ 386 | public static String buildPostString(boolean isUseProxy, String url, String cookie, String... postData) { 387 | try { 388 | Connection connection = createPostStringConnection(isUseProxy, url, cookie, UserAgents.getRandomUserAgent(), null, 389 | HtmlUtil.getServerHost(url), postData); 390 | Connection.Response response = executeConnection(connection); 391 | if (response != null) { 392 | return response.body(); 393 | } 394 | } catch (Exception ignored) { 395 | } 396 | 397 | return null; 398 | } 399 | 400 | /** 401 | * 判断页面是否是重定向 402 | */ 403 | public static boolean isRedirect(Connection.Response response) { 404 | int code = response.statusCode(); 405 | return code == 301 || code == 302 || code == 303; 406 | } 407 | 408 | } 409 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/util/LogUtil.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.util; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | 5 | /** 6 | * 日志工具类 7 | */ 8 | public class LogUtil { 9 | 10 | /** 11 | * 构建日志字符串 12 | */ 13 | public static String buildLog(Object... 
objArray) { 14 | StringBuilder logBuilder = new StringBuilder(); 15 | 16 | for (Object obj : objArray) { 17 | if (logBuilder.length() > 0) { 18 | logBuilder.append(" | "); 19 | } 20 | 21 | if (obj instanceof String) { 22 | logBuilder.append(obj); 23 | } else { 24 | logBuilder.append(JSON.toJSONString(obj)); 25 | } 26 | } 27 | 28 | return logBuilder.toString(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/martin/product/util/UserAgents.java: -------------------------------------------------------------------------------- 1 | package com.martin.product.util; 2 | 3 | import org.springframework.core.io.ClassPathResource; 4 | 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.util.HashSet; 8 | import java.util.Random; 9 | import java.util.Scanner; 10 | import java.util.Set; 11 | 12 | public class UserAgents { 13 | 14 | private static final String FILE_PATH = "/user_agent/user_agent.txt"; 15 | 16 | private static final Object LOCK = new Object(); 17 | private static String[] userAgents = null; 18 | 19 | public static String[] getUserAgents() throws IOException { 20 | if (userAgents != null) { 21 | return userAgents; 22 | } 23 | 24 | synchronized (LOCK) { 25 | if (userAgents != null) { 26 | return userAgents; 27 | } 28 | 29 | Set userAgentSet = new HashSet<>(); 30 | ClassPathResource resource = new ClassPathResource(FILE_PATH); 31 | InputStream input = resource.getInputStream(); 32 | Scanner scanner = new Scanner(input); 33 | while (scanner.hasNext()) { 34 | String userAgent = scanner.nextLine(); 35 | userAgentSet.add(userAgent); 36 | } 37 | input.close(); 38 | 39 | userAgents = new String[userAgentSet.size()]; 40 | userAgentSet.toArray(userAgents); 41 | return userAgents; 42 | } 43 | } 44 | 45 | public static String getRandomUserAgent() throws Exception { 46 | String[] userAgentArray = getUserAgents(); 47 | 48 | Random random = new Random(); 49 | int index = 
random.nextInt() % userAgentArray.length; 50 | if (index < 0) { 51 | index = -index; 52 | } 53 | 54 | return userAgentArray[index]; 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/resources/application.yml: -------------------------------------------------------------------------------- 1 | spring: 2 | application: 3 | name: tbSpider 4 | server: 5 | port: 8888 -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | %date %-5level [%file:%line] - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/main/resources/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 淘宝网链接分析 8 | 26 | 27 | 28 |
29 |

淘宝网链接分析

30 | 31 | 37 | 38 |
39 | 40 | 41 | 42 | 43 | 44 | 97 | 98 | --------------------------------------------------------------------------------