├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
└── main
├── java
└── com
│ └── martin
│ └── product
│ ├── SpiderApplication.java
│ ├── advice
│ └── ExceptionAdvice.java
│ ├── constants
│ └── WebConstants.java
│ ├── controller
│ └── IndexController.java
│ ├── listener
│ └── InitListener.java
│ ├── response
│ └── BaseResponse.java
│ ├── spider
│ └── TaoBaoSpider.java
│ ├── tuple
│ └── Tuple2.java
│ └── util
│ ├── FileUtil.java
│ ├── HtmlUtil.java
│ ├── HttpUtil.java
│ ├── LogUtil.java
│ └── UserAgents.java
└── resources
├── application.yml
├── logback.xml
├── static
└── index.html
└── user_agent
└── user_agent.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # IntelliJ project files
2 | .idea
3 |
4 | # Compiled class files
5 | target/
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 戴小明
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TBSpider
2 | 基于Jsoup实现的爬虫demo项目,支持对淘宝商品页面进行抓取分析
3 |
4 | ## 框架和技术
5 |
6 | - JDK-1.8
7 | - SpringBoot-2.4.5
8 | - Jsoup-1.13.1
9 | - poi-5.0.0
10 | - bootstrap-3.3.5 bootstrap-fileinput-4.3.1
11 |
12 | ## 功能介绍
13 | 启动SpiderApplication后,页面访问地址为[http://localhost:8888](http://localhost:8888),选择需要分析的淘宝链接excel进行上传,后台收到文件会解析Excel并逐行对链接分析商品当前是否是出售中状态,分析完成后支持下载出售中的商品链接excel。
14 |
15 | 因为本项目是单机版,所以文件都存储在临时文件夹,进度也是保存在内存中,重启以后会丢失所有数据,需要分布式或者持久化的话可以自行改造。
16 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 | com.martin.product
7 | tbspider
8 | 1.0
9 |
10 | org.springframework.boot
11 | spring-boot-starter-parent
12 | 2.4.5
13 |
14 |
15 |
16 |
17 | UTF-8
18 | UTF-8
19 | 1.8
20 |
21 | 1.4
22 | 2.7
23 | 3.7
24 | 1.2.76
25 | 30.1.1-jre
26 | 2.10.0
27 | 1.13.1
28 | 5.0.0
29 | 1.4.01
30 |
31 |
32 |
33 |
34 |
35 | commons-io
36 | commons-io
37 | ${commons.io.version}
38 |
39 |
40 |
41 | org.apache.commons
42 | commons-lang3
43 | ${commons.lang3.version}
44 |
45 |
46 |
47 | com.google.guava
48 | guava
49 | ${guava.version}
50 |
51 |
52 |
53 | com.fasterxml.jackson.core
54 | jackson-core
55 | ${jackson.version}
56 |
57 |
58 | com.fasterxml.jackson.core
59 | jackson-databind
60 | ${jackson.version}
61 |
62 |
63 |
64 |
65 | commons-fileupload
66 | commons-fileupload
67 | ${commons.fileupload.version}
68 |
69 |
70 |
71 |
72 | com.alibaba
73 | fastjson
74 | ${fastjson.version}
75 |
76 |
77 |
78 |
79 | org.jsoup
80 | jsoup
81 | ${jsoup.version}
82 |
83 |
84 |
85 |
86 | org.apache.poi
87 | poi
88 | ${poi.version}
89 |
90 |
91 | org.apache.poi
92 | poi-ooxml
93 | ${poi.version}
94 |
95 |
96 | xml-apis
97 | xml-apis
98 | ${xmlapis.version}
99 |
100 |
101 |
102 |
103 |
104 |
105 | org.springframework.boot
106 | spring-boot-starter-web
107 |
108 |
109 |
110 | org.springframework.boot
111 | spring-boot-starter-logging
112 |
113 |
114 |
115 | javax.servlet
116 | javax.servlet-api
117 |
118 |
119 |
120 | commons-io
121 | commons-io
122 |
123 |
124 |
125 | org.apache.commons
126 | commons-lang3
127 |
128 |
129 |
130 | com.google.guava
131 | guava
132 |
133 |
134 |
135 | commons-fileupload
136 | commons-fileupload
137 |
138 |
139 |
140 | com.alibaba
141 | fastjson
142 |
143 |
144 |
145 | org.jsoup
146 | jsoup
147 |
148 |
149 |
150 | org.apache.poi
151 | poi
152 |
153 |
154 | org.apache.poi
155 | poi-ooxml
156 |
157 |
158 |
159 |
160 |
161 | sonatype-nexus-snapshots
162 | https://oss.sonatype.org/content/repositories/snapshots
163 |
164 | true
165 |
166 |
167 | true
168 |
169 |
170 |
171 |
172 |
173 | tbspider
174 |
175 |
176 | org.apache.maven.plugins
177 | maven-compiler-plugin
178 | 3.5.1
179 |
180 | ${java.version}
181 | ${java.version}
182 |
183 |
184 |
185 |
186 | org.springframework.boot
187 | spring-boot-maven-plugin
188 |
189 |
190 |
191 |
--------------------------------------------------------------------------------
/src/main/java/com/martin/product/SpiderApplication.java:
--------------------------------------------------------------------------------
1 | package com.martin.product;
2 |
3 | import org.springframework.boot.SpringApplication;
4 | import org.springframework.boot.autoconfigure.SpringBootApplication;
5 | import org.springframework.boot.web.servlet.ServletComponentScan;
6 | /** Spring Boot entry point: scans com.martin.product for beans and com.martin.product.listener for servlet components. */
7 | @SpringBootApplication(scanBasePackages = "com.martin.product")
8 | @ServletComponentScan(basePackages = "com.martin.product.listener")
9 | public class SpiderApplication {
10 |     /** Boots the application, forwarding the command-line arguments to Spring Boot. */
11 |     public static void main(String[] args) {
12 |         new SpringApplication(SpiderApplication.class).run(args);
13 |     }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/com/martin/product/advice/ExceptionAdvice.java:
--------------------------------------------------------------------------------
1 | package com.martin.product.advice;
2 |
3 | import com.martin.product.response.BaseResponse;
4 | import com.martin.product.util.LogUtil;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 | import org.springframework.web.HttpRequestMethodNotSupportedException;
8 | import org.springframework.web.bind.annotation.ControllerAdvice;
9 | import org.springframework.web.bind.annotation.ExceptionHandler;
10 | import org.springframework.web.bind.annotation.ResponseBody;
11 |
12 | import javax.servlet.http.HttpServletRequest;
13 |
14 | /**
15 |  * Global exception handler: turns any exception that escapes a controller into a failure BaseResponse body.
16 |  */
17 | @ControllerAdvice
18 | public class ExceptionAdvice {
19 | 
20 |     private static final Logger logger = LoggerFactory.getLogger(ExceptionAdvice.class);
21 | 
22 |     /**
23 |      * Maps the exception to a failure response; unexpected exceptions are also logged with the request URI and parameters.
24 |      * Returns the object produced by BaseResponse.fail(...) (assumes fail returns the failure response - confirm in BaseResponse).
25 |      */
26 |     @ExceptionHandler(value = Exception.class)
27 |     @ResponseBody
28 |     public BaseResponse handleException(HttpServletRequest request, Exception e) {
29 |         if (e instanceof IllegalArgumentException) {
30 |             return BaseResponse.fail(e.getMessage());
31 |         }
32 |         if (e instanceof HttpRequestMethodNotSupportedException) {
33 |             return BaseResponse.fail("不支持的请求方式");
34 |         }
35 |         logger.error(LogUtil.buildLog("请求出现异常", request.getRequestURI(), request.getParameterMap()), e);
36 |         return BaseResponse.fail("服务器未知异常");
37 |     }
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/com/martin/product/constants/WebConstants.java:
--------------------------------------------------------------------------------
1 | package com.martin.product.constants;
2 | /** Holder for mutable, application-wide web settings. */
3 | public class WebConstants {
4 |     /** Root path; null until assigned (presumably set once at startup - confirm where it is written). */
5 |     public static String ROOT_PATH;
6 | }
7 |
--------------------------------------------------------------------------------
/src/main/java/com/martin/product/controller/IndexController.java:
--------------------------------------------------------------------------------
1 | package com.martin.product.controller;
2 |
3 | import com.google.common.collect.Maps;
4 | import com.martin.product.response.BaseResponse;
5 | import com.martin.product.spider.TaoBaoSpider;
6 | import com.martin.product.tuple.Tuple2;
7 | import com.martin.product.util.FileUtil;
8 | import org.apache.commons.lang3.StringUtils;
9 | import org.apache.poi.hssf.usermodel.HSSFWorkbook;
10 | import org.apache.poi.ss.usermodel.*;
11 | import org.apache.poi.xssf.usermodel.XSSFWorkbook;
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 | import org.springframework.util.Assert;
15 | import org.springframework.web.bind.annotation.*;
16 | import org.springframework.web.multipart.MultipartFile;
17 |
18 | import javax.servlet.http.HttpServletResponse;
19 | import java.io.*;
20 | import java.util.Map;
21 |
22 | @RestController
23 | public class IndexController {
24 |
25 | private static final Logger logger = LoggerFactory.getLogger(IndexController.class);
26 |
27 | private static final Map> PROCESS_MAP = Maps.newHashMap();
28 |
29 | /**
30 | * 上传excel文件
31 | */
32 | @PostMapping(value = "/upload")
33 | public BaseResponse