├── .classpath
├── .gitignore
├── .project
├── .settings
├── org.eclipse.core.resources.prefs
├── org.eclipse.jdt.core.prefs
└── org.eclipse.m2e.core.prefs
├── README.md
├── pom.xml
└── src
├── main
├── java
│ └── com
│ │ └── jdbee
│ │ ├── crawler
│ │ ├── GoodsList.java
│ │ ├── JDGoodsList.java
│ │ ├── JdCategory.java
│ │ └── RetailersCrawler.java
│ │ ├── dao
│ │ ├── BaseDao.java
│ │ └── GoodsDao.java
│ │ ├── main
│ │ ├── Main.java
│ │ └── NewMain.java
│ │ ├── model
│ │ ├── Category.java
│ │ ├── FiveCategory.java
│ │ ├── FourCategory.java
│ │ ├── Goods.java
│ │ ├── SecondCategory.java
│ │ └── ThreeCategory.java
│ │ └── utils
│ │ ├── Constants.java
│ │ ├── HttpUtil.java
│ │ ├── JsoupUtil.java
│ │ ├── PageUtils.java
│ │ ├── PropertiesUtils.java
│ │ └── ThreadUtil.java
└── resources
│ ├── categoryData.json
│ ├── chromedriver.exe
│ ├── config.properties
│ ├── log4j.properties
│ ├── parser.js
│ ├── phantomjs.exe
│ └── springJdbcContext.xml
└── test
└── java
└── com
└── handx
└── jd
└── JdBee
├── AppTest.java
├── TestGoodsDetail.java
├── TestWebcollector.java
└── dao
└── TestGoodsDao.java
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | JdBee
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 | org.eclipse.m2e.core.maven2Builder
15 |
16 |
17 |
18 |
19 |
20 | org.eclipse.jdt.core.javanature
21 | org.eclipse.m2e.core.maven2Nature
22 |
23 |
24 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/main/java=UTF-8
3 | encoding//src/test/java=UTF-8
4 | encoding/=UTF-8
5 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
3 | org.eclipse.jdt.core.compiler.compliance=1.5
4 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
5 | org.eclipse.jdt.core.compiler.source=1.5
6 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # JdBee
2 | ## 使用jsoup抓取京东数据
3 |
4 | > **只用于学习交流,私自用于其他途径,后果自负!!!**
5 |
6 | > 目前只抓取零食相关的数据(现阶段只需要这部分数据),其他类目后续再议!
7 |
8 | > 抓取零食相关的目的就是为了这个[vipsnacks](https://github.com/handexing/vipsnacks)项目的后续开发。
9 |
10 |
11 |
12 | ## 项目需要
13 |
14 | - httpclient
15 | - jsoup
16 | - slf4j
17 | - selenium
18 | - phantomjs
19 | - WebCollector
20 |
21 | ## 更新日志
22 |
23 | - 初始化项目,完成一,二级类目的抓取 (*2017-05-24*)
24 | - 采用selenium获取页面数据,获取三,四,五级类目(*2017-05-25*)
25 | - 多线程并发爬取类目分页数据(*2017-05-26*)
26 | - 多线程爬取商品skuid(*2017-05-28*)
27 |
28 | **selenium 的爬取速度太慢,而且每次都要打开一个网页;抓取少量数据还可以用,数据量大时实在撑不住,近期在寻找别的爬取方法。**
29 |
30 | - 使用WebCollector+selenium+phantomjs爬取商品(*2017-06-01只爬取一个类目测试*)
31 | - 数据入库测试(*2017-06-02*)
32 | - 测试爬取一个小类目,爬取20万数据用时21分钟(*2017-06-03*)
33 | - 数据正常入库,爬取数据**285330**条(*2017-06-04*)
34 | - 优化获取商品的代码:获取一页商品的耗时从19664毫秒降到7000毫秒左右(*2017-06-07*)
35 |
36 |
37 | > 觉得不错的朋友可以点下star,watch,fork也算是对我的鼓励了。
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.handx.jd
6 | JdBee
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | JdBee
11 | http://maven.apache.org
12 |
13 |
14 | 1.7.2
15 | 3.2.4.RELEASE
16 | UTF-8
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 | org.springframework
25 | spring-core
26 | ${spring.version}
27 |
28 |
29 |
30 | org.springframework
31 | spring-web
32 | ${spring.version}
33 |
34 |
35 |
36 | org.springframework
37 | spring-oxm
38 | ${spring.version}
39 |
40 |
41 |
42 | org.springframework
43 | spring-tx
44 | ${spring.version}
45 |
46 |
47 |
48 | org.springframework
49 | spring-jdbc
50 | ${spring.version}
51 |
52 |
53 |
54 | org.springframework
55 | spring-webmvc
56 | ${spring.version}
57 |
58 |
59 |
60 | org.springframework
61 | spring-aop
62 | ${spring.version}
63 |
64 |
65 |
66 | org.springframework
67 | spring-context-support
68 | ${spring.version}
69 |
70 |
71 |
72 | org.springframework
73 | spring-aop
74 | ${spring.version}
75 |
76 |
77 |
78 | org.springframework
79 | spring-test
80 | ${spring.version}
81 |
82 |
83 |
84 |
85 |
86 | junit
87 | junit
88 | 4.10
89 | test
90 |
91 |
92 |
93 |
94 | org.jsoup
95 | jsoup
96 | 1.10.2
97 |
98 |
99 |
100 |
101 | org.apache.httpcomponents
102 | httpclient
103 | 4.5.2
104 |
105 |
106 |
107 |
108 | org.slf4j
109 | slf4j-log4j12
110 | ${slf4j.version}
111 |
112 |
113 |
114 |
115 | org.seleniumhq.selenium
116 | selenium-java
117 | 2.44.0
118 |
119 |
120 |
121 |
122 | com.github.detro
123 | phantomjsdriver
124 | 1.2.0
125 |
126 |
127 |
128 |
129 | com.alibaba
130 | fastjson
131 | 1.2.4
132 |
133 |
134 |
135 |
136 | cn.edu.hfut.dmic.webcollector
137 | WebCollector
138 | 2.09
139 |
140 |
141 |
142 |
143 | mysql
144 | mysql-connector-java
145 | 5.1.35
146 |
147 |
148 |
149 |
150 | com.alibaba
151 | druid
152 | 1.0.2
153 |
154 |
155 |
156 |
157 |
158 |
--------------------------------------------------------------------------------
/src/main/java/com/jdbee/crawler/GoodsList.java:
--------------------------------------------------------------------------------
1 | package com.jdbee.crawler;
2 |
3 | import com.jdbee.model.Goods;
4 |
5 | import java.util.ArrayList;
6 |
7 | import cn.edu.hfut.dmic.webcollector.model.Page;
8 |
9 | /**
10 |  * Base list of {@link Goods} scraped from one retail platform. Kept abstract so that
11 |  * crawlers for other platforms can supply their own subclass.
12 |  *
13 |  * @author handx 908716835@qq.com
14 |  */
15 | public abstract class GoodsList extends ArrayList<Goods> {
16 |
17 |     private static final long serialVersionUID = -7894645047969514212L;
18 |
19 |     /** Collects the goods found on {@code page}; storage is left to the subclass. */
20 |     public abstract void addGoods(Page page);
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/java/com/jdbee/crawler/JDGoodsList.java:
--------------------------------------------------------------------------------
1 | package com.jdbee.crawler;
2 |
3 | import com.jdbee.dao.GoodsDao;
4 | import com.jdbee.model.Goods;
5 | import com.jdbee.utils.Constants;
6 | import com.jdbee.utils.PageUtils;
7 |
8 | import org.apache.commons.lang3.StringUtils;
9 | import org.apache.log4j.Logger;
10 | import org.openqa.selenium.By;
11 | import org.openqa.selenium.WebDriver;
12 | import org.openqa.selenium.WebElement;
13 | import org.springframework.context.ApplicationContext;
14 | import org.springframework.context.support.ClassPathXmlApplicationContext;
15 |
16 | import java.util.List;
17 |
18 | import cn.edu.hfut.dmic.webcollector.model.Page;
19 |
20 | /**
21 |  * {@link GoodsList} implementation that scrapes goods from a JD listing page.
22 |  *
23 |  * <p>Scraped items are written to the database through {@link GoodsDao} as they are read;
24 |  * they are not added to this list.
25 |  *
26 |  * @author handx 908716835@qq.com
27 |  */
28 | public class JDGoodsList extends GoodsList {
29 |
30 |     private static final long serialVersionUID = -6016161025701938903L;
31 |
32 |     /** Stored in place of a price or comment count that is blank or the literal "null". */
33 |     private static final String MISSING_VALUE = "-1";
34 |
35 |     private static GoodsDao goodsDao;
36 |
37 |     static {
38 |         // Resolve the DAO once, from the Spring context file on the classpath.
39 |         ApplicationContext context = new ClassPathXmlApplicationContext("springJdbcContext.xml");
40 |         goodsDao = (GoodsDao) context.getBean("goodsDao");
41 |     }
42 |
43 |     public final Logger log = Logger.getLogger(JDGoodsList.class);
44 |
45 |     /**
46 |      * Reads every {@code li.gl-item} element of {@code page} and inserts one row per element.
47 |      *
48 |      * <p>Failures are logged together with their cause and never propagate to the caller; the
49 |      * driver obtained from {@code PageUtils.getWebDriver} is always quit.
50 |      *
51 |      * @param page the listing page to scrape
52 |      */
53 |     @Override
54 |     public void addGoods(Page page) {
55 |         WebDriver driver = null;
56 |         try {
57 |             driver = PageUtils.getWebDriver(page);
58 |             System.out.println("&&&&&&&&&&&&&&&&&&&&&&&爬取地址:" + page.getUrl());
59 |             List<WebElement> eles = driver.findElements(By.cssSelector("li.gl-item"));
60 |             if (eles.isEmpty()) {
61 |                 log.info("无商品列表!");
62 |                 return;
63 |             }
64 |             for (WebElement ele : eles) {
65 |                 Goods g = toGoods(ele);
66 |                 System.out.println(g.toString());
67 |                 goodsDao.createGoods(g);
68 |             }
69 |         } catch (Exception e) {
70 |             // Keep the crawl alive, but record the cause instead of silently dropping it.
71 |             log.warn("爬取异常!!!", e);
72 |         } finally {
73 |             if (driver != null) {
74 |                 driver.quit();
75 |             }
76 |         }
77 |     }
78 |
79 |     /** Builds a {@link Goods} from the price, name, link and comment count of one list item. */
80 |     private static Goods toGoods(WebElement ele) {
81 |         Goods g = new Goods();
82 |         g.setPlatform(Constants.JD);
83 |
84 |         String priceStr = ele.findElement(By.className("p-price")).findElement(By.className("J_price"))
85 |                 .findElement(By.tagName("i")).getText();
86 |         g.setPrice(orMissing(priceStr));
87 |
88 |         // Name and link live under the same node, so look it up once.
89 |         WebElement nameBox = ele.findElement(By.className("p-name"));
90 |         g.setName(nameBox.findElement(By.tagName("em")).getText());
91 |         g.setUrl(nameBox.findElement(By.tagName("a")).getAttribute("href"));
92 |
93 |         String commitCnt = ele.findElement(By.className("p-commit")).findElement(By.tagName("a")).getText();
94 |         g.setCommitCnt(orMissing(commitCnt));
95 |         return g;
96 |     }
97 |
98 |     /** Returns {@code text}, or {@link #MISSING_VALUE} when it is blank or the literal "null". */
99 |     private static String orMissing(String text) {
100 |         return (StringUtils.isBlank(text) || "null".equals(text)) ? MISSING_VALUE : text;
101 |     }
102 | }
103 |
--------------------------------------------------------------------------------
/src/main/java/com/jdbee/crawler/JdCategory.java:
--------------------------------------------------------------------------------
1 | package com.jdbee.crawler;
2 |
3 | import com.jdbee.model.Category;
4 | import com.jdbee.model.SecondCategory;
5 | import com.jdbee.model.ThreeCategory;
6 | import com.jdbee.utils.Constants;
7 | import com.jdbee.utils.HttpUtil;
8 | import com.jdbee.utils.JsoupUtil;
9 |
10 | import org.apache.log4j.Logger;
11 | import org.jsoup.Jsoup;
12 | import org.jsoup.nodes.Document;
13 | import org.jsoup.nodes.Element;
14 | import org.jsoup.select.Elements;
15 |
16 | import java.util.ArrayList;
17 | import java.util.List;
18 |
19 | /**
20 |  * Builds the JD category tree (first, second and third level) from the category page.
21 |  *
22 |  * @author handx 908716835@qq.com
23 |  */
24 | public class JdCategory {
25 |
26 |     public static final Logger log = Logger.getLogger(JdCategory.class);
27 |
28 |     /**
29 |      * Downloads {@code Constants.JD_URL} and fills in the second- and third-level categories of
30 |      * the first-level categories returned by {@code JsoupUtil.getFirstCategory}.
31 |      *
32 |      * <p>Each {@code .item-title span} element is matched against the first-level names by
33 |      * substring; on a match, the {@code dt a} links of the element's third ancestor become the
34 |      * second-level categories and its {@code dd a} links the third-level ones.
35 |      *
36 |      * @return the first-level categories, with second-level lists set on those that matched
37 |      */
38 |     public static List<Category> getCategory() {
39 |         String content = HttpUtil.sendGet(Constants.JD_URL);
40 |         List<Category> list = JsoupUtil.getFirstCategory(content);
41 |         Document document = Jsoup.parse(content);
42 |
43 |         for (Element element : document.select(".item-title span")) {
44 |             String text = element.text();
45 |             if ("电脑办公".equals(text)) {
46 |                 // Presumably the first-level name is spelled with the separator - confirm.
47 |                 text = "电脑、办公";
48 |             }
49 |             for (Category category : list) {
50 |                 if (category.getName().contains(text)) {
51 |                     Element categoryItem = element.parent().parent().parent();
52 |                     category.setSenondCates(readSecondCategories(categoryItem));
53 |                 }
54 |             }
55 |         }
56 |         return list;
57 |     }
58 |
59 |     /** Turns the {@code dt a} links of {@code categoryItem} into second-level categories. */
60 |     private static List<SecondCategory> readSecondCategories(Element categoryItem) {
61 |         Elements secondLinks = categoryItem.select("dt a");
62 |         Elements thirdLinks = categoryItem.select("dd a");
63 |         List<SecondCategory> secondList = new ArrayList<SecondCategory>();
64 |         for (Element secondLink : secondLinks) {
65 |             SecondCategory cate = new SecondCategory();
66 |             cate.setName(secondLink.text());
67 |             cate.setUrl("https:" + secondLink.attr("href"));
68 |             // NOTE(review): every second-level category receives ALL "dd a" links of the item,
69 |             // not only the ones under its own "dt" - confirm this is intended.
70 |             cate.setThreeCates(readThreeCategories(thirdLinks));
71 |             secondList.add(cate);
72 |         }
73 |         return secondList;
74 |     }
75 |
76 |     /** Turns {@code thirdLinks} into a fresh list of third-level categories. */
77 |     private static List<ThreeCategory> readThreeCategories(Elements thirdLinks) {
78 |         List<ThreeCategory> threeList = new ArrayList<ThreeCategory>();
79 |         for (Element thirdLink : thirdLinks) {
80 |             ThreeCategory threeCategory = new ThreeCategory();
81 |             // NOTE(review): "http:" here vs "https:" for second-level URLs - kept as found.
82 |             threeCategory.setUrl("http:" + thirdLink.attr("href"));
83 |             threeCategory.setName(thirdLink.text());
84 |             threeList.add(threeCategory);
85 |         }
86 |         return threeList;
87 |     }
88 | }
89 |
--------------------------------------------------------------------------------
/src/main/java/com/jdbee/crawler/RetailersCrawler.java:
--------------------------------------------------------------------------------
1 | package com.jdbee.crawler;
2 |
3 | import com.jdbee.main.Main;
4 |
5 | import org.apache.log4j.Logger;
6 | import org.jsoup.nodes.Document;
7 |
8 | import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
9 | import cn.edu.hfut.dmic.webcollector.model.Links;
10 | import cn.edu.hfut.dmic.webcollector.model.Page;
11 | import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
12 | import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
13 | import cn.edu.hfut.dmic.webcollector.util.RegexRule;
14 |
15 | /**
16 |  * Base crawler for paged retailer listings: seeds one URL per result page and hands every
17 |  * fetched page to the subclass.
18 |  *
19 |  * @author handx 908716835@qq.com
20 |  */
21 | public abstract class RetailersCrawler extends DeepCrawler {
22 |
23 |     /** Named after the concrete crawler class, so log lines are attributed to the right crawler. */
24 |     public final Logger log = Logger.getLogger(getClass());
25 |
26 |     /** Format string for seed URLs; the page number is supplied as its format argument. */
27 |     private String seedFormat;
28 |
29 |     /** URL patterns applied when collecting follow-up links from a visited page. */
30 |     protected RegexRule regexRule;
31 |
32 |     /**
33 |      * @param crawlPath  passed unchanged to the {@code DeepCrawler} constructor
34 |      * @param seedFormat format string used to build the seed URL of each result page
35 |      */
36 |     public RetailersCrawler(String crawlPath, String seedFormat) {
37 |         super(crawlPath);
38 |         this.seedFormat = seedFormat;
39 |         this.regexRule = new RegexRule();
40 |     }
41 |
42 |     /**
43 |      * Adds a URL pattern to {@link #regexRule}.
44 |      *
45 |      * @param urlRegex the pattern to add
46 |      */
47 |     public void addRegex(String urlRegex) {
48 |         this.regexRule.addRule(urlRegex);
49 |     }
50 |
51 |     /**
52 |      * Seeds one URL per result page. Page 1 is fetched first so that
53 |      * {@link #getTotalPage(Page)} can report how many pages there are.
54 |      *
55 |      * @throws Exception if the first page cannot be fetched
56 |      */
57 |     private void addSeed() throws Exception {
58 |         Page firstPage = getPage(getSeed(seedFormat, 1));
59 |         int totalPage = getTotalPage(firstPage);
60 |         for (int page = 1; page <= totalPage; page++) {
61 |             this.addSeed(getSeed(seedFormat, page));
62 |         }
63 |     }
64 |
65 |     /**
66 |      * Fetches {@code url} and wraps the response in a {@link Page}.
67 |      *
68 |      * @param url the address to request
69 |      * @return a page carrying the URL, the decoded HTML and the raw response
70 |      * @throws Exception if the request fails
71 |      */
72 |     private Page getPage(String url) throws Exception {
73 |         HttpRequest httpRequest = new HttpRequest(url);
74 |         HttpResponse response = httpRequest.getResponse();
75 |         Page page = new Page();
76 |         page.setUrl(url);
77 |         page.setHtml(response.getHtmlByCharsetDetect());
78 |         page.setResponse(response);
79 |         return page;
80 |     }
81 |
82 |     /**
83 |      * Builds a seed URL by applying {@code page} to {@code seedFormat} with
84 |      * {@link String#format(String, Object...)}.
85 |      *
86 |      * @param seedFormat the format string
87 |      * @param page       the format arguments (the page number in this class)
88 |      * @return the formatted URL
89 |      */
90 |     public String getSeed(String seedFormat, Object... page) {
91 |         return String.format(seedFormat, page);
92 |     }
93 |
94 |     /**
95 |      * Reports the total number of result pages for the listing.
96 |      *
97 |      * @param page the first result page
98 |      * @return the number of pages to seed
99 |      */
100 |     public abstract int getTotalPage(Page page);
101 |
102 |     /** Seeds all result pages, then starts the crawl at the given depth. */
103 |     @Override
104 |     public void start(int depth) throws Exception {
105 |         addSeed();
106 |         super.start(depth);
107 |     }
108 |
109 |     /**
110 |      * Handles one fetched page.
111 |      *
112 |      * @param page  the fetched page
113 |      * @param links the follow-up links collected for the page; implementations may add to it
114 |      */
115 |     public abstract void visit(Page page, Links links);
116 |
117 |     /**
118 |      * Collects the links of an HTML page that match {@link #regexRule}, then calls
119 |      * {@link #visit(Page, Links)}. An exception thrown by {@code visit} is logged and does not
120 |      * stop the crawl.
121 |      *
122 |      * @param page the fetched page
123 |      * @return the follow-up links for the page
124 |      */
125 |     public Links visitAndGetNextLinks(Page page) {
126 |         Links nextLinks = new Links();
127 |         String contentType = page.getResponse().getContentType();
128 |         if (contentType != null && contentType.contains("text/html")) {
129 |             Document doc = page.getDoc();
130 |             if (doc != null) {
131 |                 nextLinks.addAllFromDocument(doc, regexRule);
132 |             }
133 |         }
134 |         try {
135 |             visit(page, nextLinks);
136 |         } catch (Exception ex) {
137 |             // A failed visit is a problem worth noticing, so it is logged above INFO.
138 |             log.warn("Exception", ex);
139 |         }
140 |         return nextLinks;
141 |     }
142 | }
143 |
--------------------------------------------------------------------------------
/src/main/java/com/jdbee/dao/BaseDao.java:
--------------------------------------------------------------------------------
1 | package com.jdbee.dao;
2 |
3 | import org.springframework.beans.factory.annotation.Autowired;
4 | import org.springframework.jdbc.core.JdbcTemplate;
5 |
6 | /**
7 |  * Parent class for DAOs; holds the {@link JdbcTemplate} that subclasses run their SQL through.
8 |  */
9 | public class BaseDao {
10 |
11 |     /** Field-injected via {@code @Autowired}; can also be assigned through the setter. */
12 |     @Autowired
13 |     protected JdbcTemplate jdbcTemplate;
14 |
15 |     /** Replaces the template used by this DAO. */
16 |     public void setJdbcTemplate(JdbcTemplate jdbcTemplate) {
17 |         this.jdbcTemplate = jdbcTemplate;
18 |     }
19 |
20 |     /** Returns the template currently used by this DAO. */
21 |     public JdbcTemplate getJdbcTemplate() {
22 |         return this.jdbcTemplate;
23 |     }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/java/com/jdbee/dao/GoodsDao.java:
--------------------------------------------------------------------------------
1 | package com.jdbee.dao;
2 |
3 | import com.jdbee.model.Goods;
4 |
5 | import org.springframework.stereotype.Service;
6 |
7 | /**
8 | * @author handx 908716835@qq.com
9 | * @date 2017年6月2日 上午10:55:25
10 | */
11 |
12 | @Service
13 | public class GoodsDao extends BaseDao {
14 |
15 | public void createGoods(Goods goods) {
16 | String sql = "INSERT INTO GOODS(PLATFORM,URL,NAME,PRICE,COMMITCNT) values(?,?,?,?,?)";
17 | jdbcTemplate.update(sql, goods.getPlatform(),
18 | goods.getUrl(), goods.getName(), goods.getPrice(), goods.getCommitCnt());
19 | }
20 |
21 | public String getGoodsName(Long id) {
22 | String sql = "SELECT NAME FROM GOODS WHERE ID=?";
23 | return jdbcTemplate.queryForObject(sql, String.class, id);
24 | }
25 |
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/java/com/jdbee/main/Main.java:
--------------------------------------------------------------------------------
1 | package com.jdbee.main;
2 |
3 | import com.jdbee.model.Category;
4 | import com.jdbee.model.FiveCategory;
5 | import com.jdbee.utils.Constants;
6 | import com.jdbee.utils.HttpUtil;
7 | import com.jdbee.utils.JsoupUtil;
8 | import com.jdbee.utils.ThreadUtil;
9 |
10 | import org.apache.log4j.Logger;
11 |
12 | import java.io.IOException;
13 | import java.util.Iterator;
14 | import java.util.List;
15 | import java.util.Map;
16 | import java.util.Map.Entry;
17 |
18 | /**
19 | *
20 | * @ClassName: Main
21 | * @Description: 程序入口
22 | * @author handx 908716835@qq.com
23 | * @date 2017年5月24日 下午8:05:02
24 | *
25 | */
26 | public class Main {
27 |
28 | public static final Logger log = Logger.getLogger(Main.class);
29 |
30 | // https://channel.jd.com/1320-5019.html
31 |
32 | public static void main(String[] args) throws IOException {
33 |
34 | // 获取网页数据
35 | String content = HttpUtil.sendGet(Constants.JD_URL);
36 | // 获取一级类目
37 | List list = JsoupUtil.getFirstCategory(content);
38 | // 获取二级类目
39 | list = JsoupUtil.getSecondCategory(content, list);
40 | // 获取三,四,五级类目
41 | list = JsoupUtil.getThreeCategory(list);
42 | // 获取商品url
43 | // List