├── .classpath ├── .gitignore ├── .project ├── .settings ├── org.eclipse.core.resources.prefs ├── org.eclipse.jdt.core.prefs └── org.eclipse.m2e.core.prefs ├── README.md ├── pom.xml └── src ├── main ├── java │ └── com │ │ └── jdbee │ │ ├── crawler │ │ ├── GoodsList.java │ │ ├── JDGoodsList.java │ │ ├── JdCategory.java │ │ └── RetailersCrawler.java │ │ ├── dao │ │ ├── BaseDao.java │ │ └── GoodsDao.java │ │ ├── main │ │ ├── Main.java │ │ └── NewMain.java │ │ ├── model │ │ ├── Category.java │ │ ├── FiveCategory.java │ │ ├── FourCategory.java │ │ ├── Goods.java │ │ ├── SecondCategory.java │ │ └── ThreeCategory.java │ │ └── utils │ │ ├── Constants.java │ │ ├── HttpUtil.java │ │ ├── JsoupUtil.java │ │ ├── PageUtils.java │ │ ├── PropertiesUtils.java │ │ └── ThreadUtil.java └── resources │ ├── categoryData.json │ ├── chromedriver.exe │ ├── config.properties │ ├── log4j.properties │ ├── parser.js │ ├── phantomjs.exe │ └── springJdbcContext.xml └── test └── java └── com └── handx └── jd └── JdBee ├── AppTest.java ├── TestGoodsDetail.java ├── TestWebcollector.java └── dao └── TestGoodsDao.java /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | JdBee 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | 
-------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/test/java=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 3 | org.eclipse.jdt.core.compiler.compliance=1.5 4 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 5 | org.eclipse.jdt.core.compiler.source=1.5 6 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JdBee 2 | ## 使用jsoup抓取京东数据 3 | 4 | > **只用于学习交流,私自用于其他途径,后果自负!!!** 5 | 6 | > 目前只抓取零食相关的数据,现在就只需要零食相关的数据,其他后续再议! 
7 | 8 | > 抓取零食相关的目的就是为了这个[vipsnacks](https://github.com/handexing/vipsnacks)项目的后续开发。 9 | 10 | 11 | 12 | ## 项目需要 13 | 14 | - httpclient 15 | - jsoup 16 | - slf4j 17 | - selenium 18 | - phantomjs 19 | - WebCollector 20 | 21 | ## 更新日志 22 | 23 | - 初始化项目,完成一,二级类目的抓取 (*2017-05-24*) 24 | - 采用selenium获取页面数据,获取三,四,五级类目(*2017-05-25*) 25 | - 多线程并发爬取类目分页数据(*2017-05-26*) 26 | - 多线程爬取商品skuid(*2017-05-28*) 27 | 28 | **selenium这个爬取的速度太慢了,而且每次还要打开一个网页,抓取少量数据还可以用一用,多的话实在罩不住,近期在找别的方法爬取** 29 | 30 | - 使用WebCollector+selenium+phantomjs爬取商品(*2017-06-01只爬取一个类目测试*) 31 | - 数据入库测试(*2017-06-02*) 32 | - 测试爬取一个小类目,爬取20万数据用时21分钟(*2017-06-03*) 33 | - 数据正常入库,爬取数据**285330**条(*2017-06-04*) 34 | - 优化获取商品代码,从获取一页要19664毫秒,优化到现在获取一页商品要7000毫秒左右,(*2017-06-07*) 35 | 36 | 37 | > 觉得不错的朋友可以点下star,watch,fork也算是对我的鼓励了。 -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.handx.jd 6 | JdBee 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | JdBee 11 | http://maven.apache.org 12 | 13 | 14 | 1.7.2 15 | 3.2.4.RELEASE 16 | UTF-8 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | org.springframework 25 | spring-core 26 | ${spring.version} 27 | 28 | 29 | 30 | org.springframework 31 | spring-web 32 | ${spring.version} 33 | 34 | 35 | 36 | org.springframework 37 | spring-oxm 38 | ${spring.version} 39 | 40 | 41 | 42 | org.springframework 43 | spring-tx 44 | ${spring.version} 45 | 46 | 47 | 48 | org.springframework 49 | spring-jdbc 50 | ${spring.version} 51 | 52 | 53 | 54 | org.springframework 55 | spring-webmvc 56 | ${spring.version} 57 | 58 | 59 | 60 | org.springframework 61 | spring-aop 62 | ${spring.version} 63 | 64 | 65 | 66 | org.springframework 67 | spring-context-support 68 | ${spring.version} 69 | 70 | 71 | 72 | org.springframework 73 | spring-aop 74 | ${spring.version} 75 | 76 | 77 | 78 | org.springframework 79 | spring-test 80 | ${spring.version} 81 | 82 | 83 | 84 | 85 | 86 | junit 87 | junit 88 
| 4.10 89 | test 90 | 91 | 92 | 93 | 94 | org.jsoup 95 | jsoup 96 | 1.10.2 97 | 98 | 99 | 100 | 101 | org.apache.httpcomponents 102 | httpclient 103 | 4.5.2 104 | 105 | 106 | 107 | 108 | org.slf4j 109 | slf4j-log4j12 110 | ${slf4j.version} 111 | 112 | 113 | 114 | 115 | org.seleniumhq.selenium 116 | selenium-java 117 | 2.44.0 118 | 119 | 120 | 121 | 122 | com.github.detro 123 | phantomjsdriver 124 | 1.2.0 125 | 126 | 127 | 128 | 129 | com.alibaba 130 | fastjson 131 | 1.2.4 132 | 133 | 134 | 135 | 136 | cn.edu.hfut.dmic.webcollector 137 | WebCollector 138 | 2.09 139 | 140 | 141 | 142 | 143 | mysql 144 | mysql-connector-java 145 | 5.1.35 146 | 147 | 148 | 149 | 150 | com.alibaba 151 | druid 152 | 1.0.2 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/crawler/GoodsList.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.crawler; 2 | 3 | import com.jdbee.model.Goods; 4 | 5 | import java.util.ArrayList; 6 | 7 | import cn.edu.hfut.dmic.webcollector.model.Page; 8 | 9 | /** 10 | * @ClassName: GoodsList 11 | * @Description: 考虑以后会有抽取多个平台数据,方便其他平台继承 12 | * @author handx 908716835@qq.com 13 | * @date 2017年6月1日 下午5:54:20 14 | */ 15 | public abstract class GoodsList extends ArrayList { 16 | 17 | private static final long serialVersionUID = -7894645047969514212L; 18 | 19 | public abstract void addGoods(Page page); 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/crawler/JDGoodsList.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.crawler; 2 | 3 | import com.jdbee.dao.GoodsDao; 4 | import com.jdbee.model.Goods; 5 | import com.jdbee.utils.Constants; 6 | import com.jdbee.utils.PageUtils; 7 | 8 | import org.apache.commons.lang3.StringUtils; 9 | import org.apache.log4j.Logger; 10 | import 
org.openqa.selenium.By; 11 | import org.openqa.selenium.WebDriver; 12 | import org.openqa.selenium.WebElement; 13 | import org.springframework.context.ApplicationContext; 14 | import org.springframework.context.support.ClassPathXmlApplicationContext; 15 | 16 | import java.util.List; 17 | 18 | import cn.edu.hfut.dmic.webcollector.model.Page; 19 | 20 | /** 21 | * @ClassName: JDGoodsList 22 | * @Description: 获取京东商品 23 | * @author handx 908716835@qq.com 24 | * @date 2017年6月1日 下午5:55:47 25 | * 26 | */ 27 | public class JDGoodsList extends GoodsList { 28 | 29 | private static final long serialVersionUID = -6016161025701938903L; 30 | private static GoodsDao goodsDao; 31 | 32 | static{ 33 | ApplicationContext context = new ClassPathXmlApplicationContext("springJdbcContext.xml"); 34 | goodsDao = (GoodsDao) context.getBean("goodsDao"); 35 | } 36 | 37 | public final Logger log = Logger.getLogger(JDGoodsList.class); 38 | 39 | 40 | @Override 41 | public void addGoods(Page page) { 42 | 43 | WebDriver driver = null; 44 | try { 45 | driver = PageUtils.getWebDriver(page); 46 | System.out.println("&&&&&&&&&&&&&&&&&&&&&&&爬取地址:" + page.getUrl()); 47 | List eles = driver.findElements(By.cssSelector("li.gl-item")); 48 | if (!eles.isEmpty()) { 49 | for (WebElement ele : eles) { 50 | Goods g = new Goods(); 51 | g.setPlatform(Constants.JD); 52 | String priceStr = ele.findElement(By.className("p-price")).findElement(By.className("J_price")) 53 | .findElement(By.tagName("i")).getText(); 54 | if (!StringUtils.isBlank(priceStr) && !"null".equals(priceStr)) { 55 | g.setPrice(priceStr); 56 | } else { 57 | g.setPrice("-1"); 58 | } 59 | g.setName(ele.findElement(By.className("p-name")).findElement(By.tagName("em")).getText()); 60 | g.setUrl(ele.findElement(By.className("p-name")).findElement(By.tagName("a")).getAttribute("href")); 61 | String commitCnt = ele.findElement(By.className("p-commit")).findElement(By.tagName("a")).getText(); 62 | if (!StringUtils.isBlank(commitCnt) && 
!"null".equals(commitCnt)) { 63 | g.setCommitCnt(commitCnt); 64 | } else { 65 | g.setCommitCnt("-1"); 66 | } 67 | // add(g); 68 | System.out.println(g.toString()); 69 | goodsDao.createGoods(g); 70 | 71 | } 72 | } else { 73 | log.info("无商品列表!"); 74 | } 75 | } catch (Exception e) { 76 | log.warn("爬取异常!!!"); 77 | } finally { 78 | if (driver != null) { 79 | driver.quit(); 80 | } 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/crawler/JdCategory.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.crawler; 2 | 3 | import com.jdbee.model.Category; 4 | import com.jdbee.model.SecondCategory; 5 | import com.jdbee.model.ThreeCategory; 6 | import com.jdbee.utils.Constants; 7 | import com.jdbee.utils.HttpUtil; 8 | import com.jdbee.utils.JsoupUtil; 9 | 10 | import org.apache.log4j.Logger; 11 | import org.jsoup.Jsoup; 12 | import org.jsoup.nodes.Document; 13 | import org.jsoup.nodes.Element; 14 | import org.jsoup.select.Elements; 15 | 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | 19 | /** 20 | * @author handx 908716835@qq.com 21 | * @date 2017年6月1日 下午10:02:32 22 | */ 23 | 24 | public class JdCategory { 25 | 26 | public static final Logger log = Logger.getLogger(JdCategory.class); 27 | 28 | /** 29 | * @Title: getCategory 30 | * @Description: 获取类目 31 | * @param 设定文件 32 | * @return void 返回类型 33 | * @throws 34 | */ 35 | public static List getCategory() { 36 | 37 | List secondList = null; 38 | List threeList = null; 39 | 40 | String content = HttpUtil.sendGet(Constants.JD_URL); 41 | List list = JsoupUtil.getFirstCategory(content); 42 | Document document = Jsoup.parse(content); 43 | 44 | Elements elements = document.select(".item-title span"); 45 | 46 | for (Element element : elements) { 47 | String text = element.text(); 48 | for (int i = 0; i < list.size(); i++) { 49 | String name = list.get(i).getName(); 50 | if 
("电脑办公".equals(text)) { 51 | text = "电脑、办公"; 52 | } 53 | if (name.contains(text)) { 54 | 55 | Element categoryItem = element.parent().parent().parent(); 56 | Elements categories = categoryItem.select("dt a"); 57 | Elements threeCate = categoryItem.select("dd a"); 58 | secondList = new ArrayList(); 59 | 60 | for (int j = 0; j < categories.size(); j++) { 61 | SecondCategory cate = new SecondCategory(); 62 | cate.setName(categories.get(j).text()); 63 | cate.setUrl("https:" + categories.get(j).attr("href")); 64 | secondList.add(cate); 65 | 66 | threeList = new ArrayList(); 67 | 68 | for (int k = 0; k < threeCate.size(); k++) { 69 | ThreeCategory threeCategory = new ThreeCategory(); 70 | threeCategory.setUrl("http:" + threeCate.get(k).attr("href")); 71 | threeCategory.setName(threeCate.get(k).text()); 72 | threeList.add(threeCategory); 73 | } 74 | cate.setThreeCates(threeList); 75 | } 76 | list.get(i).setSenondCates(secondList); 77 | } 78 | } 79 | } 80 | 81 | // JSON json = (JSON) JSONObject.toJSON(list); 82 | // log.info(json); 83 | 84 | return list; 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/crawler/RetailersCrawler.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.crawler; 2 | 3 | import com.jdbee.main.Main; 4 | 5 | import org.apache.log4j.Logger; 6 | import org.jsoup.nodes.Document; 7 | 8 | import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler; 9 | import cn.edu.hfut.dmic.webcollector.model.Links; 10 | import cn.edu.hfut.dmic.webcollector.model.Page; 11 | import cn.edu.hfut.dmic.webcollector.net.HttpRequest; 12 | import cn.edu.hfut.dmic.webcollector.net.HttpResponse; 13 | import cn.edu.hfut.dmic.webcollector.util.RegexRule; 14 | 15 | /** 16 | * @author handx 908716835@qq.com 17 | * @date 2017年6月1日 下午5:56:24 18 | */ 19 | public abstract class RetailersCrawler extends DeepCrawler { 20 | 21 | public final Logger log = 
Logger.getLogger(Main.class); 22 | 23 | private String seedFormat;// 种子格式化 24 | protected RegexRule regexRule;// 正则匹配 25 | 26 | public RetailersCrawler(String crawlPath, String seedFormat) { 27 | super(crawlPath); 28 | this.seedFormat = seedFormat; 29 | this.regexRule = new RegexRule(); 30 | } 31 | 32 | /** 33 | * @Title: addRegex 34 | * @Description: 添加正则 35 | * @param @param urlRegex 设定文件 36 | * @return void 返回类型 37 | * @throws 38 | */ 39 | public void addRegex(String urlRegex) { 40 | this.regexRule.addRule(urlRegex); 41 | } 42 | 43 | /** 44 | * @Title: addSeed 45 | * @Description: 添加一个种子url 46 | * @param @throws Exception 设定文件 47 | * @return void 返回类型 48 | * @throws 49 | */ 50 | private void addSeed() throws Exception{ 51 | int totalPage=getTotalPage(getPage(getSeed(seedFormat, 1))); 52 | for (int page = 1; page <= totalPage; page++) { 53 | this.addSeed(getSeed(seedFormat, page)); 54 | } 55 | } 56 | 57 | /** 58 | * @Title: getPage 59 | * @Description: 根据url获取Page实例 60 | * @param @param url 61 | * @param @throws Exception 设定文件 62 | * @return Page 返回类型 63 | * @throws 64 | */ 65 | private Page getPage(String url) throws Exception { 66 | HttpRequest httpRequest = new HttpRequest(url); 67 | HttpResponse response = httpRequest.getResponse(); 68 | Page page = new Page(); 69 | page.setUrl(url); 70 | page.setHtml(response.getHtmlByCharsetDetect()); 71 | page.setResponse(response); 72 | return page; 73 | } 74 | 75 | 76 | /** 77 | * @Title: getSeed 78 | * @Description: 获取seed url 79 | * @param @param seedFormat 80 | * @param @param page 81 | * @return String 返回类型 82 | * @throws 83 | */ 84 | public String getSeed(String seedFormat, Object... 
page) { 85 | return String.format(seedFormat, page); 86 | } 87 | 88 | /** 89 | * @Title: getTotalPage 90 | * @Description:获取查询商品总页数 91 | * @param @param page 92 | * @return int 返回类型 93 | * @throws 94 | */ 95 | public abstract int getTotalPage(Page page); 96 | 97 | @Override 98 | public void start(int depth) throws Exception { 99 | addSeed(); 100 | super.start(depth); 101 | } 102 | 103 | public abstract void visit(Page page, Links links); 104 | 105 | public Links visitAndGetNextLinks(Page page) { 106 | Links nextLinks = new Links(); 107 | String conteType = page.getResponse().getContentType(); 108 | if (conteType != null && conteType.contains("text/html")) { 109 | Document doc = page.getDoc(); 110 | if (doc != null) { 111 | nextLinks.addAllFromDocument(page.getDoc(), regexRule); 112 | } 113 | } 114 | try { 115 | visit(page, nextLinks); 116 | } catch (Exception ex) { 117 | log.info("Exception", ex); 118 | } 119 | return nextLinks; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/dao/BaseDao.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.dao; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | import org.springframework.jdbc.core.JdbcTemplate; 5 | 6 | public class BaseDao { 7 | 8 | @Autowired 9 | protected JdbcTemplate jdbcTemplate; 10 | 11 | public JdbcTemplate getJdbcTemplate() { 12 | return jdbcTemplate; 13 | } 14 | 15 | public void setJdbcTemplate(JdbcTemplate jdbcTemplate) { 16 | this.jdbcTemplate = jdbcTemplate; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/dao/GoodsDao.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.dao; 2 | 3 | import com.jdbee.model.Goods; 4 | 5 | import org.springframework.stereotype.Service; 6 | 7 | /** 8 | * @author handx 908716835@qq.com 9 
| * @date 2017年6月2日 上午10:55:25 10 | */ 11 | 12 | @Service 13 | public class GoodsDao extends BaseDao { 14 | 15 | public void createGoods(Goods goods) { 16 | String sql = "INSERT INTO GOODS(PLATFORM,URL,NAME,PRICE,COMMITCNT) values(?,?,?,?,?)"; 17 | jdbcTemplate.update(sql, goods.getPlatform(), 18 | goods.getUrl(), goods.getName(), goods.getPrice(), goods.getCommitCnt()); 19 | } 20 | 21 | public String getGoodsName(Long id) { 22 | String sql = "SELECT NAME FROM GOODS WHERE ID=?"; 23 | return jdbcTemplate.queryForObject(sql, String.class, id); 24 | } 25 | 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/main/Main.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.main; 2 | 3 | import com.jdbee.model.Category; 4 | import com.jdbee.model.FiveCategory; 5 | import com.jdbee.utils.Constants; 6 | import com.jdbee.utils.HttpUtil; 7 | import com.jdbee.utils.JsoupUtil; 8 | import com.jdbee.utils.ThreadUtil; 9 | 10 | import org.apache.log4j.Logger; 11 | 12 | import java.io.IOException; 13 | import java.util.Iterator; 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.Map.Entry; 17 | 18 | /** 19 | * 20 | * @ClassName: Main 21 | * @Description: 程序入口 22 | * @author handx 908716835@qq.com 23 | * @date 2017年5月24日 下午8:05:02 24 | * 25 | */ 26 | public class Main { 27 | 28 | public static final Logger log = Logger.getLogger(Main.class); 29 | 30 | // https://channel.jd.com/1320-5019.html 31 | 32 | public static void main(String[] args) throws IOException { 33 | 34 | // 获取网页数据 35 | String content = HttpUtil.sendGet(Constants.JD_URL); 36 | // 获取一级类目 37 | List list = JsoupUtil.getFirstCategory(content); 38 | // 获取二级类目 39 | list = JsoupUtil.getSecondCategory(content, list); 40 | // 获取三,四,五级类目 41 | list = JsoupUtil.getThreeCategory(list); 42 | // 获取商品url 43 | // List>> pageMap = 44 | // JsoupUtil.getPageUrlList(list); 45 | // JSON json 
= (JSON) JSONObject.toJSON(pageMap); 46 | // System.out.println(json.toJSONString()); 47 | 48 | List fiveCategories = JsoupUtil.getLastCategory(list, "食品饮料、保健食品", "进口食品"); 49 | 50 | // 获取类目分页信息 51 | List>> categoryPageUrl = ThreadUtil.getCategoryPageUrl(fiveCategories); 52 | 53 | 54 | int i = 0; 55 | 56 | for (Map> map : categoryPageUrl) { 57 | Iterator>> iterator = map.entrySet().iterator(); 58 | while (iterator.hasNext()) { 59 | Entry> next = iterator.next(); 60 | List urls = next.getValue(); 61 | String key = next.getKey(); 62 | System.err.println("\n key:" + key); 63 | for (String url : urls) { 64 | System.out.println(url); 65 | i++; 66 | } 67 | } 68 | } 69 | System.out.println("共有" + i + "页数据..."); 70 | 71 | List> skus = ThreadUtil.getGoodsSkuIdByCatePages(categoryPageUrl); 72 | 73 | System.out.println("共爬取界面:" + skus.size() + "个!"); 74 | 75 | int j = 0; 76 | for (List lists : skus) { 77 | for (String str : lists) { 78 | System.err.println(str); 79 | j++; 80 | } 81 | } 82 | System.out.println("共有SKU" + j + "个..."); 83 | 84 | 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/main/NewMain.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.main; 2 | 3 | import com.jdbee.crawler.JDGoodsList; 4 | import com.jdbee.crawler.JdCategory; 5 | import com.jdbee.crawler.RetailersCrawler; 6 | import com.jdbee.model.Category; 7 | import com.jdbee.model.SecondCategory; 8 | import com.jdbee.model.ThreeCategory; 9 | import com.jdbee.utils.Constants; 10 | 11 | import org.apache.log4j.Logger; 12 | import org.jsoup.nodes.Element; 13 | 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | 17 | import cn.edu.hfut.dmic.webcollector.model.Links; 18 | import cn.edu.hfut.dmic.webcollector.model.Page; 19 | 20 | /** 21 | * @ClassName: NewMain 22 | * @Description: 程序入口 23 | * @author handx 908716835@qq.com 24 | * @date 2017年6月1日 下午9:47:17 25 | 
*/ 26 | public class NewMain extends RetailersCrawler { 27 | 28 | public static final Logger log = Logger.getLogger(NewMain.class); 29 | 30 | /** 31 | * @Title: getCategorySnacksUrlList 32 | * @Description: 获取零食url地址 33 | * @param @return 34 | * @return List 35 | */ 36 | private static List getCategorySnacksUrlList() { 37 | List urls = new ArrayList(); 38 | 39 | // 获取类目列表 40 | List list = JdCategory.getCategory(); 41 | 42 | for (Category category : list) { 43 | if ("食品饮料、保健食品".equals(category.getName())) { 44 | List senondCates = category.getSenondCates(); 45 | for (SecondCategory secondCategory : senondCates) { 46 | List threeCates = secondCategory.getThreeCates(); 47 | for (final ThreeCategory threeCategory : threeCates) { 48 | urls.add(threeCategory.getUrl()); 49 | } 50 | } 51 | } 52 | } 53 | return urls; 54 | } 55 | 56 | public static void main(String[] args) throws Exception { 57 | 58 | long startTime = System.currentTimeMillis(); 59 | List urls = getCategorySnacksUrlList(); 60 | 61 | for (String url : urls) { 62 | NewMain crawler = new NewMain("data", url + Constants.JD_PAGING_PARAMETER); 63 | crawler.setThreads(5);// 抓取启动线程数 64 | crawler.start(1);// 层数 65 | } 66 | 67 | long endTime = System.currentTimeMillis(); 68 | System.out.println("程序运行时间: " + (endTime - startTime) + "ms"); 69 | 70 | } 71 | 72 | private JDGoodsList goodsList; 73 | 74 | public NewMain(String crawlPath, String seekFormat) { 75 | super(crawlPath, seekFormat); 76 | goodsList = new JDGoodsList(); 77 | } 78 | 79 | @Override 80 | public int getTotalPage(Page page) { 81 | Element ele = page.getDoc().select("div#J_bottomPage").select("span.p-skip>em").first().select("b").first(); 82 | return ele == null ? 
0 : Integer.parseInt(ele.text()); 83 | } 84 | 85 | @Override 86 | public void visit(Page page, Links links) { 87 | goodsList.addGoods(page); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/model/Category.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.model; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * @ClassName: Category 7 | * @Description: 一级类目 8 | * @author handx 908716835@qq.com 9 | * @date 2017年5月24日 下午1:45:25 10 | */ 11 | 12 | public class Category { 13 | 14 | private Integer id; 15 | private String name; 16 | 17 | private List senondCates;// 二级类别 18 | 19 | public Integer getId() { 20 | return id; 21 | } 22 | 23 | public String getName() { 24 | return name; 25 | } 26 | 27 | public List getSenondCates() { 28 | return senondCates; 29 | } 30 | 31 | public void setId(Integer id) { 32 | this.id = id; 33 | } 34 | 35 | public void setName(String name) { 36 | this.name = name; 37 | } 38 | 39 | public void setSenondCates(List senondCates) { 40 | this.senondCates = senondCates; 41 | } 42 | 43 | @Override 44 | public String toString() { 45 | return "Category [id=" + id + ", name=" + name + ", senondCates=" + senondCates + "]"; 46 | } 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/model/FiveCategory.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.model; 2 | 3 | /** 4 | * 5 | * @ClassName: FiveCategory 6 | * @Description: 五级类目 7 | * @author handx 908716835@qq.com 8 | * @date 2017年5月25日 下午2:48:33 9 | * 10 | */ 11 | public class FiveCategory { 12 | 13 | private String name; 14 | private String url; 15 | 16 | public String getName() { 17 | return name; 18 | } 19 | public String getUrl() { 20 | return url; 21 | } 22 | 23 | public void setName(String name) { 24 | this.name = name; 25 | } 26 | 
/**
 * One scraped product. Every property is kept as text exactly as it was read from the page.
 *
 * @author handx 908716835@qq.com
 */
public class Goods {

    /** Platform the product was scraped from. */
    private String platform;
    /** Request URL of the product. */
    private String url;
    /** Product name. */
    private String name;
    /** Price; kept as text because some products show "no quote available" instead of a number. */
    private String price;
    /** Number of comments. */
    private String commitCnt;

    public String getPlatform() {
        return this.platform;
    }

    public void setPlatform(String platform) {
        this.platform = platform;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getPrice() {
        return this.price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public String getCommitCnt() {
        return this.commitCnt;
    }

    public void setCommitCnt(String commitCnt) {
        this.commitCnt = commitCnt;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Goods [");
        text.append("platform=").append(platform);
        text.append(", url=").append(url);
        text.append(", name=").append(name);
        text.append(", price=").append(price);
        text.append(", commitCnt=").append(commitCnt);
        text.append("]");
        return text.toString();
    }

}
/**
 * Fixed constants shared by the crawler.
 *
 * @author handx 908716835@qq.com
 */
public class Constants {

    /** JD page listing all categories. */
    public static final String JD_URL = "https://www.jd.com/allSort.aspx";

    /** Prefix of a JD keyword-search URL; the keyword is appended. */
    public static final String JDURL = "http://search.jd.com/Search?keyword=";

    /** Query parameter declaring the encoding of Chinese characters in a JD search URL. */
    public static final String JDENC = "&enc=utf-8";

    /** Paging query parameter of a JD search URL; the page number is appended. */
    public static final String JDPAGE = "&page=";

    /** Prefix of a JD product (SKU) URL. */
    public static final String JD_SKU_URL = "https://item.jd.com/";

    // https://search.jd.com/Search?keyword=%E5%8E%9F%E5%91%B3%E6%B5%B7%E8%8B%94&enc=utf-8&suggest=1.def.0.T07&wq=yuanwei%E6%B5%B7%E8%8B%94&pvid=17eeaaa031dc4594a49e3c1700d0d3c3

    /** Maximum number of threads. */
    public static final int MAX_THREAD_CNT = 5;

    /** Platform identifier for JD. */
    public static final String JD = "JD";

    /** JD paging parameters; {@code %s} is the page-number placeholder. */
    public static final String JD_PAGING_PARAMETER = "&page=%s&go=0&JL=6_0_0";

}
| * 29 | */ 30 | 31 | /** 32 | * @ClassName: HttpUtil 33 | * @Description: httpclient请求数据工具类 34 | * @author handx 908716835@qq.com 35 | * @date 2017年5月24日 下午7:46:06 36 | * 37 | */ 38 | public class HttpUtil { 39 | 40 | public static final Logger log = Logger.getLogger(JsoupUtil.class); 41 | 42 | public static final String TAG = "HttpUtils"; 43 | public static CloseableHttpClient httpClient = HttpClients.createDefault(); 44 | public static HttpClientContext context = new HttpClientContext(); 45 | 46 | /** 47 | * @Title: getDocumentByUrl 48 | * @Description:使用Selenium模拟浏览器动态获取数据 49 | * @param @param url 50 | * @param @return 设定文件 51 | * @return Document 返回类型 52 | * @throws 53 | */ 54 | public static Document getDocumentByUrl(String url) { 55 | WebDriver webDriver = null; 56 | Document document = null; 57 | try { 58 | System.getProperties().setProperty("webdriver.chrome.driver", 59 | "D:\\myWorkspace\\JdBee\\src\\main\\resources\\chromedriver.exe"); 60 | webDriver = new ChromeDriver(); 61 | webDriver.get(url); 62 | Thread.sleep(1000);// 停止1s模拟网速 63 | document = Jsoup.parse(webDriver.getPageSource()); 64 | } catch (Exception e) { 65 | e.printStackTrace(); 66 | } finally { 67 | webDriver.close(); 68 | webDriver.quit(); 69 | } 70 | return document; 71 | } 72 | 73 | /** 74 | * @Title: killChromDriver 75 | * @Description: 杀死chromDriver后台进程,每次都会启动一次很浪费内存,执行完就杀掉 76 | * @param 设定文件 77 | * @return void 返回类型 78 | * @throws 79 | */ 80 | public static void killChromDriver() { 81 | try { 82 | WindowsUtils.tryToKillByName("chromedriver.exe"); 83 | // Runtime.getRuntime().exec("wmic process where 84 | // name=\"chromedriver.exe\" call terminate"); 85 | } catch (Exception e) { 86 | e.printStackTrace(); 87 | } 88 | } 89 | 90 | /** 91 | * @Title: sendGet 92 | * @Description: get请求 93 | * @param @param url 94 | * @param @return 设定文件 95 | * @return String 返回类型 96 | * @throws 97 | */ 98 | public static String sendGet(String url) { 99 | CloseableHttpResponse response = null; 100 | String 
content = null; 101 | try { 102 | HttpGet get = new HttpGet(url); 103 | response = httpClient.execute(get, context); 104 | HttpEntity entity = response.getEntity(); 105 | content = EntityUtils.toString(entity, "UTF-8"); 106 | EntityUtils.consume(entity); 107 | return content; 108 | } catch (Exception e) { 109 | log.error("get请求获取数据失败,请检查url是否正确!", e); 110 | } finally { 111 | try { 112 | response.close(); 113 | } catch (IOException e) { 114 | e.printStackTrace(); 115 | } 116 | } 117 | return content; 118 | } 119 | 120 | /** 121 | * @Title: sendPost 122 | * @Description: post请求 123 | * @param @param url 124 | * @param @param nvps 125 | * @param @return 设定文件 126 | * @return String 返回类型 127 | * @throws 128 | */ 129 | public static String sendPost(String url, List nvps) { 130 | CloseableHttpResponse response = null; 131 | String content = null; 132 | try { 133 | // HttpClient中的post请求包装类 134 | HttpPost post = new HttpPost(url); 135 | // nvps是包装请求参数的list 136 | if (nvps != null) { 137 | post.setEntity(new UrlEncodedFormEntity(nvps, "UTF-8")); 138 | } 139 | // 执行请求用execute方法,content用来帮我们附带上额外信息 140 | response = httpClient.execute(post, context); 141 | // 得到相应实体、包括响应头以及相应内容 142 | HttpEntity entity = response.getEntity(); 143 | // 得到response的内容 144 | content = EntityUtils.toString(entity); 145 | // System.out.println(TAG + "POST:" + content); 146 | // 关闭输入流 147 | EntityUtils.consume(entity); 148 | return content; 149 | } catch (Exception e) { 150 | log.error("post请求获取数据失败,请检查url是否正确!", e); 151 | } finally { 152 | try { 153 | response.close(); 154 | } catch (IOException e) { 155 | e.printStackTrace(); 156 | } 157 | } 158 | return content; 159 | } 160 | 161 | } 162 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/utils/JsoupUtil.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.utils; 2 | 3 | import com.jdbee.model.Category; 4 | import 
com.jdbee.model.FiveCategory; 5 | import com.jdbee.model.FourCategory; 6 | import com.jdbee.model.SecondCategory; 7 | import com.jdbee.model.ThreeCategory; 8 | 9 | import org.apache.commons.lang3.StringUtils; 10 | import org.apache.log4j.Logger; 11 | import org.jsoup.Jsoup; 12 | import org.jsoup.nodes.Document; 13 | import org.jsoup.nodes.Element; 14 | import org.jsoup.select.Elements; 15 | 16 | import java.util.ArrayList; 17 | import java.util.HashMap; 18 | import java.util.List; 19 | import java.util.Map; 20 | 21 | /** 22 | * @ClassName: JsoupUtil 23 | * @Description: 解析数据工具类 24 | * @author handx 908716835@qq.com 25 | * @date 2017年5月24日 下午8:23:17 26 | * 27 | */ 28 | public class JsoupUtil { 29 | 30 | public static final Logger log = Logger.getLogger(JsoupUtil.class); 31 | 32 | /** 33 | * @Title: getCategories 34 | * @Description: 获取一级类别 35 | * @param @param content 36 | * @param @return 设定文件 37 | * @return List 返回类型 38 | * @throws 39 | */ 40 | public static List getFirstCategory(String content) { 41 | 42 | Document document = Jsoup.parse(content); 43 | Elements categories = document.getElementsByClass("categories"); 44 | categories = categories.select("a"); 45 | 46 | List list = new ArrayList(); 47 | 48 | for (int i = 0; i < categories.size(); i++) { 49 | Category category = new Category(); 50 | category.setName(categories.get(i).text()); 51 | category.setId(i); 52 | list.add(category); 53 | } 54 | 55 | return list; 56 | } 57 | 58 | 59 | /** 60 | * 61 | * @Title: getGoodsSku 62 | * @Description: 根据分页url获取页面里面的商品sku 63 | * @param @param url 64 | * @return List 返回类型 65 | * @throws 66 | */ 67 | public static List getGoodsSku(String url) { 68 | 69 | List skuUrls = new ArrayList(); 70 | 71 | Document document = HttpUtil.getDocumentByUrl(url); 72 | Element element = document.getElementById("J_goodsList"); 73 | Elements sku = element.select("li"); 74 | 75 | for (Element skuId : sku) { 76 | String path = skuId.attr("data-sku"); 77 | if (!StringUtils.isEmpty(path)) { 78 | 
skuUrls.add(path); 79 | } 80 | } 81 | 82 | return skuUrls; 83 | } 84 | 85 | /** 86 | * @Title: getLastCategory 87 | * @Description: 找到最后一级类别 88 | * @param @param list 89 | * @param @param firstcate 90 | * @param @param secondCate 91 | * @param @return 设定文件 92 | * @return List 返回类型 93 | * @throws 94 | */ 95 | public static List getLastCategory(List list, String firstcate, String secondCate) { 96 | 97 | List fiveCate = new ArrayList(); 98 | 99 | for (Category category : list) { 100 | if (firstcate.equals(category.getName())) { 101 | List senondCates = category.getSenondCates(); 102 | for (SecondCategory secondCategory : senondCates) { 103 | if (secondCate.equals(secondCategory.getName())) { 104 | List threeCates = secondCategory.getThreeCates(); 105 | for (ThreeCategory threeCategory : threeCates) { 106 | List fourCates = threeCategory.getFourCates(); 107 | for (FourCategory fourCategory : fourCates) { 108 | List fiveCates = fourCategory.getFiveCates(); 109 | for (FiveCategory fiveCategory : fiveCates) { 110 | fiveCate.add(fiveCategory); 111 | } 112 | } 113 | } 114 | } 115 | } 116 | } 117 | } 118 | 119 | return fiveCate; 120 | } 121 | 122 | 123 | /** 124 | * @Title: getPageUrlList 125 | * @Description: 获取类目 126 | * @param @param list 127 | * @param @return 设定文件 128 | * @return List>> 返回类型 129 | * @throws 130 | */ 131 | // public static List>> 132 | // getPageUrlList(List list) { 133 | // 134 | // List>> pageMap = new ArrayList>>(); 136 | // 137 | // for (Category category : list) { 138 | // if ("食品饮料、保健食品".equals(category.getName())) { 139 | // List senondCates = category.getSenondCates(); 140 | // for (SecondCategory secondCategory : senondCates) { 141 | // if ("进口食品".equals(secondCategory.getName())) { 142 | // List threeCates = secondCategory.getThreeCates(); 143 | // for (ThreeCategory threeCategory : threeCates) { 144 | // List fourCates = threeCategory.getFourCates(); 145 | // for (FourCategory fourCategory : fourCates) { 146 | // List fiveCates = 
fourCategory.getFiveCates(); 147 | // for (FiveCategory fiveCategory : fiveCates) { 148 | // Map> map = getPageUrl(fiveCategory); 149 | // pageMap.add(map); 150 | // } 151 | // } 152 | // } 153 | // } 154 | // } 155 | // } 156 | // } 157 | // 158 | // return pageMap; 159 | // } 160 | 161 | /** 162 | * @Title: getPageUrl 163 | * @Description: 获取类别页数 164 | * @param @param fiveCategory 165 | * @param @return 设定文件 166 | * @return Map> 返回类型 167 | * @throws 168 | */ 169 | public static Map> getPageUrl(FiveCategory fiveCategory) { 170 | 171 | Map> map = new HashMap>(); 172 | List urls = new ArrayList(); 173 | 174 | Document document = HttpUtil.getDocumentByUrl(fiveCategory.getUrl()); 175 | Element element = document.getElementById("J_bottomPage"); 176 | 177 | if (element.childNodeSize() > 0) {// 判断是否有分页 178 | int cnt = Integer.parseInt(element.select(".p-skip b").text()); 179 | for (int i = 1; i < cnt; i++) { 180 | String url = Constants.JDURL + fiveCategory.getName() + Constants.JDENC + Constants.JDPAGE + i; 181 | urls.add(url); 182 | } 183 | log.info("正在爬取:" + fiveCategory.getName() + ",共" + urls.size() + "页 ,url:" + fiveCategory.getUrl()); 184 | map.put(fiveCategory.getName(), urls); 185 | } else { 186 | String url = Constants.JDURL + fiveCategory.getName() + Constants.JDENC + Constants.JDPAGE + 1; 187 | urls.add(url); 188 | map.put(fiveCategory.getName(), urls); 189 | } 190 | 191 | return map; 192 | } 193 | 194 | /** 195 | * @Title: getSecondCategory 196 | * @Description: 获取二级类别 197 | * @param @param content 198 | * @param @param cates 199 | * @param @return 设定文件 200 | * @return List 返回类型 201 | * @throws 202 | */ 203 | public static List getSecondCategory(String content, List cates) { 204 | 205 | List list = null; 206 | Document document = Jsoup.parse(content); 207 | Elements elements = document.select(".item-title span"); 208 | 209 | for (Element element : elements) { 210 | String text = element.text(); 211 | for (int i = 0; i < cates.size(); i++) { 212 | String name 
= cates.get(i).getName(); 213 | if ("电脑办公".equals(text)) { 214 | text = "电脑、办公"; 215 | } 216 | if (name.contains(text)) { 217 | 218 | Element categoryItem = element.parent().parent().parent(); 219 | Elements categories = categoryItem.select("dt a"); 220 | list = new ArrayList(); 221 | 222 | for (int j = 0; j < categories.size(); j++) { 223 | SecondCategory cate = new SecondCategory(); 224 | cate.setName(categories.get(j).text()); 225 | cate.setUrl("https:" + categories.get(j).attr("href")); 226 | list.add(cate); 227 | } 228 | cates.get(i).setSenondCates(list); 229 | } 230 | } 231 | } 232 | 233 | return cates; 234 | } 235 | 236 | 237 | /** 238 | * @Title: getThreeCategory 239 | * @Description: 获取3,4,5级类目 240 | * @param list 241 | * @return List 返回类型 242 | */ 243 | public static List getThreeCategory(List list) { 244 | 245 | List threeCategories = null; 246 | List fourCategories = null; 247 | List fiveCategories = null; 248 | 249 | for (Category category : list) { 250 | if ("食品饮料、保健食品".equals(category.getName())) { 251 | List senondCates = category.getSenondCates(); 252 | 253 | for (SecondCategory secondCategory : senondCates) { 254 | if ("进口食品".equals(secondCategory.getName())) { 255 | 256 | threeCategories = new ArrayList(); 257 | 258 | Document document = HttpUtil.getDocumentByUrl(secondCategory.getUrl()); 259 | Elements foodNav = document.getElementsByClass("food_nav"); 260 | Elements titles = foodNav.select(".item_header_title"); 261 | for (Element element : titles) { 262 | ThreeCategory threeCategory = new ThreeCategory(); 263 | threeCategory.setName(element.select("a").text()); 264 | threeCategory.setUrl(element.select("a").attr("href")); 265 | threeCategories.add(threeCategory); 266 | 267 | Element item = element.parent().parent(); 268 | Elements foodNavSubs = item.select(".food_nav_sub_item"); 269 | fourCategories = new ArrayList(); 270 | 271 | for (Element sub : foodNavSubs) { 272 | FourCategory fourCategory = new FourCategory(); 273 | 
fourCategory.setName(sub.select("a").eq(0).text()); 274 | fourCategory.setUrl(sub.select("a").eq(0).attr("href")); 275 | fourCategories.add(fourCategory); 276 | 277 | Elements navSubMains = sub.select(".food_nav_sub_main a"); 278 | fiveCategories = new ArrayList(); 279 | 280 | for (Element a : navSubMains) { 281 | FiveCategory fiveCategory = new FiveCategory(); 282 | fiveCategory.setName(a.text()); 283 | fiveCategory.setUrl(a.attr("href")); 284 | fiveCategories.add(fiveCategory); 285 | } 286 | fourCategory.setFiveCates(fiveCategories); 287 | } 288 | threeCategory.setFourCates(fourCategories); 289 | 290 | } 291 | secondCategory.setThreeCates(threeCategories); 292 | } 293 | } 294 | 295 | } 296 | } 297 | return list; 298 | } 299 | 300 | } 301 | -------------------------------------------------------------------------------- /src/main/java/com/jdbee/utils/PageUtils.java: -------------------------------------------------------------------------------- 1 | package com.jdbee.utils; 2 | 3 | import com.gargoylesoftware.htmlunit.BrowserVersion; 4 | 5 | import org.openqa.selenium.WebDriver; 6 | import org.openqa.selenium.htmlunit.HtmlUnitDriver; 7 | import org.openqa.selenium.phantomjs.PhantomJSDriver; 8 | import org.openqa.selenium.phantomjs.PhantomJSDriverService; 9 | 10 | import java.io.BufferedReader; 11 | import java.io.IOException; 12 | import java.io.InputStream; 13 | import java.io.InputStreamReader; 14 | 15 | import cn.edu.hfut.dmic.webcollector.model.Page; 16 | 17 | /** 18 | * @ClassName: PageUtils 19 | * @Description: PhantomJS工具类 20 | * @author handx 908716835@qq.com 21 | * @date 2017年6月1日 下午5:53:10 22 | */ 23 | public class PageUtils { 24 | 25 | /** 26 | * 获取webcollector 自带htmlUnitDriver实例 27 | * 28 | * @param page 29 | * @param browserVersion 30 | * 模拟浏览器 31 | * @return 32 | */ 33 | public static HtmlUnitDriver getDriver(Page page, BrowserVersion browserVersion) { 34 | HtmlUnitDriver driver = new HtmlUnitDriver(browserVersion); 35 | 
driver.setJavascriptEnabled(true); 36 | driver.get(page.getUrl()); 37 | return driver; 38 | } 39 | 40 | /** 41 | * 获取webcollector 自带 htmlUnitDriver实例(模拟默认浏览器) 42 | * 43 | * @param page 44 | * @return 45 | */ 46 | public static HtmlUnitDriver getDriver(String url) { 47 | HtmlUnitDriver driver = new HtmlUnitDriver(); 48 | driver.setJavascriptEnabled(true); 49 | driver.get(url); 50 | return driver; 51 | } 52 | 53 | /** 54 | * 直接调用原生phantomJS(即不通过selenium) 55 | * 56 | * @param page 57 | * @return 58 | * @throws IOException 59 | */ 60 | public static String getPhantomJSDriver(String url) throws IOException { 61 | InputStream in = null; 62 | try { 63 | Process process = Runtime.getRuntime() 64 | .exec(PropertiesUtils.getProperty(PropertiesUtils.PHANTOMJS_DRIVER_PATH) 65 | + " " + PropertiesUtils.getProperty(PropertiesUtils.PHANTOMJS_JS) + " " + url); 66 | in = process.getInputStream(); 67 | InputStreamReader reader = new InputStreamReader(in, "UTF-8"); 68 | BufferedReader br = new BufferedReader(reader); 69 | StringBuffer sbf = new StringBuffer(); 70 | String tmp = ""; 71 | while ((tmp = br.readLine()) != null) { 72 | sbf.append(tmp); 73 | } 74 | return sbf.toString(); 75 | } catch (IOException e) { 76 | e.printStackTrace(); 77 | }finally { 78 | in.close(); 79 | } 80 | return null; 81 | } 82 | 83 | /** 84 | * 获取PhantomJsDriver(可以爬取js动态生成的html) 85 | * 86 | * @param page 87 | * @return 88 | */ 89 | public static WebDriver getWebDriver(Page page) { 90 | System.setProperty(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, 91 | PropertiesUtils.getProperty(PropertiesUtils.PHANTOMJS_DRIVER_PATH)); 92 | WebDriver driver = new PhantomJSDriver(); 93 | driver.get(page.getUrl()); 94 | return driver; 95 | } 96 | 97 | /** 98 | * @Title: getWebDriver 99 | * @Description: 根据url获取html 100 | * @param @param url 101 | * @param @return 102 | * @return WebDriver 103 | * @throws 104 | */ 105 | public static WebDriver getWebDriver(String url) { 106 | 
/**
 * Lazy accessor for the settings in {@code src/main/resources/config.properties}. The path is
 * relative, so it is resolved against the process working directory.
 *
 * @author handx 908716835@qq.com
 */
public class PropertiesUtils {

    private static final String DRIVER_PROPERTIES_PATH = "src/main/resources/config.properties";
    public static final String PHANTOMJS_DRIVER_PATH = "phantomjs.driver.path";
    public static final String PHANTOMJS_JS = "phantomjs.js";

    /** Loaded on first use by {@link #getProperty(String)}. */
    private static Properties properties;

    /**
     * Returns the configured value for {@code key}, loading the properties file on first use.
     *
     * @param key property name
     * @return the value, or {@code null} if the key is absent or the file could not be read
     */
    public static String getProperty(String key) {
        if (properties == null) {
            properties = readProperties();
        }
        return properties.getProperty(key);
    }

    /**
     * Reads the properties file from disk.
     *
     * @return the loaded properties; empty (never {@code null}) if the file cannot be read
     */
    public static Properties readProperties() {
        Properties props = new Properties();
        InputStream inputStream = null;
        try {
            inputStream = Files.newInputStream(Paths.get(DRIVER_PROPERTIES_PATH));
            props.load(inputStream);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The stream was previously never closed, leaking a file handle per call.
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close fails; the data is already loaded.
                }
            }
        }
        return props;
    }
}
2 | 3 | import com.jdbee.model.FiveCategory; 4 | 5 | import org.apache.log4j.Logger; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Iterator; 9 | import java.util.List; 10 | import java.util.Map; 11 | import java.util.Map.Entry; 12 | import java.util.concurrent.Callable; 13 | import java.util.concurrent.ExecutorService; 14 | import java.util.concurrent.Executors; 15 | 16 | public class ThreadUtil { 17 | 18 | public static final Logger log = Logger.getLogger(ThreadUtil.class); 19 | 20 | private static List>> list = new ArrayList>>(); 21 | private static List> skus = new ArrayList>(); 22 | 23 | /** 24 | * @Title: getCategoryPageUrl 25 | * @Description: 获取类目的分页集合 26 | * @param @param fiveCategories 27 | * @param @return 设定文件 28 | * @return List>> 返回类型 29 | * @throws 30 | */ 31 | public static List>> getCategoryPageUrl(List fiveCategories) { 32 | 33 | ExecutorService p = Executors.newFixedThreadPool(Constants.MAX_THREAD_CNT); 34 | final List> partitions = new ArrayList>(); 35 | 36 | try { 37 | 38 | for (final FiveCategory category : fiveCategories) { 39 | partitions.add(new Callable() { 40 | public Integer call() throws Exception { 41 | Map> map = JsoupUtil.getPageUrl(category); 42 | list.add(map); 43 | log.info(category.getName() + "已爬完...已爬取" + list.size() + "个类别..."); 44 | return 0; 45 | } 46 | }); 47 | } 48 | 49 | p.invokeAll(partitions); 50 | p.shutdown(); 51 | HttpUtil.killChromDriver(); 52 | } catch (InterruptedException e) { 53 | e.printStackTrace(); 54 | } 55 | return list; 56 | } 57 | 58 | /** 59 | * @Title: getGoodsSkuIdByCatePages 60 | * @Description: 多线程根据分页url获取sku 61 | * @param @param categoryPageUrl 62 | * @return List> 返回类型 63 | * @throws 64 | */ 65 | public static List> getGoodsSkuIdByCatePages(List>> categoryPageUrl) { 66 | 67 | ExecutorService p = Executors.newFixedThreadPool(Constants.MAX_THREAD_CNT); 68 | final List> partitions = new ArrayList>(); 69 | 70 | try { 71 | for (Map> map : categoryPageUrl) { 72 | Iterator>> iterator = 
map.entrySet().iterator(); 73 | while (iterator.hasNext()) { 74 | Entry> next = iterator.next(); 75 | List urls = next.getValue(); 76 | for (final String url : urls) { 77 | partitions.add(new Callable() { 78 | public Integer call() throws Exception { 79 | List goodsSkus = JsoupUtil.getGoodsSku(url); 80 | skus.add(goodsSkus); 81 | log.info("正在爬取:" + url + "界面共爬取sku个数:" + goodsSkus.size()); 82 | return 0; 83 | } 84 | }); 85 | } 86 | } 87 | } 88 | p.invokeAll(partitions); 89 | p.shutdown(); 90 | } catch (Exception e) { 91 | e.printStackTrace(); 92 | } 93 | 94 | return skus; 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /src/main/resources/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/handexing/JdBee/dfc017309cb6f235994238f13c928f7696ba6e3e/src/main/resources/chromedriver.exe -------------------------------------------------------------------------------- /src/main/resources/config.properties: -------------------------------------------------------------------------------- 1 | mysql_url=jdbc:mysql://localhost:3306/vipsnacks?characterEncoding=utf-8 2 | mysql_username=root 3 | mysql_password=root 4 | mysql_driverClassName=com.mysql.jdbc.Driver 5 | maxActive:20 6 | #maxActive:50 7 | initialSize:5 8 | maxWait:30000 9 | minIdle:10 10 | 11 | #phantomjs 12 | phantomjs.driver.path=D:\\myWorkspace\\JdBee\\src\\main\\resources\\phantomjs.exe 13 | phantomjs.js=D:\\myWorkspace\\JdBee\\src\\main\\resources\\parser.js -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=info,stdout,logfile 2 | #debug 3 | #stdout configure 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | 
log4j.appender.stdout.layout.ConversionPattern= %d %p [%c] - <%m>%n 7 | 8 | #logfile configure 9 | log4j.appender.logfile=org.apache.log4j.DailyRollingFileAppender 10 | log4j.appender.logfile.File=logs/jdbee.log 11 | log4j.appender.logfile.layout=org.apache.log4j.PatternLayout 12 | log4j.appender.logfile.layout.ConversionPattern= %d %p [%c] - <%m>%n 13 | 14 | log4j.logger.org.apache.zookeeper.ClientCnxn=info 15 | log4j.logger.org.vista.mall.web.ui.utils.ModelHolder=info 16 | 17 | #log4j.appender.logfile=org.apache.log4j.RollingFileAppender 18 | #log4j.appender.logfile.Append=false 19 | #log4j.appender.logfile.File=./logs/server1/SystemOut.log 20 | # Pattern to output: date priority [category] - message 21 | #log4j.appender.logfile.layout=org.apache.log4j.PatternLayout 22 | #log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n -------------------------------------------------------------------------------- /src/main/resources/parser.js: -------------------------------------------------------------------------------- 1 | system = require('system') 2 | address = system.args[1];//获得命令行第二个参数 接下来会用到 3 | //console.log('Loading a web page'); 4 | var page = require('webpage').create(); 5 | var url = address; 6 | //console.log(url); 7 | page.open(url, function (status) { 8 | //Page is loaded! 
9 | if (status !== 'success') { 10 | console.log('Unable to post!'); 11 | } else { 12 | //此处的打印,是将结果一流的形式output到java中,java通过InputStream可以获取该输出内容 13 | console.log(page.content); 14 | } 15 | phantom.exit(); 16 | }); -------------------------------------------------------------------------------- /src/main/resources/phantomjs.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/handexing/JdBee/dfc017309cb6f235994238f13c928f7696ba6e3e/src/main/resources/phantomjs.exe -------------------------------------------------------------------------------- /src/main/resources/springJdbcContext.xml: -------------------------------------------------------------------------------- 1 | 2 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | false 44 | 45 | 46 | false 47 | 48 | 49 | true 50 | 51 | 52 | 180000 53 | 54 | 55 | 6000 56 | 57 | 58 | SELECT 1 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /src/test/java/com/handx/jd/JdBee/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.handx.jd.JdBee; 2 | 3 | import com.jdbee.utils.HttpUtil; 4 | 5 | import org.apache.commons.lang3.StringUtils; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | import org.junit.Test; 11 | import org.openqa.selenium.WebDriver; 12 | import org.openqa.selenium.chrome.ChromeDriver; 13 | 14 | import java.io.IOException; 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | 18 | public class AppTest { 19 | 20 | @Test 21 | public void testCmd(){ 22 | try { 23 | Runtime.getRuntime().exec("wmic process where name=\"phantomjs.exe\" call terminate"); 24 | } catch (IOException e) { 25 | e.printStackTrace(); 26 | } 27 | } 
28 | 29 | @Test 30 | public void testGetGoods() throws InterruptedException { 31 | List skuUrls = new ArrayList(); 32 | 33 | Document document = HttpUtil.getDocumentByUrl("http://search.jd.com/Search?keyword=玉米片&enc=utf-8&page=1"); 34 | Element element = document.getElementById("J_goodsList"); 35 | Elements sku = element.select("li"); 36 | 37 | for (Element skuId : sku) { 38 | String url = skuId.attr("data-sku"); 39 | if (!StringUtils.isEmpty(url)) { 40 | skuUrls.add(url); 41 | } 42 | } 43 | 44 | for (String url : skuUrls) { 45 | System.out.println(url); 46 | } 47 | System.out.println(skuUrls.size()); 48 | } 49 | 50 | @Test 51 | public void testHtmlUnit() throws InterruptedException { 52 | // WebDriver driver = new HtmlUnitDriver(true); 53 | // driver.get("https://channel.jd.com/1320-5019.html"); 54 | // Thread.sleep(3000); 55 | // 56 | // String str = driver.getPageSource(); 57 | // Document parse = Jsoup.parse(str); 58 | // System.err.println(parse.getElementsByClass("food_nav") + "======="); 59 | } 60 | 61 | @Test 62 | public void testSelenium() throws InterruptedException { 63 | WebDriver webDriver = null; 64 | try { 65 | System.getProperties().setProperty("webdriver.chrome.driver", 66 | "D:\\myWorkspace\\JdBee\\src\\main\\resources\\chromedriver.exe"); 67 | 68 | webDriver = new ChromeDriver(); 69 | webDriver.get("https://channel.jd.com/1320-5019.html"); 70 | Thread.sleep(1000); 71 | 72 | String str = webDriver.getPageSource(); 73 | Document parse = Jsoup.parse(str); 74 | System.out.println(parse.getElementsByClass("food_nav")); 75 | } catch (Exception e) { 76 | e.printStackTrace(); 77 | } finally { 78 | webDriver.close(); 79 | webDriver.quit(); 80 | } 81 | } 82 | 83 | @Test 84 | public void testUrl() { 85 | // Document document = HttpUtil.getDocumentByUrl( 86 | // 
"https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E6%A6%B4%E8%8E%B2%E5%A8%81%E5%8C%96&enc=utf-8&wq=%E8%BF%9B%E5%8F%A3%E6%A6%B4%E8%8E%B2%E5%A8%81%E5%8C%96&pvid=63028a7071c84281a09c47cb9ed707e2"); 87 | // Element elementById = document.getElementById("J_bottomPage"); 88 | // if (elementById.childNodeSize() > 0) { 89 | // System.out.println(elementById.select(".p-skip b").text()); 90 | // } else { 91 | // System.out.println("=========="); 92 | // } 93 | 94 | // Document document = HttpUtil.getDocumentByUrl( 95 | // "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.6.HOWjLB&id=540016761844&skuId=3201176746543&areaId=310100&user_id=3000674726&cat_id=2&is_b=1&rn=e380807ac88b9acccf0385aa2d128038"); 96 | // System.out.println(document.getElementById("J_DetailMeta")); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/test/java/com/handx/jd/JdBee/TestGoodsDetail.java: -------------------------------------------------------------------------------- 1 | package com.handx.jd.JdBee; 2 | 3 | import com.jdbee.model.Goods; 4 | import com.jdbee.utils.Constants; 5 | import com.jdbee.utils.HttpUtil; 6 | import com.jdbee.utils.PageUtils; 7 | 8 | import org.apache.commons.lang3.StringUtils; 9 | import org.apache.log4j.Logger; 10 | import org.jsoup.Jsoup; 11 | import org.jsoup.nodes.Document; 12 | import org.jsoup.nodes.Element; 13 | import org.jsoup.select.Elements; 14 | import org.openqa.selenium.By; 15 | import org.openqa.selenium.WebDriver; 16 | import org.openqa.selenium.WebElement; 17 | 18 | import java.util.List; 19 | 20 | public class TestGoodsDetail { 21 | 22 | public static final Logger log = Logger.getLogger(TestGoodsDetail.class); 23 | 24 | /** 25 | * @Title: addGoods 26 | * @Description: 以前老方法 27 | * @param @param url 28 | * @return void 29 | * @throws 30 | */ 31 | public static void addGoods(String url) { 32 | 33 | WebDriver driver = null; 34 | try { 35 | driver = PageUtils.getWebDriver(url); 36 | List 
eles = driver.findElements(By.cssSelector("li.gl-item")); 37 | if (!eles.isEmpty()) { 38 | for (WebElement ele : eles) { 39 | Goods g = new Goods(); 40 | g.setPlatform(Constants.JD); 41 | String priceStr = ele.findElement(By.className("p-price")).findElement(By.className("J_price")) 42 | .findElement(By.tagName("i")).getText(); 43 | if (!StringUtils.isBlank(priceStr) && !"null".equals(priceStr)) { 44 | g.setPrice(priceStr); 45 | } else { 46 | g.setPrice("-1"); 47 | } 48 | g.setName(ele.findElement(By.className("p-name")).findElement(By.tagName("em")).getText()); 49 | g.setUrl(ele.findElement(By.className("p-name")).findElement(By.tagName("a")).getAttribute("href")); 50 | String commitCnt = ele.findElement(By.className("p-commit")).findElement(By.tagName("a")).getText(); 51 | if (!StringUtils.isBlank(commitCnt) && !"null".equals(commitCnt)) { 52 | g.setCommitCnt(commitCnt); 53 | } else { 54 | g.setCommitCnt("-1"); 55 | } 56 | } 57 | } else { 58 | log.info("无商品列表!"); 59 | } 60 | } catch (Exception e) { 61 | log.warn("爬取异常!!!"); 62 | } finally { 63 | if (driver != null) { 64 | driver.quit(); 65 | } 66 | } 67 | } 68 | 69 | /** 70 | * @Title: addGoods1 71 | * @Description: 新方法 72 | * @param @param url 73 | * @return void 74 | * @throws 75 | */ 76 | public static void addGoods1(String url) { 77 | try { 78 | Document document = Jsoup.parse(HttpUtil.sendGet(url)); 79 | Elements list = document.select("li.gl-item"); 80 | for (Element element : list) { 81 | String price = element.select(".p-price").select(".J_price").eq(0).select("i").text(); 82 | String title = element.select(".p-name a em").text(); 83 | String href = element.select(".p-name a").attr("href"); 84 | String commit = element.select(".p-commit a").text(); 85 | getGoodsDetail("http:" + href); 86 | } 87 | } catch (Exception e) { 88 | log.warn("爬取异常!!!"); 89 | } 90 | } 91 | 92 | public static void getGoodsDetail(String url) { 93 | 94 | try { 95 | 96 | String[] cateArrs = new String[5]; 97 | String[] scoreArrs = new 
String[3]; 98 | String[] imgArrs = new String[6]; 99 | 100 | Document document = Jsoup.parse(HttpUtil.sendGet(url)); 101 | Element cates = document.getElementById("crumb-wrap"); 102 | Elements elements = cates.getElementsByTag("a"); 103 | 104 | for (int i = 0; i < elements.size(); i++) { 105 | Element element = elements.get(i); 106 | String attr = element.attr("clstag"); 107 | if (!StringUtils.isEmpty(attr)) { 108 | String str = attr.substring(attr.length() - 7, attr.length()); 109 | if (str.contains("mbNav")) { 110 | cateArrs[i] = element.text() + "&"; 111 | } 112 | } 113 | Elements evaluateDetail = element.getElementsByClass("score-detail"); 114 | for (int j = 0; j < evaluateDetail.size(); j++) { 115 | scoreArrs[j] = evaluateDetail.get(j).getElementsByTag("em").text() + "&"; 116 | } 117 | } 118 | 119 | // http://img14.360buyimg.com/n5/jfs/t3817/341/1304817369/214434/516e0b83/582aaa29Nbb46e619.jpg 120 | 121 | Elements productIntroDoc = document.getElementsByClass("product-intro"); 122 | 123 | for (Element element : productIntroDoc) { 124 | Elements imgDoc = element.select("#spec-list img"); 125 | for (int i = 0; i < imgDoc.size(); i++) { 126 | imgArrs[i] = "http:" + imgDoc.get(i).attr("src"); 127 | } 128 | } 129 | 130 | } catch (Exception e) { 131 | e.printStackTrace(); 132 | log.warn("爬取异常!!!"); 133 | } 134 | } 135 | 136 | // 19664 6080 137 | public static void main(String[] args) { 138 | // https://list.jd.com/list.html?cat=1320,2641,2642 139 | long start = System.currentTimeMillis(); 140 | // getGoodsDetail("http://item.jd.com/10120597276.html"); 141 | addGoods1("https://list.jd.com/list.html?cat=1320,2641,2642"); 142 | long end = System.currentTimeMillis(); 143 | System.out.println(end - start); 144 | } 145 | 146 | } 147 | -------------------------------------------------------------------------------- /src/test/java/com/handx/jd/JdBee/TestWebcollector.java: -------------------------------------------------------------------------------- 1 | package 
com.handx.jd.JdBee; 2 | 3 | import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler; 4 | import cn.edu.hfut.dmic.webcollector.model.Links; 5 | import cn.edu.hfut.dmic.webcollector.model.Page; 6 | 7 | /** 8 | * @author handx 908716835@qq.com 9 | * @date 2017年6月1日 上午11:08:19 10 | */ 11 | 12 | public class TestWebcollector extends BreadthCrawler { 13 | 14 | public static void main(String[] args) throws Exception { 15 | TestWebcollector crawler = new TestWebcollector("data", true); 16 | crawler.addSeed("http://baidu.com/"); 17 | crawler.start(5); 18 | } 19 | 20 | public TestWebcollector(String crawlPath, boolean autoParse) { 21 | super(crawlPath, autoParse); 22 | } 23 | 24 | @Override 25 | public void visit(Page page, Links nextLinks) { 26 | System.out.println("正在抽取" + page.getUrl()); 27 | String title = page.getDoc().title(); 28 | System.out.println(title); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/test/java/com/handx/jd/JdBee/dao/TestGoodsDao.java: -------------------------------------------------------------------------------- 1 | package com.handx.jd.JdBee.dao; 2 | 3 | import com.jdbee.dao.GoodsDao; 4 | import com.jdbee.model.Goods; 5 | 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | import org.springframework.context.ApplicationContext; 9 | import org.springframework.context.support.ClassPathXmlApplicationContext; 10 | 11 | public class TestGoodsDao { 12 | 13 | GoodsDao goodsDao; 14 | 15 | @Before 16 | public void setUp() { 17 | ApplicationContext context = new ClassPathXmlApplicationContext("springJdbcContext.xml"); 18 | goodsDao = (GoodsDao) context.getBean("goodsDao"); 19 | } 20 | 21 | @Test 22 | public void testFindGoods() { 23 | String goodsName = goodsDao.getGoodsName(1L); 24 | System.out.println(goodsName); 25 | } 26 | 27 | @Test 28 | public void testSaveGoods() { 29 | Goods goods = new Goods(); 30 | goods.setPlatform("JD"); 31 | goods.setCommitCnt("10000"); 32 | 
goods.setName("AD钙奶"); 33 | goods.setPrice("100"); 34 | goods.setUrl("http://xxxxx"); 35 | goodsDao.createGoods(goods); 36 | } 37 | } 38 | --------------------------------------------------------------------------------