├── .settings
│   ├── org.eclipse.m2e.core.prefs
│   ├── org.eclipse.core.resources.prefs
│   └── org.eclipse.jdt.core.prefs
├── target
│   ├── classes
│   │   ├── cn
│   │   │   └── ivan
│   │   │       └── spider
│   │   │           ├── Spider.class
│   │   │           ├── UrlJob.class
│   │   │           ├── Spider$1.class
│   │   │           ├── UrlManager.class
│   │   │           ├── domain
│   │   │           │   └── Page.class
│   │   │           ├── utils
│   │   │           │   ├── Config.class
│   │   │           │   ├── HtmlUtils.class
│   │   │           │   ├── PageUtils.class
│   │   │           │   ├── DomainUtils.class
│   │   │           │   ├── HBaseUtils.class
│   │   │           │   ├── RedisUtils.class
│   │   │           │   └── SleepUtils.class
│   │   │           ├── SpiderWatcher.class
│   │   │           ├── store
│   │   │           │   ├── Storeable.class
│   │   │           │   ├── HBaseStoreable.class
│   │   │           │   └── ConsoleStoreable.class
│   │   │           ├── process
│   │   │           │   ├── Processable.class
│   │   │           │   └── JDProcessable.class
│   │   │           ├── reposity
│   │   │           │   ├── Repository.class
│   │   │           │   ├── QueueRepository.class
│   │   │           │   ├── RedisRepository.class
│   │   │           │   ├── RandomQueueRepository.class
│   │   │           │   └── RandomRedisRepository.class
│   │   │           └── download
│   │   │               ├── Downloadable.class
│   │   │               └── HttpClientDownable.class
│   │   ├── META-INF
│   │   │   ├── MANIFEST.MF
│   │   │   └── maven
│   │   │       └── Spider
│   │   │           └── spider
│   │   │               ├── pom.properties
│   │   │               └── pom.xml
│   │   └── log4j.properties
│   └── test-classes
│       └── cn
│           └── ivan
│               └── spider
│                   ├── SpiderTest.class
│                   └── CuratorTest.class
├── README.md
├── src
│   ├── main
│   │   ├── java
│   │   │   └── cn
│   │   │       └── ivan
│   │   │           └── spider
│   │   │               ├── store
│   │   │               │   ├── Storeable.java
│   │   │               │   ├── ConsoleStoreable.java
│   │   │               │   └── HBaseStoreable.java
│   │   │               ├── process
│   │   │               │   ├── Processable.java
│   │   │               │   └── JDProcessable.java
│   │   │               ├── download
│   │   │               │   ├── Downloadable.java
│   │   │               │   └── HttpClientDownable.java
│   │   │               ├── reposity
│   │   │               │   ├── Repository.java
│   │   │               │   ├── RedisRepository.java
│   │   │               │   ├── QueueRepository.java
│   │   │               │   ├── RandomRedisRepository.java
│   │   │               │   └── RandomQueueRepository.java
│   │   │               ├── utils
│   │   │               │   ├── Config.java
│   │   │               │   ├── SleepUtils.java
│   │   │               │   ├── DomainUtils.java
│   │   │               │   ├── HtmlUtils.java
│   │   │               │   ├── RedisUtils.java
│   │   │               │   ├── PageUtils.java
│   │   │               │   └── HBaseUtils.java
│   │   │               ├── UrlJob.java
│   │   │               ├── UrlManager.java
│   │   │               ├── domain
│   │   │               │   └── Page.java
│   │   │               ├── SpiderWatcher.java
│   │   │               └── Spider.java
│   │   └── resources
│   │       └── log4j.properties
│   └── test
│       └── java
│           └── cn
│               └── ivan
│                   └── spider
│                       ├── SpiderTest.java
│                       └── CuratorTest.java
├── .project
├── .classpath
└── pom.xml

--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# spider
A multi-threaded, distributed crawler written in Java with HttpClient and HtmlCleaner. It crawls product information from e-commerce sites, stores the data in HBase, builds a Solr index over the products, keeps a shared URL repository in Redis queues, and uses ZooKeeper to monitor the lifecycle of the crawler nodes.
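A minimal usage sketch, assembled from Spider.main and SpiderTest further down in this repo (the ZooKeeper, Redis, and HBase addresses are hard-coded in the sources as 192.168.57.133):

import cn.ivan.spider.Spider;
import cn.ivan.spider.process.JDProcessable;
import cn.ivan.spider.reposity.RedisRepository;
import cn.ivan.spider.store.HBaseStoreable;

public class QuickStart {
    public static void main(String[] args) {
        Spider spider = new Spider();                 // registers an ephemeral node under /spider in ZooKeeper
        spider.setProcessable(new JDProcessable());   // JD list/detail page parser
        spider.setStoreable(new HBaseStoreable());    // writes products to HBase, row keys to the solr_index list
        spider.setRepository(new RedisRepository());  // shared URL repository backed by Redis
        spider.setSeedUrl("http://list.jd.com/list.html?cat=9987,653,655");
        spider.start();                               // blocks, polling the repository with a 5-thread pool
    }
}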
--------------------------------------------------------------------------------
/target/classes/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
Manifest-Version: 1.0
Build-Jdk: 1.7.0_51
Built-By: Hades
Created-By: Maven Integration for Eclipse

--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/test/java=UTF-8
encoding/=UTF-8
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/store/Storeable.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.store;

import cn.ivan.spider.domain.Page;

public interface Storeable {
    public void store(Page page);
}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/process/Processable.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.process;

import cn.ivan.spider.domain.Page;

public interface Processable {
    public void process(Page page);
}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/download/Downloadable.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.download;

import cn.ivan.spider.domain.Page;

public interface Downloadable {
    public Page download(String url);
}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/reposity/Repository.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.reposity;

public interface Repository {

    String poll();

    void addHigh(String nextUrl);

    void add(String nextUrl);

}
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
org.eclipse.jdt.core.compiler.compliance=1.5
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.5

--------------------------------------------------------------------------------
/target/classes/META-INF/maven/Spider/spider/pom.properties:
--------------------------------------------------------------------------------
#Generated by Maven Integration for Eclipse
#Sat Jan 09 08:17:32 CST 2016
version=0.0.1-SNAPSHOT
groupId=Spider
m2e.projectName=spider
m2e.projectLocation=C\:\\Users\\Hades\\Documents\\GitHub\\spider
artifactId=spider

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/store/ConsoleStoreable.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.store;

import cn.ivan.spider.domain.Page;

public class ConsoleStoreable implements Storeable {

    public void store(Page page) {
        System.out.println(page.getUrl() + "--" + page.getValues().get("price"));
    }

}

--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootLogger=INFO,stdout

log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/utils/Config.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.utils;

/**
 * Collects the parameters that are likely to change into a single class.
 * Note: the values are hard-coded here; in a real deployment they should be
 * read from a configuration file (for example with the Properties class)
 * or from a database.
 * @author Administrator
 */
public class Config {
    public static long MILLION_1 = 1000;
    public static long MILLION_5 = 5000;

    public static int THREADNUM = 5;
}
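Config's comment says these values should come from a properties file via java.util.Properties rather than being hard-coded. A minimal sketch of that, assuming a hypothetical spider.properties resource and key names that are not part of this repo:

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import cn.ivan.spider.utils.Config;

public class ConfigLoader {
    public static void load() throws IOException {
        Properties props = new Properties();
        // Hypothetical resource name; the repo ships no such file.
        try (InputStream in = ConfigLoader.class.getResourceAsStream("/spider.properties")) {
            if (in != null) {
                props.load(in);
            }
        }
        Config.THREADNUM = Integer.parseInt(props.getProperty("spider.threadnum", "5"));
        Config.MILLION_1 = Long.parseLong(props.getProperty("spider.sleep.short", "1000"));
        Config.MILLION_5 = Long.parseLong(props.getProperty("spider.sleep.long", "5000"));
    }
}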
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/download/HttpClientDownable.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.download;

import cn.ivan.spider.domain.Page;
import cn.ivan.spider.utils.PageUtils;

public class HttpClientDownable implements Downloadable {

    public Page download(String url) {
        Page page = new Page();
        page.setUrl(url);
        page.setContent(PageUtils.getContent(url));
        return page;
    }

}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/utils/SleepUtils.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SleepUtils {
    static Logger logger = LoggerFactory.getLogger(SleepUtils.class);

    public static void sleep(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            logger.error("thread sleep was interrupted");
        }
    }
}

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>spider</name>
	<comment></comment>
	<projects>
	</projects>
	<buildSpec>
		<buildCommand>
			<name>org.eclipse.jdt.core.javabuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
		<buildCommand>
			<name>org.eclipse.m2e.core.maven2Builder</name>
			<arguments>
			</arguments>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.eclipse.jdt.core.javanature</nature>
		<nature>org.eclipse.m2e.core.maven2Nature</nature>
	</natures>
</projectDescription>

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/reposity/RedisRepository.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.reposity;

import cn.ivan.spider.utils.RedisUtils;

public class RedisRepository implements Repository {
    RedisUtils redisUtils = new RedisUtils();

    public String poll() {
        String url = redisUtils.poll(RedisUtils.HIGHKEY);
        if (url == null) {
            url = redisUtils.poll(RedisUtils.LOWKEY);
        }
        return url;
    }

    public void addHigh(String nextUrl) {
        redisUtils.add(RedisUtils.HIGHKEY, nextUrl);
    }

    public void add(String nextUrl) {
        redisUtils.add(RedisUtils.LOWKEY, nextUrl);
    }

}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/UrlJob.java:
--------------------------------------------------------------------------------
package cn.ivan.spider;

import java.util.List;

import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;

import cn.ivan.spider.utils.RedisUtils;

/**
 * Copies the seed URLs into the high-priority Redis queue.
 * @author Hades
 */
public class UrlJob implements Job {
    RedisUtils redisUtils = new RedisUtils();

    public void execute(JobExecutionContext arg0) throws JobExecutionException {
        List<String> list = redisUtils.lrange(RedisUtils.START_URL, 0, -1);
        for (String url : list) {
            redisUtils.add(RedisUtils.HIGHKEY, url);
        }
    }

}
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/utils/DomainUtils.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.utils;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DomainUtils {

    /**
     * Gets the top-level (registrable) domain of a URL.
     * @param url
     * @return
     */
    public static String getTopDomain(String url) {
        try {
            String host = new URL(url).getHost().toLowerCase(); // normalize the host to lower case
            Pattern pattern = Pattern.compile("[^\\.]+(\\.com\\.cn|\\.net\\.cn|\\.org\\.cn|\\.gov\\.cn|\\.com|\\.net|\\.cn|\\.org|\\.cc|\\.me|\\.tel|\\.mobi|\\.asia|\\.biz|\\.info|\\.name|\\.tv|\\.hk|\\.公司|\\.中国|\\.网络)");
            Matcher matcher = pattern.matcher(host);
            while (matcher.find()) {
                return matcher.group();
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        return null;
    }
}
--------------------------------------------------------------------------------
/src/test/java/cn/ivan/spider/SpiderTest.java:
--------------------------------------------------------------------------------
package cn.ivan.spider;

import org.junit.Test;

import cn.ivan.spider.domain.Page;
import cn.ivan.spider.download.HttpClientDownable;
import cn.ivan.spider.process.JDProcessable;
import cn.ivan.spider.store.ConsoleStoreable;

public class SpiderTest {
    @Test
    public void testName() throws Exception {
        Spider spider = new Spider();
        String url = "http://list.jd.com/list.html?cat=9987,653,655";
        spider.setDownloadable(new HttpClientDownable());
        // download the page
        Page page = spider.download(url);
        // parse everything out of the page
        spider.setProcessable(new JDProcessable());
        spider.process(page);
        for (String s : page.getUrls()) {
            System.out.println(s);
        }
        // System.out.println(page.getValues().get("spec"));
        spider.setStoreable(new ConsoleStoreable());
        spider.store(page);
    }
}

--------------------------------------------------------------------------------
/src/test/java/cn/ivan/spider/CuratorTest.java:
--------------------------------------------------------------------------------
package cn.ivan.spider;

import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.zookeeper.CreateMode;
import org.junit.Test;

public class CuratorTest {
    @Test
    public void test() throws Exception {
        // obtain a Curator client
        String connectString = "192.168.57.133:2181";
        RetryPolicy retryPolicy = new ExponentialBackoffRetry(3000, 3);
        CuratorFramework client = CuratorFrameworkFactory.newClient(connectString, retryPolicy);

        client.start();
        String forPath = client.create()
                .creatingParentsIfNeeded()      // create the parent node if it does not exist
                .withMode(CreateMode.EPHEMERAL) // make it an ephemeral node
                .forPath("/spider/192.168.57.1");
        System.out.println(forPath);
    }
}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/reposity/QueueRepository.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.reposity;

import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * @author ivan Email:seeHades@163.com
 * @date 2016-01-07 00:45:11
 * @desc Priority-queue URL repository.
 */
public class QueueRepository implements Repository {
    /**
     * High-priority queue.
     */
    Queue<String> highQueue = new ConcurrentLinkedQueue<String>();
    /**
     * Low-priority queue.
     */
    Queue<String> lowQueue = new ConcurrentLinkedQueue<String>();

    /**
     * Takes a URL from the repository: from the high-priority queue first,
     * falling back to the low-priority queue when it is empty.
     */
    public String poll() {
        String url = highQueue.poll();
        if (url == null) {
            url = lowQueue.poll();
        }
        return url;
    }

    /**
     * Puts a URL into the high-priority queue.
     */
    public void addHigh(String nextUrl) {
        highQueue.add(nextUrl);
    }

    /**
     * Puts a URL into the low-priority queue.
     */
    public void add(String nextUrl) {
        lowQueue.add(nextUrl);
    }

}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/UrlManager.java:
--------------------------------------------------------------------------------
package cn.ivan.spider;

import org.quartz.CronTrigger;
import org.quartz.JobDetail;
import org.quartz.Scheduler;
import org.quartz.Trigger;
import org.quartz.impl.StdSchedulerFactory;

/**
 * URL scheduling class.
 * Adds the seed URLs to the URL repository on a daily schedule,
 * which guarantees that the crawler re-crawls all data once a day.
 * @author Administrator
 */
public class UrlManager {
    public static void main(String[] args) {
        try {
            // obtain a default scheduler
            Scheduler defaultScheduler = StdSchedulerFactory.getDefaultScheduler();
            // start the scheduler
            defaultScheduler.start();
            String simpleName = UrlJob.class.getSimpleName();
            // the job to schedule
            JobDetail jobDetail = new JobDetail(simpleName, Scheduler.DEFAULT_GROUP, UrlJob.class);
            // when to run it: once a day at 01:00
            Trigger trigger = new CronTrigger(simpleName, Scheduler.DEFAULT_GROUP, "00 00 01 * * ?");
            defaultScheduler.scheduleJob(jobDetail, trigger);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
<!-- classpath entries not recoverable from this dump -->

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/reposity/RandomRedisRepository.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.reposity;

import java.util.HashMap;
import java.util.Map;
import java.util.Random;

import cn.ivan.spider.utils.DomainUtils;
import cn.ivan.spider.utils.RedisUtils;

/**
 * Picks a random e-commerce site and takes one of its URLs from Redis.
 * @author Hades
 */
public class RandomRedisRepository implements Repository {
    RedisUtils redisUtils = new RedisUtils();
    Map<String, String> hashMap = new HashMap<String, String>();
    Random random = new Random();

    public String poll() {
        String[] keyArray = hashMap.keySet().toArray(new String[0]);
        if (keyArray.length == 0) {
            return null; // no site has been registered yet
        }
        int nextInt = random.nextInt(keyArray.length);
        String key = keyArray[nextInt];
        String value = hashMap.get(key);
        return redisUtils.poll(value);
    }

    public void addHigh(String nextUrl) {
        String topDomain = DomainUtils.getTopDomain(nextUrl);
        String value = hashMap.get(topDomain);
        if (value == null) {
            value = topDomain;
            hashMap.put(topDomain, value);
        }
        redisUtils.add(topDomain, nextUrl);
    }

    public void add(String nextUrl) {
        addHigh(nextUrl);
    }

}
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/utils/HtmlUtils.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.utils;

import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

/**
 * Utilities for working with HTML pages.
 * @author Hades
 */
public class HtmlUtils {
    /**
     * Gets the tag node selected by an XPath expression.
     * @param tagNode
     * @param xPath
     * @return
     */
    public static TagNode getTagNodeByXpath(TagNode tagNode, String xPath) {
        Object[] evaluateXPath;
        TagNode node = null;
        try {
            evaluateXPath = tagNode.evaluateXPath(xPath);
            if (evaluateXPath != null && evaluateXPath.length > 0) {
                node = (TagNode) evaluateXPath[0];
            }
        } catch (XPatherException e) {
            e.printStackTrace();
        }

        return node;
    }

    /**
     * Gets the text of the selected tag.
     * @param tagNode
     * @param xPath
     * @return
     */
    public static String getText(TagNode tagNode, String xPath) {
        return getTagNodeByXpath(tagNode, xPath).getText().toString();
    }

    /**
     * Gets the value of an attribute of the selected tag.
     * @param tagNode
     * @param xPath
     * @param attr
     * @return
     */
    public static String getAttributeByAttr(TagNode tagNode, String xPath, String attr) {
        TagNode node = getTagNodeByXpath(tagNode, xPath);
        return node.getAttributeByName(attr);
    }
}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/store/HBaseStoreable.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.store;

import java.util.Map;

import cn.ivan.spider.domain.Page;
import cn.ivan.spider.utils.HBaseUtils;
import cn.ivan.spider.utils.RedisUtils;

public class HBaseStoreable implements Storeable {
    HBaseUtils hbaseUtils = new HBaseUtils();
    RedisUtils redisUtils = new RedisUtils();

    public void store(Page page) {
        String rowKey = page.getGoodId();
        Map<String, String> values = page.getValues();
        try {
            hbaseUtils.put(HBaseUtils.TABLE_NAME, rowKey, HBaseUtils.COLUMNFAMILY_1, HBaseUtils.COLUMNFAMILY_1_DATA_URL, page.getUrl());
            hbaseUtils.put(HBaseUtils.TABLE_NAME, rowKey, HBaseUtils.COLUMNFAMILY_1, HBaseUtils.COLUMNFAMILY_1_PIC_URL, values.get("pic_url"));
            hbaseUtils.put(HBaseUtils.TABLE_NAME, rowKey, HBaseUtils.COLUMNFAMILY_1, HBaseUtils.COLUMNFAMILY_1_PRICE, values.get("price"));
            hbaseUtils.put(HBaseUtils.TABLE_NAME, rowKey, HBaseUtils.COLUMNFAMILY_1, HBaseUtils.COLUMNFAMILY_1_TITLE, values.get("title"));
            hbaseUtils.put(HBaseUtils.TABLE_NAME, rowKey, HBaseUtils.COLUMNFAMILY_2, HBaseUtils.COLUMNFAMILY_2_PARAM, values.get("spec"));
        } catch (Exception e) {
            e.printStackTrace();
        }

        redisUtils.add("solr_index", rowKey);
    }

}
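HBaseStoreable pushes each stored row key onto a solr_index list in Redis, and the README mentions Solr indexing, but no indexer ships in this repo. A hedged sketch of what a consumer might look like with SolrJ 4.x (not a declared dependency here; the core URL and field name are assumptions):

import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;

import cn.ivan.spider.utils.RedisUtils;

public class SolrIndexJob {
    public static void main(String[] args) throws Exception {
        RedisUtils redisUtils = new RedisUtils();
        // Assumed Solr core URL; adjust to the actual deployment.
        HttpSolrServer solr = new HttpSolrServer("http://192.168.57.133:8983/solr/spider");
        String rowKey;
        while ((rowKey = redisUtils.poll("solr_index")) != null) {
            // A real indexer would also fetch title/price from HBase by rowKey here.
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("id", rowKey);
            solr.add(doc);
        }
        solr.commit();
    }
}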
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/domain/Page.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.domain;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Holds the raw data of one page.
 * @author Hades
 */
public class Page {
    /**
     * The content of the product page.
     */
    private String content;
    /**
     * The product URL.
     */
    private String url;
    /**
     * Basic product information.
     */
    private Map<String, String> values = new HashMap<String, String>();

    /**
     * URLs parsed out of the page.
     */
    private List<String> urls = new ArrayList<String>();

    private String goodId;

    public void addUrl(String url) {
        this.urls.add(url);
    }

    public List<String> getUrls() {
        return urls;
    }

    public void setUrls(List<String> urls) {
        this.urls = urls;
    }

    public void setValues(Map<String, String> values) {
        this.values = values;
    }

    public String getGoodId() {
        return goodId;
    }

    public void setGoodId(String goodId) {
        this.goodId = goodId;
    }

    public Map<String, String> getValues() {
        return values;
    }

    public void addField(String key, String value) {
        values.put(key, value);
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/utils/RedisUtils.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.utils;

import java.util.List;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

public class RedisUtils {
    public static String START_URL = "cn.spider.start_url";
    public static String HIGHKEY = "spider.todo.high";
    public static String LOWKEY = "spider.todo.low";

    JedisPool jedisPool = null;

    /**
     * Constructor: initializes the Redis pool configuration.
     */
    public RedisUtils() {
        JedisPoolConfig poolConfig = new JedisPoolConfig();
        poolConfig.setMaxIdle(10);
        poolConfig.setMaxTotal(100);
        poolConfig.setMaxWaitMillis(10000);
        poolConfig.setTestOnBorrow(true);
        jedisPool = new JedisPool(poolConfig, "192.168.57.133", 6379);
    }

    /**
     * Wraps Redis's lrange command.
     * @param key
     * @param start
     * @param end
     * @return
     */
    public List<String> lrange(String key, int start, int end) {
        Jedis resource = jedisPool.getResource();
        List<String> lrange = resource.lrange(key, start, end);
        jedisPool.returnResourceObject(resource);
        return lrange;
    }

    public void add(String key, String url) {
        Jedis resource = jedisPool.getResource();
        resource.lpush(key, url);
        jedisPool.returnResourceObject(resource);
    }

    public String poll(String key) {
        Jedis resource = jedisPool.getResource();
        String result = resource.rpop(key);
        jedisPool.returnResourceObject(resource);
        return result;
    }
}
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/reposity/RandomQueueRepository.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.reposity;

import java.util.HashMap;
import java.util.Map;
import java.util.Queue;
import java.util.Random;
import java.util.concurrent.ConcurrentLinkedQueue;

import cn.ivan.spider.utils.DomainUtils;

/**
 * @author ivan Email:seeHades@163.com
 * @date 2016-01-07 00:49:20
 * @desc Uses in-memory queues to hand out a random e-commerce site's URL.
 */
public class RandomQueueRepository implements Repository {
    /**
     * One URL queue per e-commerce site, keyed by the site's top-level domain.
     * Intended for the distributed crawler, but plain in-memory queues cannot
     * be shared between nodes; that problem is solved later with Redis.
     */
    Map<String, Queue<String>> hashMap = new HashMap<String, Queue<String>>();
    Random random = new Random();

    /**
     * @Title: poll
     * @Description: takes a URL from a randomly chosen site.
     * @return
     */
    public String poll() {
        String[] keyArray = hashMap.keySet().toArray(new String[0]);
        if (keyArray.length == 0) {
            return null; // no site has been registered yet
        }
        int nextInt = random.nextInt(keyArray.length);
        String key = keyArray[nextInt]; // pick a random site domain
        Queue<String> queue = hashMap.get(key);
        return queue.poll();
    }

    /**
     * @Title: addHigh
     * @Description: adds a URL to the repository, using the URL's top-level
     *               domain as the queue key.
     * @param nextUrl
     */
    public void addHigh(String nextUrl) {
        String topDomain = DomainUtils.getTopDomain(nextUrl);
        Queue<String> queue = hashMap.get(topDomain);
        if (queue == null) {
            queue = new ConcurrentLinkedQueue<String>();
            hashMap.put(topDomain, queue);
        }
        queue.add(nextUrl);
    }

    public void add(String nextUrl) {
        addHigh(nextUrl);
    }

}

--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/utils/PageUtils.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.utils;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.HttpHostConnectException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Page helper class.
 * @author Hades
 */
public class PageUtils {
    static Logger logger = LoggerFactory.getLogger(PageUtils.class);

    /**
     * Fetches the page content for a URL.
     * @param url
     * @return
     */
    public static String getContent(String url) {
        HttpClientBuilder builder = HttpClients.custom();
        /*
         * Proxy setup. Note: the proxy ip and port must not be hard-coded here,
         * because proxies can fail at any time. Keep a proxy pool in a Redis
         * list, one entry per proxy in the form ip:port; take an entry when
         * needed and split it on ':' to get the ip and port.
         */
        String proxy_ip = "110.73.3.225";
        int proxy_port = 8000;
        HttpHost proxy = new HttpHost(proxy_ip, proxy_port);
        // build the httpclient instance
        CloseableHttpClient client = builder/*.setProxy(proxy)*/.build();
        HttpGet request = new HttpGet(url);
        String content = null;
        try {
            long start_time = System.currentTimeMillis();
            CloseableHttpResponse response = client.execute(request);
            HttpEntity entity = response.getEntity();
            content = EntityUtils.toString(entity);
            logger.info("thread id: {}, page downloaded, url: {}, took {} ms", Thread.currentThread().getId(), url, System.currentTimeMillis() - start_time);
        } catch (HttpHostConnectException e) {
            logger.error("proxy failed, ip: {}, port: {}", proxy_ip, proxy_port);
        } catch (Exception e) {
            logger.error("page download failed, url: {}", url);
        }

        return content;
    }
}
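The comment in getContent describes keeping proxies in a Redis list as ip:port and drawing one per request. A minimal sketch of that lookup, assuming a hypothetical proxy_pool key that is not part of this repo:

import org.apache.http.HttpHost;

import redis.clients.jedis.Jedis;

public class ProxyPool {
    public static HttpHost borrowProxy() {
        Jedis jedis = new Jedis("192.168.57.133", 6379);
        try {
            // Entries are stored as "ip:port"; returns null when the pool is empty.
            String entry = jedis.rpop("proxy_pool");
            if (entry == null) {
                return null;
            }
            String[] parts = entry.split(":");
            return new HttpHost(parts[0], Integer.parseInt(parts[1]));
        } finally {
            jedis.close();
        }
    }
}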
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/SpiderWatcher.java:
--------------------------------------------------------------------------------
package cn.ivan.spider;

import java.util.List;

import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;

/**
 * Uses Curator to monitor the lifecycle of the crawler nodes.
 * @author Hades
 */
public class SpiderWatcher implements Watcher {
    private CuratorFramework client;
    List<String> children;

    public SpiderWatcher() {
        String connectString = "192.168.57.133:2181";
        RetryPolicy retryPolicy = new ExponentialBackoffRetry(3000, 3);
        client = CuratorFrameworkFactory.newClient(connectString, retryPolicy);
        client.start();
        // register a watcher on the parent node to monitor, /spider
        try {
            children = client.getChildren().usingWatcher(this).forPath("/spider");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Called when the children of the watched parent node change.
     *
     * Note: a watcher fires only once per registration, so it has to be
     * re-registered here to keep watching.
     */
    public void process(WatchedEvent arg0) {
        try {
            List<String> newChildren = client.getChildren().usingWatcher(this).forPath("/spider");
            for (String node : children) {
                if (!newChildren.contains(node)) {
                    System.out.println("node disappeared: " + node);
                    // TODO -- notify the administrator here
                    /*
                     * e-mail: use javamail
                     *
                     * SMS: use Yunpian
                     */
                }
            }
            for (String node : newChildren) {
                if (!children.contains(node)) {
                    System.out.println("new node: " + node);
                }
            }

            children = newChildren;
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        SpiderWatcher spiderWatcher = new SpiderWatcher();
        spiderWatcher.start();
    }

    /**
     * Keeps the process alive.
     */
    private void start() {
        while (true) {
            ;
        }
    }

}
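start() keeps the JVM alive with a busy spin. A gentler variant that parks the main thread instead, as a sketch placed alongside SpiderWatcher in cn.ivan.spider:

package cn.ivan.spider;

import java.util.concurrent.CountDownLatch;

public class WatcherMain {
    public static void main(String[] args) throws InterruptedException {
        new SpiderWatcher(); // the constructor registers the watcher on /spider
        // Block forever without burning a CPU core; the ZooKeeper event
        // thread does all the work from here on.
        new CountDownLatch(1).await();
    }
}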
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>Spider</groupId>
  <artifactId>spider</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>spider</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.7</version>
      <scope>system</scope>
      <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.4</version>
    </dependency>
    <dependency>
      <groupId>net.sourceforge.htmlcleaner</groupId>
      <artifactId>htmlcleaner</artifactId>
      <version>2.10</version>
    </dependency>
    <dependency>
      <groupId>org.json</groupId>
      <artifactId>json</artifactId>
      <version>20140107</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>0.98.8-hadoop2</version>
    </dependency>
    <dependency>
      <groupId>redis.clients</groupId>
      <artifactId>jedis</artifactId>
      <version>2.7.0</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.10</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.10</version>
    </dependency>
    <dependency>
      <groupId>org.quartz-scheduler</groupId>
      <artifactId>quartz</artifactId>
      <version>1.8.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-framework</artifactId>
      <version>2.7.0</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <encoding>UTF-8</encoding>
          <source>1.7</source>
          <target>1.7</target>
          <showWarnings>true</showWarnings>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>cn.crxy.Spider_10.Spider</mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/process/JDProcessable.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.process;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.json.JSONArray;
import org.json.JSONObject;

import cn.ivan.spider.domain.Page;
import cn.ivan.spider.utils.HtmlUtils;
import cn.ivan.spider.utils.PageUtils;

public class JDProcessable implements Processable {

    public void process(Page page) {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        // let htmlcleaner parse the page
        TagNode rootNode = htmlCleaner.clean(page.getContent());
        if (page.getUrl().startsWith("http://item.jd.com")) { // a product detail page
            processProduct(page, rootNode);
        } else { // a list page: collect its URLs
            String next_url = HtmlUtils.getAttributeByAttr(rootNode, "//*[@id=\"J_topPage\"]/a[2]", "href");
            if (!next_url.equals("javascript:;")) {
                System.out.println("http://list.jd.com" + next_url.replace("&amp;", "&"));
                String x = "http://list.jd.com" + next_url.replace("&amp;", "&");
                page.addUrl(x);
            }
            try {
                Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"plist\"]/ul/li/div/div[1]/a");
                for (Object object : evaluateXPath) {
                    TagNode tagNode = (TagNode) object;
                    String goodsUrl = tagNode.getAttributeByName("href");
                    page.addUrl(goodsUrl);
                }
            } catch (XPatherException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses the product details.
     * @param page
     * @param rootNode
     */
    private void processProduct(Page page, TagNode rootNode) {
        try {
            // title
            page.addField("title", HtmlUtils.getText(rootNode, "//*[@id=\"name\"]/h1"));
            // picture url
            page.addField("pic_url", "http:" + HtmlUtils.getAttributeByAttr(rootNode, "//*[@id=\"spec-n1\"]/img", "src"));
            // Price: note the price is written into the page by an async ajax
            // request, so it cannot be read with a plain xpath:
            /*
             * evaluateXPath = rootNode.evaluateXPath("//*[@id=\"jd-price\"]");
             * if (evaluateXPath != null && evaluateXPath.length > 0) {
             *     TagNode priceNode = (TagNode) evaluateXPath[0];
             *     System.out.println("--" + priceNode.getText().toString() + "--");
             * }
             */
            // get the product id
            String url = page.getUrl();
            Pattern compile = Pattern.compile("http://item.jd.com/([0-9]+).html");
            Matcher matcher = compile.matcher(url);
            String goodId = null;
            if (matcher.find()) {
                goodId = matcher.group(1);
                page.setGoodId("JD_" + goodId);
            }

            // fetch the price; the product id in this url must not be hard-coded
            String price_object = PageUtils.getContent("http://p.3.cn/prices/get?skuid=J_" + goodId);
            // the response is a json array
            JSONArray jsonArray = new JSONArray(price_object);
            JSONObject jsonObject = jsonArray.getJSONObject(0);
            float price = Float.parseFloat(jsonObject.getString("p"));
            page.addField("price", "" + price);
            // specification parameters
            Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"product-detail-2\"]/table/tbody/tr");
            JSONArray jsonArray2 = new JSONArray();
            for (Object object : evaluateXPath) {
                TagNode trNode = (TagNode) object;
                if (!trNode.getText().toString().trim().equals("")) { // skip empty tr tags
                    JSONObject jsonObject2 = new JSONObject();
                    Object[] evaluateXPath2 = trNode.evaluateXPath("//th");
                    if (evaluateXPath2 != null && evaluateXPath2.length > 0) {
                        // the tr contains a th
                        TagNode thNode = (TagNode) evaluateXPath2[0];
                        jsonObject2.put("name", "");
                        jsonObject2.put("value", thNode.getText().toString());
                    } else {
                        evaluateXPath2 = trNode.evaluateXPath("//td");
                        // the tr contains td cells
                        TagNode tdNode1 = (TagNode) evaluateXPath2[0];
                        TagNode tdNode2 = (TagNode) evaluateXPath2[1];
                        jsonObject2.put("name", tdNode1.getText().toString());
                        jsonObject2.put("value", tdNode2.getText().toString());
                    }
                    jsonArray2.put(jsonObject2);
                }
            }
            page.addField("spec", jsonArray2.toString());
            /*
             * evaluateXPath = rootNode.evaluateXPath("//*[@id=\"product-detail-2\"]/table/tbody/tr");
             * if (evaluateXPath != null && evaluateXPath.length > 0) {
             *     for (int i = 0; i < evaluateXPath.length; i++) {
             *         TagNode trNode = (TagNode) evaluateXPath[i];
             *         TagNode[] allElements = trNode.getAllElements(true);
             *         for (TagNode tagNode : allElements) {
             *             System.out.print(tagNode.getText().toString() + ":");
             *         }
             *         System.out.println();
             *     }
             *     System.out.println();
             * }
             */
        } catch (XPatherException e) {
            e.printStackTrace();
        }
    }

}
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/Spider.java:
--------------------------------------------------------------------------------
package cn.ivan.spider;

import java.net.InetAddress;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.commons.lang.StringUtils;
import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.zookeeper.CreateMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.ivan.spider.domain.Page;
import cn.ivan.spider.download.Downloadable;
import cn.ivan.spider.download.HttpClientDownable;
import cn.ivan.spider.process.JDProcessable;
import cn.ivan.spider.process.Processable;
import cn.ivan.spider.reposity.QueueRepository;
import cn.ivan.spider.reposity.RedisRepository;
import cn.ivan.spider.reposity.Repository;
import cn.ivan.spider.store.ConsoleStoreable;
import cn.ivan.spider.store.HBaseStoreable;
import cn.ivan.spider.store.Storeable;
import cn.ivan.spider.utils.Config;
import cn.ivan.spider.utils.SleepUtils;

public class Spider {
    Logger logger = LoggerFactory.getLogger(Spider.class);
    Downloadable downloadable = new HttpClientDownable();
    Processable processable;
    Storeable storeable = new ConsoleStoreable();
    Repository repository = new QueueRepository();
    /**
     * A fixed thread pool of size 5.
     */
    ExecutorService newFixedThreadPool = Executors.newFixedThreadPool(Config.THREADNUM);

    public Spider() {
        String connectString = "192.168.57.133:2181";
        RetryPolicy retryPolicy = new ExponentialBackoffRetry(3000, 3);
        CuratorFramework client = CuratorFrameworkFactory.newClient(connectString, retryPolicy);

        client.start();
        try {
            InetAddress localHost = InetAddress.getLocalHost();
            String ip = localHost.getHostAddress();
            String forPath = client.create()
                    .creatingParentsIfNeeded()      // create the parent node if it does not exist
                    .withMode(CreateMode.EPHEMERAL) // ephemeral, so the node vanishes when this crawler dies
                    .forPath("/spider/" + ip);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void start() {
        check();
        logger.info("starting the spider...");
        while (true) {
            final String url = repository.poll();
            if (StringUtils.isNotBlank(url)) {
                newFixedThreadPool.execute(new Runnable() {
                    public void run() {
                        Page page = Spider.this.download(url);
                        List<String> urls = page.getUrls();
                        Spider.this.process(page);
                        for (String nextUrl : urls) {
                            if (nextUrl.startsWith("http://list.jd.com/")) {
                                repository.addHigh(nextUrl);
                            } else {
                                repository.add(nextUrl);
                            }
                        }
                        // only detail pages (which yield no new urls) get stored
                        if (urls == null || urls.size() == 0) {
                            Spider.this.store(page);
                        }
                        SleepUtils.sleep(Config.MILLION_1);
                    }
                });
            } else {
                logger.info("no url available, taking a break...");
                SleepUtils.sleep(Config.MILLION_5);
            }
        }
    }

    /**
     * Configuration check.
     */
    private void check() {
        if (processable == null) {
            String error_message = "no parser (Processable) configured!";
            logger.error(error_message);
            throw new RuntimeException(error_message);
        }

        logger.info("===================================================");
        logger.info("download implementation: {}", downloadable.getClass().getName());
        logger.info("parse implementation: {}", processable.getClass().getName());
        logger.info("store implementation: {}", storeable.getClass().getName());
        logger.info("url repository implementation: {}", repository.getClass().getName());
        logger.info("===================================================");
    }

    public Storeable getStoreable() {
        return storeable;
    }

    public void setStoreable(Storeable storeable) {
        this.storeable = storeable;
    }

    public void setDownloadable(Downloadable downloadable) {
        this.downloadable = downloadable;
    }

    /**
     * Downloads a page.
     * @param url
     * @return
     */
    public Page download(String url) {
        Page page = this.downloadable.download(url);
        page.setUrl(url);
        return page;
    }

    /**
     * Parses a page.
     * @param page
     */
    public void process(Page page) {
        this.processable.process(page);
    }

    public Processable getProcessable() {
        return processable;
    }

    public void setProcessable(Processable processable) {
        this.processable = processable;
    }

    public Repository getRepository() {
        return repository;
    }

    public void setRepository(Repository repository) {
        this.repository = repository;
    }

    public Downloadable getDownloadable() {
        return downloadable;
    }

    public void store(Page page) {
        this.storeable.store(page);
    }

    public void setSeedUrl(String url) {
        this.repository.addHigh(url);
    }

    public static void main(String[] args) {
        Spider spider = new Spider();
        String url = "http://list.jd.com/list.html?cat=9987,653,655";
        spider.setProcessable(new JDProcessable());
        spider.setStoreable(new HBaseStoreable());
        spider.setRepository(new RedisRepository());
        // spider.setDownloadable(new HttpClientDownable());
        // spider.setStoreable(new ConsoleStoreable());
        spider.setSeedUrl(url);
        spider.start();
    }
}
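HBaseUtils below only writes. Reading a stored product back under the same table layout might look like this sketch, which is not part of the repo (Get, Result, and getValue are standard hbase-client 0.98 API):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

import cn.ivan.spider.utils.HBaseUtils;

public class ProductReader {
    public static String readTitle(Configuration conf, String goodId) throws Exception {
        HTable table = new HTable(conf, HBaseUtils.TABLE_NAME);
        try {
            // Row keys are the product ids written by HBaseStoreable, e.g. "JD_123456".
            Result result = table.get(new Get(Bytes.toBytes(goodId)));
            return Bytes.toString(result.getValue(
                    Bytes.toBytes(HBaseUtils.COLUMNFAMILY_1),
                    Bytes.toBytes(HBaseUtils.COLUMNFAMILY_1_TITLE)));
        } finally {
            table.close();
        }
    }
}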
--------------------------------------------------------------------------------
/src/main/java/cn/ivan/spider/utils/HBaseUtils.java:
--------------------------------------------------------------------------------
package cn.ivan.spider.utils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.filter.RegexStringComparator;
import org.apache.hadoop.hbase.filter.RowFilter;

public class HBaseUtils {
    /**
     * Table name.
     */
    public static String TABLE_NAME = "spider";
    /**
     * Column family 1: product info.
     */
    public static String COLUMNFAMILY_1 = "goodinfo";
    /**
     * Columns of family 1.
     */
    public static String COLUMNFAMILY_1_PRICE = "price";
    public static String COLUMNFAMILY_1_PIC_URL = "pic_url";
    public static String COLUMNFAMILY_1_DATA_URL = "data_url";
    public static String COLUMNFAMILY_1_TITLE = "title";

    /**
     * Column family 2: product spec parameters.
     */
    public static String COLUMNFAMILY_2 = "spec";
    public static String COLUMNFAMILY_2_PARAM = "param";

    HBaseAdmin admin = null;
    Configuration conf = null;

    /**
     * Constructor: loads the configuration.
     */
    public HBaseUtils() {
        conf = new Configuration();
        conf.set("hbase.zookeeper.quorum", "192.168.57.133:2181");
        conf.set("hbase.rootdir", "hdfs://192.168.57.133:9000/hbase");
        try {
            admin = new HBaseAdmin(conf);
        } catch (MasterNotRunningException e) {
            e.printStackTrace();
        } catch (ZooKeeperConnectionException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        HBaseUtils hbase = new HBaseUtils();
        // create a table
        // hbase.createTable("stu", "cf");
        // list all table names
        // hbase.getALLTable();
        // add one record
        // hbase.addOneRecord("stu", "key1", "cf", "name", "zhangsan");
        // hbase.addOneRecord("stu", "key1", "cf", "age", "24");
        // query one record
        // hbase.getKey("stu", "key1");
        // get all rows of a table
        // hbase.getALLData("stu");
        // delete one record
        // hbase.deleteOneRecord("stu", "key1");
        // drop the table
        // hbase.deleteTable("stu");
        // scan filter usage
        // hbase.getScanData("stu", "cf", "age");
        // rowFilter usage
        // 84138413_20130313145955
    }

    /**
     * RowFilter usage.
     * @param tableName
     * @param reg
     * @throws Exception
     */
    public void getRowFilter(String tableName, String reg) throws Exception {
        HTable hTable = new HTable(conf, tableName);
        Scan scan = new Scan();
        RowFilter rowFilter = new RowFilter(CompareOp.NOT_EQUAL, new RegexStringComparator(reg));
        scan.setFilter(rowFilter);
        ResultScanner scanner = hTable.getScanner(scan);
        for (Result result : scanner) {
            System.out.println(new String(result.getRow()));
        }
    }

    /**
     * Scans a single column.
     * @param tableName
     * @param family
     * @param qualifier
     * @throws Exception
     */
    public void getScanData(String tableName, String family, String qualifier) throws Exception {
        HTable hTable = new HTable(conf, tableName);
        Scan scan = new Scan();
        scan.addColumn(family.getBytes(), qualifier.getBytes());
        ResultScanner scanner = hTable.getScanner(scan);
        for (Result result : scanner) {
            if (result.raw().length == 0) {
                System.out.println(tableName + " table is empty!");
            } else {
                for (KeyValue kv : result.raw()) {
                    System.out.println(new String(kv.getKey()) + "\t" + new String(kv.getValue()));
                }
            }
        }
    }

    /**
     * Drops a table.
     * @param tableName
     */
    public void deleteTable(String tableName) {
        try {
            if (admin.tableExists(tableName)) {
                admin.disableTable(tableName);
                admin.deleteTable(tableName);
                System.out.println(tableName + " table dropped!");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Deletes one record.
     * @param tableName
     * @param rowKey
     */
    public void deleteOneRecord(String tableName, String rowKey) {
        HTablePool hTablePool = new HTablePool(conf, 1000);
        HTableInterface table = hTablePool.getTable(tableName);
        Delete delete = new Delete(rowKey.getBytes());
        try {
            table.delete(delete);
            System.out.println(rowKey + " record deleted!");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(rowKey + " record deletion failed!");
        }
    }

    /**
     * Prints all rows of a table.
     * @param tableName
     */
    public void getALLData(String tableName) {
        try {
            HTable hTable = new HTable(conf, tableName);
            Scan scan = new Scan();
            ResultScanner scanner = hTable.getScanner(scan);
            for (Result result : scanner) {
                if (result.raw().length == 0) {
                    System.out.println(tableName + " table is empty!");
                } else {
                    for (KeyValue kv : result.raw()) {
                        System.out.println(new String(kv.getKey()) + "\t" + new String(kv.getValue()));
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Puts one cell.
     * @param tableName
     * @param row
     * @param columnFamily
     * @param column
     * @param data
     * @throws IOException
     */
    public void put(String tableName, String row, String columnFamily, String column, String data) throws IOException {
        HTablePool hTablePool = new HTablePool(conf, 1000);
        HTableInterface table = hTablePool.getTable(tableName);
        Put p1 = new Put(Bytes.toBytes(row));
        p1.add(Bytes.toBytes(columnFamily), Bytes.toBytes(column), Bytes.toBytes(data));
        table.put(p1);
        System.out.println("put'" + row + "'," + columnFamily + ":" + column + "','" + data + "'");
    }

    /**
     * Lists all table names.
     * @return
     * @throws Exception
     */
    public List<String> getALLTable() throws Exception {
        ArrayList<String> tables = new ArrayList<String>();
        if (admin != null) {
            HTableDescriptor[] listTables = admin.listTables();
            if (listTables.length > 0) {
                for (HTableDescriptor tableDesc : listTables) {
                    tables.add(tableDesc.getNameAsString());
                    System.out.println(tableDesc.getNameAsString());
                }
            }
        }
        return tables;
    }

    /**
     * Creates a table.
     * @param tableName
     * @param column
     * @throws Exception
     */
    public void createTable(String tableName, String column) throws Exception {
        if (admin.tableExists(tableName)) {
            System.out.println(tableName + " table already exists!");
        } else {
            HTableDescriptor tableDesc = new HTableDescriptor(tableName);
            tableDesc.addFamily(new HColumnDescriptor(column.getBytes()));
            admin.createTable(tableDesc);
            System.out.println(tableName + " table created!");
        }
    }
}

--------------------------------------------------------------------------------