├── .gitignore
├── .settings
│   ├── org.eclipse.m2e.core.prefs
│   ├── org.eclipse.core.resources.prefs
│   └── org.eclipse.jdt.core.prefs
├── src
│   ├── main
│   │   ├── java
│   │   │   └── com
│   │   │       └── test
│   │   │           └── spider
│   │   │               ├── threadpool
│   │   │               │   ├── ThreadPool.java
│   │   │               │   └── FixedThreadPool.java
│   │   │               ├── store
│   │   │               │   ├── Storeable.java
│   │   │               │   ├── ConsoleStore.java
│   │   │               │   └── HbaseStore.java
│   │   │               ├── process
│   │   │               │   ├── Processable.java
│   │   │               │   └── JdProcess.java
│   │   │               ├── repository
│   │   │               │   ├── Repository.java
│   │   │               │   ├── RedisRepository.java
│   │   │               │   ├── QueueRepository.java
│   │   │               │   ├── RandomRedisRepository.java
│   │   │               │   └── RandomRepository.java
│   │   │               ├── download
│   │   │               │   ├── Downloadable.java
│   │   │               │   ├── HtmlCleanerDownload.java
│   │   │               │   └── HttpClientDownload.java
│   │   │               ├── utils
│   │   │               │   ├── SleepUtils.java
│   │   │               │   ├── Config.java
│   │   │               │   ├── DomainUtils.java
│   │   │               │   ├── HtmlUtils.java
│   │   │               │   ├── RedisUtils.java
│   │   │               │   ├── PageUtils.java
│   │   │               │   └── HbaseUtils.java
│   │   │               ├── UrlJob.java
│   │   │               ├── UrlManager.java
│   │   │               ├── domain
│   │   │               │   └── Page.java
│   │   │               ├── SpiderWatcher.java
│   │   │               └── Spider.java
│   │   └── resources
│   │       └── log4j.properties
│   └── test
│       └── java
│           └── com
│               └── test
│                   └── spider
│                       ├── YxTest.java
│                       ├── SpiderTest.java
│                       ├── CuratorTest.java
│                       ├── HttpProxyTest.java
│                       ├── JsTest.java
│                       └── LoginTest.java
├── .project
├── .classpath
└── pom.xml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.class

*.jar
target/*

--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/threadpool/ThreadPool.java:
--------------------------------------------------------------------------------
package com.test.spider.threadpool;

public interface ThreadPool {

    void execute(Runnable runnable);

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/store/Storeable.java:
--------------------------------------------------------------------------------
package com.test.spider.store;

import com.test.spider.domain.Page;

public interface Storeable {
    void store(Page page);
}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/process/Processable.java:
--------------------------------------------------------------------------------
package com.test.spider.process;

import com.test.spider.domain.Page;

public interface Processable {
    void process(Page page);
}

--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding//src/test/resources=UTF-8
encoding/<project>=UTF-8
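The small interfaces above (ThreadPool, Storeable, Processable) are the spider's plug-in points; Spider (later in this dump) accepts any implementation through setters. As a minimal sketch of a custom plug-in, a store that only prints the extracted title could look like this (TitleOnlyStore is a hypothetical name, not part of the project):

package com.test.spider.store;

import com.test.spider.domain.Page;

// Hypothetical example: a Storeable that only logs the title field.
public class TitleOnlyStore implements Storeable {

    @Override
    public void store(Page page) {
        // Page collects extracted fields in a String -> String map
        System.out.println(page.getUrl() + " -> " + page.getMap().get("title"));
    }

}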
--------------------------------------------------------------------------------
/src/main/java/com/test/spider/repository/Repository.java:
--------------------------------------------------------------------------------
package com.test.spider.repository;

public interface Repository {

    String poll();

    void add(String nexturl);

    void addHigh(String nexturl);

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/download/Downloadable.java:
--------------------------------------------------------------------------------
package com.test.spider.download;

import com.test.spider.domain.Page;


public interface Downloadable {
    /**
     * Download the given url
     * @param url
     * @return
     */
    Page download(String url);

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/utils/SleepUtils.java:
--------------------------------------------------------------------------------
package com.test.spider.utils;

public class SleepUtils {

    public static void sleep(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

}

--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootLogger=INFO,stdout

log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/download/HtmlCleanerDownload.java:
--------------------------------------------------------------------------------
package com.test.spider.download;

import com.test.spider.domain.Page;

public class HtmlCleanerDownload implements Downloadable {

    @Override
    public Page download(String url) {
        // Implement the actual download logic here.

        return null;
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/store/ConsoleStore.java:
--------------------------------------------------------------------------------
package com.test.spider.store;

import com.test.spider.domain.Page;

public class ConsoleStore implements Storeable {

    @Override
    public void store(Page page) {
        System.out.println(page.getUrl() + "---" + page.getMap().get("price"));
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/utils/Config.java:
--------------------------------------------------------------------------------
package com.test.spider.utils;

public class Config {

    /**
     * Note: these values should really be read from a database or a configuration file
     *
     */
    public static Integer nThread = 5;
    public static long millis_1000 = 1000;
    public static long millis_5000 = 5000;

}
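The comment in Config points out that these values should come from a database or a configuration file. A minimal sketch of the configuration-file variant, assuming a hypothetical config.properties on the classpath with keys spider.nThread, spider.shortSleep and spider.longSleep (PropertiesConfig and all key names are assumptions, not part of the project):

package com.test.spider.utils;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

// Hypothetical variant of Config that loads its values from config.properties.
public class PropertiesConfig {

    // defaults, used when the file or a key is missing
    public static Integer nThread = 5;
    public static long millis_1000 = 1000;
    public static long millis_5000 = 5000;

    static {
        Properties props = new Properties();
        InputStream in = PropertiesConfig.class.getClassLoader()
                .getResourceAsStream("config.properties"); // assumed file name
        if (in != null) {
            try {
                props.load(in);
                nThread = Integer.valueOf(props.getProperty("spider.nThread", "5"));
                millis_1000 = Long.parseLong(props.getProperty("spider.shortSleep", "1000"));
                millis_5000 = Long.parseLong(props.getProperty("spider.longSleep", "5000"));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}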
--------------------------------------------------------------------------------
/src/main/java/com/test/spider/download/HttpClientDownload.java:
--------------------------------------------------------------------------------
package com.test.spider.download;

import com.test.spider.domain.Page;
import com.test.spider.utils.PageUtils;

public class HttpClientDownload implements Downloadable {

    @Override
    public Page download(String url) {
        Page page = new Page();
        String content = PageUtils.getContent(url);
        page.setContent(content);
        page.setUrl(url);
        return page;
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/threadpool/FixedThreadPool.java:
--------------------------------------------------------------------------------
package com.test.spider.threadpool;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import com.test.spider.utils.Config;

public class FixedThreadPool implements ThreadPool {
    ExecutorService newFixedThreadPool = Executors.newFixedThreadPool(Config.nThread);

    @Override
    public void execute(Runnable runnable) {
        newFixedThreadPool.execute(runnable);
    }

}

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>spider</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.m2e.core.maven2Builder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.jdt.core.javanature</nature>
        <nature>org.eclipse.m2e.core.maven2Nature</nature>
    </natures>
</projectDescription>

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/UrlJob.java:
--------------------------------------------------------------------------------
package com.test.spider;

import java.util.List;

import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;

import com.test.spider.utils.RedisUtils;

public class UrlJob implements Job {

    RedisUtils redisUtils = new RedisUtils();

    @Override
    public void execute(JobExecutionContext context)
            throws JobExecutionException {
        // re-seed the high-priority queue from the start_url list
        List<String> list = redisUtils.lrange(RedisUtils.start_url, 0, -1);
        for (String url : list) {
            redisUtils.add(RedisUtils.heightkey, url);
        }

    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/repository/RedisRepository.java:
--------------------------------------------------------------------------------
package com.test.spider.repository;

import com.test.spider.utils.RedisUtils;

public class RedisRepository implements Repository {

    RedisUtils redisUtils = new RedisUtils();

    @Override
    public String poll() {
        String url = redisUtils.poll(RedisUtils.heightkey);
        if (url == null) {
            url = redisUtils.poll(RedisUtils.lowkey);
        }
        return url;
    }

    @Override
    public void add(String nexturl) {
        redisUtils.add(RedisUtils.lowkey, nexturl);
    }

    @Override
    public void addHigh(String nexturl) {
        redisUtils.add(RedisUtils.heightkey, nexturl);
    }

}
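RedisRepository implements the same two-tier priority idea as the in-memory QueueRepository later in this dump, but against shared Redis lists, so several crawler nodes can consume one queue. A small usage sketch (RedisRepositoryDemo is a hypothetical class; the Redis address is the one hard-coded in RedisUtils):

package com.test.spider.repository;

// Sketch: urls pushed by any node are visible to all nodes sharing the Redis instance.
public class RedisRepositoryDemo {
    public static void main(String[] args) {
        Repository repository = new RedisRepository();
        repository.addHigh("http://list.jd.com/list.html?cat=9987,653,655"); // list pages drain first
        repository.add("http://item.jd.com/1856581.html");                   // detail pages drain later

        // poll() tries spider.todo.height before falling back to spider.todo.low
        System.out.println(repository.poll());
    }
}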
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
org.eclipse.jdt.core.compiler.compliance=1.7
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.7

--------------------------------------------------------------------------------
/src/test/java/com/test/spider/YxTest.java:
--------------------------------------------------------------------------------
package com.test.spider;

import static org.junit.Assert.*;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Test;

import com.test.spider.utils.HtmlUtils;
import com.test.spider.utils.PageUtils;

public class YxTest {


    @Test
    public void test() throws Exception {
        String content = PageUtils.getContent("http://www.yixun.com/category.html?YTAG=1.100090000");
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode rootNode = htmlCleaner.clean(content);
        String value = HtmlUtils.getAttributeByName(rootNode, "href", "//*[@id=\"category\"]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/a[1]");
        System.out.println(value);

    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/repository/QueueRepository.java:
--------------------------------------------------------------------------------
package com.test.spider.repository;

import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * Priority queue: high-priority urls are polled before low-priority ones
 * @author Administrator
 *
 */
public class QueueRepository implements Repository {
    Queue<String> lowqueue = new ConcurrentLinkedQueue<String>();
    Queue<String> highqueue = new ConcurrentLinkedQueue<String>();

    @Override
    public String poll() {
        String url = highqueue.poll();
        if (url == null) {
            url = lowqueue.poll();
        }
        return url;
    }

    @Override
    public void add(String nexturl) {
        this.lowqueue.add(nexturl);
    }

    @Override
    public void addHigh(String nexturl) {
        this.highqueue.add(nexturl);
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/utils/DomainUtils.java:
--------------------------------------------------------------------------------
package com.test.spider.utils;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DomainUtils {

    /**
     * Get the top-level (registrable) domain of a url
     * @param url
     * @return
     */
    public static String getTopDomain(String url) {

        try {
            String host = new URL(url).getHost().toLowerCase(); // convert the host to lower case
            Pattern pattern = Pattern.compile("[^\\.]+(\\.com\\.cn|\\.net\\.cn|\\.org\\.cn|\\.gov\\.cn|\\.com|\\.net|\\.cn|\\.org|\\.cc|\\.me|\\.tel|\\.mobi|\\.asia|\\.biz|\\.info|\\.name|\\.tv|\\.hk|\\.公司|\\.中国|\\.网络)");
            Matcher matcher = pattern.matcher(host);
            while (matcher.find()) {
                return matcher.group();
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        return null;
    }
}
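DomainUtils supplies the key for the per-domain repositories later in this dump. A quick check of what getTopDomain returns (DomainUtilsDemo is a hypothetical class):

package com.test.spider.utils;

// Hypothetical quick check of DomainUtils.getTopDomain.
public class DomainUtilsDemo {
    public static void main(String[] args) {
        // prints "jd.com": the regex keeps only the registrable suffix of the host
        System.out.println(DomainUtils.getTopDomain("http://item.jd.com/1856581.html"));
        // prints null for a malformed url
        System.out.println(DomainUtils.getTopDomain("not a url"));
    }
}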
--------------------------------------------------------------------------------
/src/main/java/com/test/spider/UrlManager.java:
--------------------------------------------------------------------------------
package com.test.spider;

import org.quartz.CronTrigger;
import org.quartz.JobDetail;
import org.quartz.Scheduler;
import org.quartz.SchedulerException;
import org.quartz.Trigger;
import org.quartz.impl.StdSchedulerFactory;

public class UrlManager {

    public static void main(String[] args) {
        try {
            Scheduler defaultScheduler = StdSchedulerFactory.getDefaultScheduler();
            defaultScheduler.start();

            String simpleName = UrlJob.class.getSimpleName();
            JobDetail jobDetail = new JobDetail(simpleName, Scheduler.DEFAULT_GROUP, UrlJob.class);

            // fires every day at 01:00
            Trigger trigger = new CronTrigger(simpleName, Scheduler.DEFAULT_GROUP, "0 0 01 * * ? ");

            defaultScheduler.scheduleJob(jobDetail, trigger);

        } catch (Exception e) {
            e.printStackTrace();
        }

    }

}

--------------------------------------------------------------------------------
/src/test/java/com/test/spider/SpiderTest.java:
--------------------------------------------------------------------------------
package com.test.spider;

import java.util.List;

import org.junit.Test;

import com.test.spider.domain.Page;
import com.test.spider.download.HttpClientDownload;
import com.test.spider.process.JdProcess;
import com.test.spider.store.ConsoleStore;
import com.test.spider.store.HbaseStore;

public class SpiderTest {

    @Test
    public void test() throws Exception {
        Spider spider = new Spider();
        // spider.start();
        spider.setDownloadable(new HttpClientDownload());
        spider.setProcessable(new JdProcess());
        spider.setStoreable(new ConsoleStore()); // for testing: print results to the console

        // String url = "http://list.jd.com/list.html?cat=9987,653,655";
        String url = "http://item.jd.com/1856581.html";
        Page page = spider.download(url);
        spider.process(page);
        List<String> urlList = page.getUrlList();
        System.out.println(urlList.size());
        spider.store(page);

    }

}

--------------------------------------------------------------------------------
/src/test/java/com/test/spider/CuratorTest.java:
--------------------------------------------------------------------------------
package com.test.spider;

import static org.junit.Assert.*;

import java.net.InetAddress;

import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.ZooDefs.Ids;
import org.junit.Test;

public class CuratorTest {

    @Test
    public void test() throws Exception {
        String connectString = "192.168.1.170:2181";
        RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3);
        CuratorFramework client = CuratorFrameworkFactory.newClient(connectString, retryPolicy);
        client.start();
        InetAddress localHost = InetAddress.getLocalHost();
        String hostAddress = localHost.getHostAddress();

        client.create().creatingParentsIfNeeded().withMode(CreateMode.EPHEMERAL).withACL(Ids.OPEN_ACL_UNSAFE).forPath("/spider/" + hostAddress, "".getBytes());

    }

}
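CuratorTest registers the current host under /spider as an EPHEMERAL znode, so the entry disappears when the session dies; SpiderWatcher (later in this dump) relies on exactly that to detect dead crawler nodes. A sketch of listing the registered nodes against the same hard-coded ZooKeeper address (ListSpiderNodes is a hypothetical class):

package com.test.spider;

import java.util.List;

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;

// Hypothetical sketch: list the crawler nodes currently registered under /spider.
public class ListSpiderNodes {
    public static void main(String[] args) throws Exception {
        CuratorFramework client = CuratorFrameworkFactory.newClient(
                "192.168.1.170:2181", new ExponentialBackoffRetry(1000, 3));
        client.start();
        List<String> children = client.getChildren().forPath("/spider");
        for (String ip : children) {
            System.out.println("alive node: " + ip); // each ephemeral child is one live crawler
        }
        client.close();
    }
}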
--------------------------------------------------------------------------------
/src/main/java/com/test/spider/repository/RandomRedisRepository.java:
--------------------------------------------------------------------------------
package com.test.spider.repository;

import java.util.HashMap;
import java.util.Random;

import com.test.spider.utils.DomainUtils;
import com.test.spider.utils.RedisUtils;

public class RandomRedisRepository implements Repository {
    HashMap<String, String> hashMap = new HashMap<String, String>();
    Random random = new Random();
    RedisUtils redisUtils = new RedisUtils();

    @Override
    public String poll() {
        String[] keys = hashMap.keySet().toArray(new String[0]);
        if (keys.length == 0) {
            // no urls have been added yet
            return null;
        }
        int nextInt = random.nextInt(keys.length);
        String redis_key = hashMap.get(keys[nextInt]);
        return redisUtils.poll(redis_key);
    }

    @Override
    public void add(String nexturl) {
        String topdomain = DomainUtils.getTopDomain(nexturl);
        String rediskey = hashMap.get(topdomain);
        if (rediskey == null) {
            hashMap.put(topdomain, topdomain);
        }
        redisUtils.add(topdomain, nexturl);
    }

    @Override
    public void addHigh(String nexturl) {
        add(nexturl);
    }

}

--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/repository/RandomRepository.java:
--------------------------------------------------------------------------------
package com.test.spider.repository;

import java.util.HashMap;
import java.util.Queue;
import java.util.Random;
import java.util.concurrent.ConcurrentLinkedQueue;

import com.test.spider.utils.DomainUtils;

public class RandomRepository implements Repository {
    HashMap<String, Queue<String>> hashMap = new HashMap<String, Queue<String>>();
    //HashMap<String, String> hashMap = new HashMap<String, String>();
    Random random = new Random();

    @Override
    public String poll() {
        String[] keys = hashMap.keySet().toArray(new String[0]);
        if (keys.length == 0) {
            // no urls have been added yet
            return null;
        }
        int nextInt = random.nextInt(keys.length);
        Queue<String> queue = hashMap.get(keys[nextInt]);
        return queue.poll();
    }

    @Override
    public void add(String nexturl) {
        String topdomain = DomainUtils.getTopDomain(nexturl);
        Queue<String> queue = hashMap.get(topdomain);
        if (queue == null) {
            queue = new ConcurrentLinkedQueue<String>();
            hashMap.put(topdomain, queue);
        }
        queue.add(nexturl);
    }

    @Override
    public void addHigh(String nexturl) {
        add(nexturl);
    }

}
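Both Random*Repository variants shard urls by top-level domain and poll a random shard, which spreads requests across sites instead of hammering one host. A small sketch (RandomRepositoryDemo is a hypothetical class):

package com.test.spider.repository;

// Hypothetical demo: urls from different sites end up in different per-domain queues.
public class RandomRepositoryDemo {
    public static void main(String[] args) {
        Repository repository = new RandomRepository();
        repository.add("http://item.jd.com/1856581.html");
        repository.add("http://www.yixun.com/category.html");

        // each poll() picks a random domain queue, so consecutive polls
        // tend to alternate between jd.com and yixun.com
        System.out.println(repository.poll());
        System.out.println(repository.poll());
    }
}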
--------------------------------------------------------------------------------
/src/main/java/com/test/spider/store/HbaseStore.java:
--------------------------------------------------------------------------------
package com.test.spider.store;

import java.util.Map;

import com.test.spider.domain.Page;
import com.test.spider.utils.HbaseUtils;
import com.test.spider.utils.RedisUtils;

public class HbaseStore implements Storeable {

    HbaseUtils hbaseUtils = new HbaseUtils();
    RedisUtils redisUtils = new RedisUtils();

    @Override
    public void store(Page page) {
        String goodsid = page.getGoodsid();
        redisUtils.add("solr_index", goodsid);
        Map<String, String> map = page.getMap();
        try {
            hbaseUtils.put(HbaseUtils.TABLE_NAME, goodsid, HbaseUtils.COLUMNFAMILY_1, HbaseUtils.COLUMNFAMILY_1_DATA_URL, page.getUrl());
            hbaseUtils.put(HbaseUtils.TABLE_NAME, goodsid, HbaseUtils.COLUMNFAMILY_1, HbaseUtils.COLUMNFAMILY_1_PIC_URL, map.get("picurl"));
            hbaseUtils.put(HbaseUtils.TABLE_NAME, goodsid, HbaseUtils.COLUMNFAMILY_1, HbaseUtils.COLUMNFAMILY_1_PRICE, map.get("price"));
            hbaseUtils.put(HbaseUtils.TABLE_NAME, goodsid, HbaseUtils.COLUMNFAMILY_1, HbaseUtils.COLUMNFAMILY_1_TITLE, map.get("title"));
            hbaseUtils.put(HbaseUtils.TABLE_NAME, goodsid, HbaseUtils.COLUMNFAMILY_2, HbaseUtils.COLUMNFAMILY_2_PARAM, map.get("spec"));
        } catch (Exception e) {
            e.printStackTrace();
        }

    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/utils/HtmlUtils.java:
--------------------------------------------------------------------------------
package com.test.spider.utils;

import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

public class HtmlUtils {

    /**
     * Get the text of the tag matched by the xpath
     * @param rootNode
     * @param xpath
     * @return
     */
    public static String getText(TagNode rootNode, String xpath) {
        String value = "";
        Object[] evaluateXPath;
        try {
            evaluateXPath = rootNode.evaluateXPath(xpath);
            if (evaluateXPath.length > 0) {
                TagNode tagNode = (TagNode) evaluateXPath[0];
                value = tagNode.getText().toString();
            }
        } catch (XPatherException e) {
            e.printStackTrace();
        }
        return value;
    }

    /**
     * Get the value of the given attribute of the matched tag
     * @param rootNode
     * @param attr
     * @param xpath
     * @return
     */
    public static String getAttributeByName(TagNode rootNode, String attr, String xpath) {
        String value = "";
        Object[] evaluateXPath;
        try {
            evaluateXPath = rootNode.evaluateXPath(xpath);
            if (evaluateXPath.length > 0) {
                TagNode tagNode = (TagNode) evaluateXPath[0];
                value = tagNode.getAttributeByName(attr);
            }
        } catch (XPatherException e) {
            e.printStackTrace();
        }
        return value;
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/utils/RedisUtils.java:
--------------------------------------------------------------------------------
package com.test.spider.utils;


import java.util.List;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

public class RedisUtils {
    public static String start_url = "start_url";

    public static String heightkey = "spider.todo.height";
    public static String lowkey = "spider.todo.low";


    JedisPool jedisPool = null;

    public RedisUtils() {
        JedisPoolConfig poolConfig = new JedisPoolConfig();
        poolConfig.setMaxIdle(10);
        poolConfig.setMaxTotal(100);
        poolConfig.setMaxWaitMillis(10000);
        poolConfig.setTestOnBorrow(true);
        jedisPool = new JedisPool(poolConfig, "192.168.1.170", 6379);
    }

    public List<String> lrange(String key, int start, int end) {
        Jedis resource = jedisPool.getResource();
        List<String> list = resource.lrange(key, start, end);
        jedisPool.returnResourceObject(resource);
        return list;

    }

    public void add(String lowKey, String url) {
        Jedis resource = jedisPool.getResource();
        resource.lpush(lowKey, url);
        jedisPool.returnResourceObject(resource);
    }

    public String poll(String key) {
        Jedis resource = jedisPool.getResource();
        String result = resource.rpop(key);
        jedisPool.returnResourceObject(resource);
        return result;
    }

}
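RedisUtils returns the Jedis connection to the pool only on the success path; if a Redis call throws, the connection leaks. A hedged sketch of the safer shape for one of the methods, same pool and same calls, just wrapped in try/finally:

    // Sketch: poll() with the connection returned even on failure.
    public String poll(String key) {
        Jedis resource = jedisPool.getResource();
        try {
            return resource.rpop(key);
        } finally {
            // always hand the connection back, even if rpop threw
            jedisPool.returnResourceObject(resource);
        }
    }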
--------------------------------------------------------------------------------
/src/main/java/com/test/spider/domain/Page.java:
--------------------------------------------------------------------------------
package com.test.spider.domain;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class Page {

    /**
     * Temporarily stores the next-page url and the product urls found on the current page
     */
    private List<String> urlList = new ArrayList<String>();


    /**
     * Product ID
     */
    private String goodsid;

    /**
     * Stores the basic fields extracted from the page
     */
    private Map<String, String> map = new HashMap<String, String>();

    /**
     * Original url
     */
    private String url;

    /**
     * Raw page content
     */
    private String content;

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Map<String, String> getMap() {
        return map;
    }

    public void addField(String key, String value) {
        this.map.put(key, value);
    }

    public String getGoodsid() {
        return goodsid;
    }

    public void setGoodsid(String goodsid) {
        this.goodsid = goodsid;
    }

    public List<String> getUrlList() {
        return urlList;
    }

    public void addUrl(String url) {
        this.urlList.add(url);
    }

}
System.out.println("代理异常,代理IP为:"+ip+",代理端口为:"+port); 39 | } 40 | 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/test/spider/utils/PageUtils.java: -------------------------------------------------------------------------------- 1 | package com.test.spider.utils; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.http.HttpEntity; 6 | import org.apache.http.client.ClientProtocolException; 7 | import org.apache.http.client.methods.CloseableHttpResponse; 8 | import org.apache.http.client.methods.HttpGet; 9 | import org.apache.http.impl.client.CloseableHttpClient; 10 | import org.apache.http.impl.client.HttpClientBuilder; 11 | import org.apache.http.impl.client.HttpClients; 12 | import org.apache.http.util.EntityUtils; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | public class PageUtils { 17 | private static Logger logger = LoggerFactory.getLogger(PageUtils.class); 18 | /** 19 | * 获取页面内容 20 | * @param url 21 | * @return 22 | */ 23 | public static String getContent(String url){ 24 | String content = ""; 25 | HttpClientBuilder builder = HttpClients.custom(); 26 | builder.setUserAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36"); 27 | CloseableHttpClient client = builder.build(); 28 | 29 | HttpGet request = new HttpGet(url); 30 | try { 31 | long start_time = System.currentTimeMillis(); 32 | CloseableHttpResponse response = client.execute(request); 33 | HttpEntity entity = response.getEntity(); 34 | content = EntityUtils.toString(entity); 35 | logger.info("页面下载成功:{},消耗时间:{}",url,System.currentTimeMillis()-start_time); 36 | } catch (Exception e) { 37 | logger.error("页面下载失败:{}",url); 38 | e.printStackTrace(); 39 | } 40 | return content; 41 | 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/java/com/test/spider/JsTest.java: -------------------------------------------------------------------------------- 1 | package com.test.spider; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.FileInputStream; 7 | import java.io.InputStream; 8 | import java.io.InputStreamReader; 9 | import java.net.URL; 10 | 11 | import javax.script.Invocable; 12 | import javax.script.ScriptEngine; 13 | import javax.script.ScriptEngineManager; 14 | 15 | import org.junit.Test; 16 | 17 | public class JsTest { 18 | 19 | /** 20 | * 执行本地js中的函数 21 | * @throws Exception 22 | */ 23 | @Test 24 | public void test() throws Exception { 25 | ScriptEngineManager scriptEngineManager = new ScriptEngineManager(); 26 | ScriptEngine engine = scriptEngineManager.getEngineByExtension("js"); 27 | 28 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream("d:\\test.js"))); 29 | engine.eval(bufferedReader); 30 | 31 | Invocable invocable = (Invocable)engine; 32 | Object result = invocable.invokeFunction("getNum", "3"); 33 | System.out.println(result); 34 | } 35 | 36 | /** 37 | * 执行在线js中方法 38 | * @throws Exception 39 | */ 40 | @Test 41 | public void test1() throws Exception { 42 | ScriptEngineManager scriptEngineManager = new ScriptEngineManager(); 43 | ScriptEngine engine = scriptEngineManager.getEngineByExtension("js"); 44 | 45 | URL url = new URL("http://aaa.com/a.js"); 46 | InputStream inputStream = url.openStream(); 47 | 48 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); 
--------------------------------------------------------------------------------
/src/test/java/com/test/spider/JsTest.java:
--------------------------------------------------------------------------------
package com.test.spider;

import static org.junit.Assert.*;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;

import org.junit.Test;

public class JsTest {

    /**
     * Invoke a function defined in a local js file
     * @throws Exception
     */
    @Test
    public void test() throws Exception {
        ScriptEngineManager scriptEngineManager = new ScriptEngineManager();
        ScriptEngine engine = scriptEngineManager.getEngineByExtension("js");

        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream("d:\\test.js")));
        engine.eval(bufferedReader);

        Invocable invocable = (Invocable) engine;
        Object result = invocable.invokeFunction("getNum", "3");
        System.out.println(result);
    }

    /**
     * Invoke a function defined in an online js file
     * @throws Exception
     */
    @Test
    public void test1() throws Exception {
        ScriptEngineManager scriptEngineManager = new ScriptEngineManager();
        ScriptEngine engine = scriptEngineManager.getEngineByExtension("js");

        URL url = new URL("http://aaa.com/a.js");
        InputStream inputStream = url.openStream();

        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
        engine.eval(bufferedReader);

        Invocable invocable = (Invocable) engine;
        Object result = invocable.invokeFunction("getNum", "3");
        System.out.println(result);

    }

}

--------------------------------------------------------------------------------
/src/main/java/com/test/spider/SpiderWatcher.java:
--------------------------------------------------------------------------------
package com.test.spider;

import java.util.ArrayList;
import java.util.List;

import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;

import com.test.spider.utils.SleepUtils;

/**
 * Daemon process; must keep running
 * @author Administrator
 *
 */
public class SpiderWatcher implements Watcher {
    CuratorFramework client;
    List<String> children = new ArrayList<String>();

    public SpiderWatcher() {
        String connectString = "192.168.1.170:2181";
        RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3);
        client = CuratorFrameworkFactory.newClient(connectString, retryPolicy);
        client.start();

        try {
            children = client.getChildren().usingWatcher(this).forPath("/spider");
        } catch (Exception e) {
            e.printStackTrace();
        }

    }



    @Override
    public void process(WatchedEvent event) {
        try {
            List<String> newChildren = client.getChildren().usingWatcher(this).forPath("/spider");
            for (String ip : children) {
                if (!newChildren.contains(ip)) {
                    System.out.println("Node went offline, IP: " + ip);
                    // TODO notify the admin by email (e.g. JavaMail) or by SMS (a third-party service such as Yunpian)
                }
            }

            for (String ip : newChildren) {
                if (!children.contains(ip)) {
                    System.out.println("New node, IP: " + ip);
                }
            }
            // this line is very important: remember the new membership for the next comparison
            this.children = newChildren;

        } catch (Exception e) {
            e.printStackTrace();
        }
    }


    public static void main(String[] args) {
        SpiderWatcher spiderWatcher = new SpiderWatcher();
        spiderWatcher.run();
    }


    private void run() {
        while (true) {
            // sleep instead of busy-spinning; the watcher callbacks do the real work
            SleepUtils.sleep(1000);
        }
    }

}
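SpiderWatcher's run() only has to keep the JVM alive while the watcher callbacks fire. A CountDownLatch that is never counted down is an equally simple alternative that needs no periodic wake-ups; a hedged sketch (the keepAlive field is hypothetical, not part of the project):

    // Sketch: block the main thread forever without polling.
    private final java.util.concurrent.CountDownLatch keepAlive =
            new java.util.concurrent.CountDownLatch(1);

    private void run() throws InterruptedException {
        keepAlive.await(); // never released; the ZooKeeper watcher threads do the real work
    }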
--------------------------------------------------------------------------------
/src/test/java/com/test/spider/LoginTest.java:
--------------------------------------------------------------------------------
package com.test.spider;

import static org.junit.Assert.*;

import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

public class LoginTest {

    /**
     * Simulated login
     * @throws Exception
     */
    @Test
    public void test() throws Exception {
        HttpClientBuilder builder = HttpClients.custom();
        CloseableHttpClient client = builder.build();

        HttpPost httpPost = new HttpPost("http://svn.jundie.net/user/login"); // login request url
        List<NameValuePair> parameters = new ArrayList<NameValuePair>();
        parameters.add(new BasicNameValuePair("uid", "crxy"));        // login account
        parameters.add(new BasicNameValuePair("pwd", "www.crxy.cn")); // login password
        HttpEntity entity = new UrlEncodedFormEntity(parameters);
        httpPost.setEntity(entity);

        CloseableHttpResponse response = client.execute(httpPost);

        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode == 302) { // a 302 redirect here indicates a successful login
            Header[] headers = response.getHeaders("location");
            String redirectUrl = "";
            if (headers.length > 0) {
                redirectUrl = headers[0].getValue();
            }

            httpPost.setURI(new URI("http://svn.jundie.net" + redirectUrl));
            response = client.execute(httpPost);
            System.out.println(EntityUtils.toString(response.getEntity()));

        }

    }

}

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.test</groupId>
    <artifactId>spider</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>spider</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.4</version>
        </dependency>
        <dependency>
            <groupId>net.sourceforge.htmlcleaner</groupId>
            <artifactId>htmlcleaner</artifactId>
            <version>2.10</version>
        </dependency>
        <dependency>
            <groupId>org.json</groupId>
            <artifactId>json</artifactId>
            <version>20140107</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>0.98.8-hadoop2</version>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.7</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.7.0</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.10</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.10</version>
        </dependency>
        <dependency>
            <groupId>org.quartz-scheduler</groupId>
            <artifactId>quartz</artifactId>
            <version>1.8.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.curator</groupId>
            <artifactId>curator-framework</artifactId>
            <version>2.7.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <encoding>UTF-8</encoding>
                    <source>1.7</source>
                    <target>1.7</target>
                    <showWarnings>true</showWarnings>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>com.test.spider.Spider</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/src/main/java/com/test/spider/process/JdProcess.java:
--------------------------------------------------------------------------------
package com.test.spider.process;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.json.JSONArray;
import org.json.JSONObject;

import com.test.spider.domain.Page;
import com.test.spider.utils.HtmlUtils;
import com.test.spider.utils.PageUtils;

public class JdProcess implements Processable {

    @Override
    public void process(Page page) {
        String content = page.getContent();
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode rootNode = htmlCleaner.clean(content);
        if (page.getUrl().startsWith("http://list.jd.com/list.html")) {
            String nexturl = HtmlUtils.getAttributeByName(rootNode, "href", "//*[@id=\"J_topPage\"]/a[2]");
            nexturl = "http://list.jd.com" + nexturl.replace("&amp;", "&");
            page.addUrl(nexturl);

            try {
                Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"plist\"]/ul/li/div/div[1]/a");
                for (Object object : evaluateXPath) {
                    TagNode tagNode = (TagNode) object;
                    page.addUrl(tagNode.getAttributeByName("href"));
                }
            } catch (XPatherException e) {
                e.printStackTrace();
            }

        } else {
            parseProduct(page, rootNode);
        }
    }

    /**
     * Parse the product detail data
     * @param page
     * @param rootNode
     */
    public void parseProduct(Page page, TagNode rootNode) {
        try {
            // title
            String title = HtmlUtils.getText(rootNode, "//*[@id=\"name\"]/h1");
            page.addField("title", title);

            // image url
            String picurl = HtmlUtils.getAttributeByName(rootNode, "src", "//*[@id=\"spec-n1\"]/img");
            page.addField("picurl", picurl);


            // price
            /*evaluateXPath = rootNode.evaluateXPath("//*[@id=\"jd-price\"]");
            if (evaluateXPath.length > 0) {
                TagNode priceNode = (TagNode) evaluateXPath[0];
                System.out.println("Price: " + priceNode.getText());
            }*/
            String url = page.getUrl();
            Pattern compile = Pattern.compile("http://item.jd.com/([0-9]+).html");
            Matcher matcher = compile.matcher(url);
            String goodsId = "";
            if (matcher.find()) {
                goodsId = matcher.group(1);
            }
            page.setGoodsid("jd_" + goodsId);
            String priceJson = PageUtils.getContent("http://p.3.cn/prices/get?skuid=J_" + goodsId);
            JSONArray jsonArray = new JSONArray(priceJson);
            JSONObject object = (JSONObject) jsonArray.get(0);
            page.addField("price", object.getString("p"));

            // specifications
            Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"product-detail-2\"]/table/tbody/tr");
            JSONArray specjsonArray = new JSONArray();
            for (Object tagobject : evaluateXPath) {
                TagNode tagNode = (TagNode) tagobject;
                if (!"".equals(tagNode.getText().toString().trim())) {
                    Object[] thevaluateXPath = tagNode.evaluateXPath("/th");
                    JSONObject jsonObject = new JSONObject();
                    if (thevaluateXPath.length > 0) {
                        TagNode thtagnode = (TagNode) thevaluateXPath[0];
                        jsonObject.put("name", "");
                        jsonObject.put("value", thtagnode.getText().toString());
                    } else {
                        Object[] tdevaluateXPath = tagNode.evaluateXPath("/td");
                        TagNode tdtagnode1 = (TagNode) tdevaluateXPath[0];
                        TagNode tdtagnode2 = (TagNode) tdevaluateXPath[1];
                        jsonObject.put("name", tdtagnode1.getText().toString());
                        jsonObject.put("value", tdtagnode2.getText().toString());
                    }
                    specjsonArray.put(jsonObject);
                }
            }
            page.addField("spec", specjsonArray.toString());
        } catch (XPatherException e) {
            e.printStackTrace();
        }
    }

}
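JdProcess reads the price from a separate JSON endpoint (http://p.3.cn/prices/get?skuid=J_<id>) whose body, at the time, was a one-element array. A sketch of that parsing step in isolation; the exact response shape in the string below is an assumption, and PriceJsonDemo is a hypothetical class:

package com.test.spider.process;

import org.json.JSONArray;
import org.json.JSONObject;

// Hypothetical demo of the price-endpoint parsing used in parseProduct.
public class PriceJsonDemo {
    public static void main(String[] args) {
        // assumed response shape: [{"id":"J_1856581","p":"4399.00","m":"5999.00"}]
        String priceJson = "[{\"id\":\"J_1856581\",\"p\":\"4399.00\",\"m\":\"5999.00\"}]";
        JSONArray jsonArray = new JSONArray(priceJson);
        JSONObject object = (JSONObject) jsonArray.get(0);
        System.out.println(object.getString("p")); // prints 4399.00
    }
}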
--------------------------------------------------------------------------------
/src/main/java/com/test/spider/Spider.java:
--------------------------------------------------------------------------------
package com.test.spider;

import java.net.InetAddress;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.ZooDefs.Ids;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.test.spider.domain.Page;
import com.test.spider.download.Downloadable;
import com.test.spider.download.HttpClientDownload;
import com.test.spider.process.JdProcess;
import com.test.spider.process.Processable;
import com.test.spider.repository.QueueRepository;
import com.test.spider.repository.Repository;
import com.test.spider.store.ConsoleStore;
import com.test.spider.store.HbaseStore;
import com.test.spider.store.Storeable;
import com.test.spider.threadpool.FixedThreadPool;
import com.test.spider.threadpool.ThreadPool;
import com.test.spider.utils.Config;
import com.test.spider.utils.SleepUtils;

public class Spider {
    Logger logger = LoggerFactory.getLogger(getClass());

    public Spider() {
        String connectString = "192.168.1.170:2181";
        RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3);
        CuratorFramework client = CuratorFrameworkFactory.newClient(connectString, retryPolicy);
        client.start();
        try {
            // register this crawler node under /spider as an ephemeral znode,
            // so SpiderWatcher can detect when the node goes offline
            InetAddress localHost = InetAddress.getLocalHost();
            String hostAddress = localHost.getHostAddress();
            client.create().creatingParentsIfNeeded().withMode(CreateMode.EPHEMERAL).withACL(Ids.OPEN_ACL_UNSAFE).forPath("/spider/" + hostAddress, "".getBytes());
        } catch (Exception e) {
            e.printStackTrace();
        }

    }


    private Downloadable downloadable = new HttpClientDownload();

    private Processable processable;

    private Storeable storeable = new ConsoleStore();


    private Repository repository = new QueueRepository();

    private ThreadPool threadPool = new FixedThreadPool();

    public void start() {

        check();
        logger.info("Spider started crawling...");
        while (true) {
            final String url = repository.poll();
            if (StringUtils.isNotBlank(url)) {
                threadPool.execute(new Runnable() {
                    public void run() {
                        // download
                        Page page = Spider.this.download(url);
                        // parse
                        Spider.this.process(page);
                        List<String> urlList = page.getUrlList();
                        for (String nexturl : urlList) {
                            if (nexturl.startsWith("http://list.jd.com/list.html")) {
                                repository.addHigh(nexturl);
                            } else {
                                repository.add(nexturl);
                            }
                        }
                        // store
                        if (url.startsWith("http://item.jd.com/")) {
                            Spider.this.store(page);
                        }
                        System.out.println("Current thread ID: " + Thread.currentThread().getId());
                    }
                });
                SleepUtils.sleep(Config.millis_1000);
            } else {
                System.out.println("No url available, taking a short break.");
                SleepUtils.sleep(Config.millis_5000);
            }
        }
    }

    /**
     * Check the spider configuration
     */
    private void check() {
        if (processable == null) {
            String message = "No process (parser) implementation configured...";
            logger.error(message);
            throw new RuntimeException(message);
        }
        logger.info("==================================================");
        logger.info("downloadable implementation: {}", downloadable.getClass().getSimpleName());
        logger.info("processable implementation: {}", processable.getClass().getSimpleName());
        logger.info("storeable implementation: {}", storeable.getClass().getSimpleName());
        logger.info("repository implementation: {}", repository.getClass().getSimpleName());
        logger.info("threadPool implementation: {}", threadPool.getClass().getSimpleName());
        logger.info("==================================================");

    }
    /**
     * Download a page
     * @param url
     */
    public Page download(String url) {
        Page page = this.downloadable.download(url);
        return page;
    }

    /**
     * Parse the page content
     * @param page
     */
    public void process(Page page) {
        this.processable.process(page);
    }

    /**
     * Store the page content
     * @param page
     */
    public void store(Page page) {
        this.storeable.store(page);
    }

    public Downloadable getDownloadable() {
        return downloadable;
    }

    public void setDownloadable(Downloadable downloadable) {
        this.downloadable = downloadable;
    }

    public Processable getProcessable() {
        return processable;
    }

    public void setProcessable(Processable processable) {
        this.processable = processable;
    }

    public Storeable getStoreable() {
        return storeable;
    }

    public void setStoreable(Storeable storeable) {
        this.storeable = storeable;
    }

    public void setSeedUrl(String url) {
        this.repository.add(url);
    }

    public ThreadPool getThreadPool() {
        return threadPool;
    }

    public void setThreadPool(ThreadPool threadPool) {
        this.threadPool = threadPool;
    }

    public Repository getRepository() {
        return repository;
    }

    public void setRepository(Repository repository) {
        this.repository = repository;
    }

    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.setProcessable(new JdProcess());
        spider.setStoreable(new HbaseStore());
        String url = "http://list.jd.com/list.html?cat=9987,653,655";
        spider.setSeedUrl(url);
        spider.start();

    }

}
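Spider's main() wires the single-machine setup (in-memory QueueRepository by default). Swapping in RedisRepository is all it takes to run several nodes against a shared queue; a hedged sketch (DistributedSpiderMain is a hypothetical entry point, not part of the project):

package com.test.spider;

import com.test.spider.process.JdProcess;
import com.test.spider.repository.RedisRepository;
import com.test.spider.store.HbaseStore;

// Hypothetical distributed entry point: same pipeline, shared Redis queue.
public class DistributedSpiderMain {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.setProcessable(new JdProcess());
        spider.setStoreable(new HbaseStore());
        spider.setRepository(new RedisRepository()); // all nodes share the spider.todo.* lists
        // seed from one node only, or let UrlJob re-seed on its nightly schedule
        spider.setSeedUrl("http://list.jd.com/list.html?cat=9987,653,655");
        spider.start();
    }
}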
COLUMNFAMILY_1_PRICE = "price"; 42 | /** 43 | * 列簇2 商品规格 44 | */ 45 | public static final String COLUMNFAMILY_2 = "spec"; 46 | public static final String COLUMNFAMILY_2_PARAM = "param"; 47 | 48 | 49 | HBaseAdmin admin=null; 50 | Configuration conf=null; 51 | /** 52 | * 构造函数加载配置 53 | */ 54 | public HbaseUtils(){ 55 | conf = new Configuration(); 56 | conf.set("hbase.zookeeper.quorum", "192.168.1.176:2181"); 57 | conf.set("hbase.rootdir", "hdfs://192.168.1.176:9000/hbase"); 58 | try { 59 | admin = new HBaseAdmin(conf); 60 | } catch (IOException e) { 61 | e.printStackTrace(); 62 | } 63 | } 64 | public static void main(String[] args) throws Exception { 65 | HbaseUtils hbase = new HbaseUtils(); 66 | //创建一张表 67 | // hbase.createTable("stu","cf"); 68 | // //查询所有表名 69 | hbase.getALLTable(); 70 | // //往表中添加一条记录 71 | // hbase.addOneRecord("stu","key1","cf","name","zhangsan"); 72 | // hbase.addOneRecord("stu","key1","cf","age","24"); 73 | // //查询一条记录 74 | // hbase.getKey("stu","key1"); 75 | // //获取表的所有数据 76 | // hbase.getALLData("stu"); 77 | // //删除一条记录 78 | // hbase.deleteOneRecord("stu","key1"); 79 | // //删除表 80 | // hbase.deleteTable("stu"); 81 | //scan过滤器的使用 82 | // hbase.getScanData("stu","cf","age"); 83 | //rowFilter的使用 84 | //84138413_20130313145955 85 | } 86 | /** 87 | * rowFilter的使用 88 | * @param tableName 89 | * @param reg 90 | * @throws Exception 91 | */ 92 | public void getRowFilter(String tableName, String reg) throws Exception { 93 | HTable hTable = new HTable(conf, tableName); 94 | Scan scan = new Scan(); 95 | // Filter 96 | RowFilter rowFilter = new RowFilter(CompareOp.NOT_EQUAL, new RegexStringComparator(reg)); 97 | scan.setFilter(rowFilter); 98 | ResultScanner scanner = hTable.getScanner(scan); 99 | for (Result result : scanner) { 100 | System.out.println(new String(result.getRow())); 101 | } 102 | } 103 | 104 | public void getScanData(String tableName, String family, String qualifier) throws Exception { 105 | HTable hTable = new HTable(conf, tableName); 106 | Scan scan = new Scan(); 107 | scan.addColumn(family.getBytes(), qualifier.getBytes()); 108 | ResultScanner scanner = hTable.getScanner(scan); 109 | for (Result result : scanner) { 110 | if(result.raw().length==0){ 111 | System.out.println(tableName+" 表数据为空!"); 112 | }else{ 113 | for (KeyValue kv: result.raw()){ 114 | System.out.println(new String(kv.getKey())+"\t"+new String(kv.getValue())); 115 | } 116 | } 117 | } 118 | } 119 | private void deleteTable(String tableName) { 120 | try { 121 | if (admin.tableExists(tableName)) { 122 | admin.disableTable(tableName); 123 | admin.deleteTable(tableName); 124 | System.out.println(tableName+"表删除成功!"); 125 | } 126 | } catch (IOException e) { 127 | e.printStackTrace(); 128 | System.out.println(tableName+"表删除失败!"); 129 | } 130 | 131 | } 132 | /** 133 | * 删除一条记录 134 | * @param tableName 135 | * @param rowKey 136 | */ 137 | public void deleteOneRecord(String tableName, String rowKey) { 138 | HTablePool hTablePool = new HTablePool(conf, 1000); 139 | HTableInterface table = hTablePool.getTable(tableName); 140 | Delete delete = new Delete(rowKey.getBytes()); 141 | try { 142 | table.delete(delete); 143 | System.out.println(rowKey+"记录删除成功!"); 144 | } catch (IOException e) { 145 | e.printStackTrace(); 146 | System.out.println(rowKey+"记录删除失败!"); 147 | } 148 | } 149 | /** 150 | * 获取表的所有数据 151 | * @param tableName 152 | */ 153 | public void getALLData(String tableName) { 154 | try { 155 | HTable hTable = new HTable(conf, tableName); 156 | Scan scan = new Scan(); 157 | ResultScanner scanner = 
    /**
     * Get all rows of a table
     * @param tableName
     */
    public void getALLData(String tableName) {
        try {
            HTable hTable = new HTable(conf, tableName);
            Scan scan = new Scan();
            ResultScanner scanner = hTable.getScanner(scan);
            for (Result result : scanner) {
                if (result.raw().length == 0) {
                    System.out.println(tableName + " table is empty!");
                } else {
                    for (KeyValue kv : result.raw()) {
                        System.out.println(new String(kv.getKey()) + "\t" + new String(kv.getValue()));
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    // read one record
    /*@SuppressWarnings({ "deprecation", "resource" })
    public Article get(String tableName, String row) {
        HTablePool hTablePool = new HTablePool(conf, 1000);
        HTableInterface table = hTablePool.getTable(tableName);
        Get get = new Get(row.getBytes());
        Article article = null;
        try {

            Result result = table.get(get);
            KeyValue[] raw = result.raw();
            if (raw.length == 4) {
                article = new Article();
                article.setId(row);
                article.setTitle(new String(raw[3].getValue()));
                article.setAuthor(new String(raw[0].getValue()));
                article.setContent(new String(raw[1].getValue()));
                article.setDescribe(new String(raw[2].getValue()));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return article;
    }*/


    // add one record
    public void put(String tableName, String row, String columnFamily,
            String column, String data) throws IOException {
        HTablePool hTablePool = new HTablePool(conf, 1000);
        HTableInterface table = hTablePool.getTable(tableName);
        Put p1 = new Put(Bytes.toBytes(row));
        p1.add(Bytes.toBytes(columnFamily), Bytes.toBytes(column),
                Bytes.toBytes(data));
        table.put(p1);
        System.out.println("put '" + row + "','" + columnFamily + ":" + column
                + "','" + data + "'");
    }


    /**
     * List all table names
     * @return
     * @throws Exception
     */
    public List<String> getALLTable() throws Exception {
        ArrayList<String> tables = new ArrayList<String>();
        if (admin != null) {
            HTableDescriptor[] listTables = admin.listTables();
            if (listTables.length > 0) {
                for (HTableDescriptor tableDesc : listTables) {
                    tables.add(tableDesc.getNameAsString());
                    System.out.println(tableDesc.getNameAsString());
                }
            }
        }
        return tables;
    }

    /**
     * Create a table
     * @param tableName
     * @param column
     * @throws Exception
     */
    public void createTable(String tableName, String column) throws Exception {
        if (admin.tableExists(tableName)) {
            System.out.println(tableName + " table already exists!");
        } else {
            HTableDescriptor tableDesc = new HTableDescriptor(tableName);
            tableDesc.addFamily(new HColumnDescriptor(column.getBytes()));
            admin.createTable(tableDesc);
            System.out.println(tableName + " table created!");
        }
    }
}
--------------------------------------------------------------------------------
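A short usage sketch for HbaseUtils, mirroring the commented examples in its main(): create the spider table and write one field the way HbaseStore does (HbaseUtilsDemo is a hypothetical class; note that createTable only adds a single column family, so the spec family would have to be added separately before HbaseStore can write to it):

package com.test.spider.utils;

// Hypothetical usage of HbaseUtils, matching what HbaseStore expects.
public class HbaseUtilsDemo {
    public static void main(String[] args) throws Exception {
        HbaseUtils hbase = new HbaseUtils();
        hbase.createTable(HbaseUtils.TABLE_NAME, HbaseUtils.COLUMNFAMILY_1);
        hbase.put(HbaseUtils.TABLE_NAME, "jd_1856581",
                HbaseUtils.COLUMNFAMILY_1, HbaseUtils.COLUMNFAMILY_1_PRICE, "4399.00");
        hbase.getALLData(HbaseUtils.TABLE_NAME);
    }
}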