├── src ├── main │ ├── resources │ │ ├── redis-config.properties │ │ └── mysql-config.properties │ └── java │ │ ├── parse │ │ ├── GoodsPageUrl.java │ │ ├── TagsPage.java │ │ └── CommoditySearchPage.java │ │ ├── ipproxypool │ │ ├── operation │ │ │ ├── IPProxyPool.java │ │ │ └── IPProxyPoolThread.java │ │ ├── timer │ │ │ ├── MyTimer.java │ │ │ └── MyTimeJob.java │ │ ├── jobthread │ │ │ ├── IPProxyGrabThread.java │ │ │ └── CreateIPProxyPool.java │ │ ├── ipfilter │ │ │ └── IPFilter.java │ │ ├── ipmodel │ │ │ └── IPMessage.java │ │ └── grabutils │ │ │ ├── URLFecter.java │ │ │ └── MyHttpResponse.java │ │ ├── utilclass │ │ ├── SerializeUtil.java │ │ ├── BloomFilter.java │ │ └── MD5.java │ │ ├── urlbuild │ │ ├── GoodsUrl.java │ │ └── MainClassifyUrl.java │ │ ├── database │ │ ├── MySQLDB.java │ │ ├── RedisDB.java │ │ ├── MySQL.java │ │ └── MyRedis.java │ │ ├── mainmethod │ │ └── MainMethod.java │ │ ├── mythread │ │ ├── TagBasicPageURLsCacheThread.java │ │ ├── TagBasicPageCrawlerThread.java │ │ └── GoodsDetailsUrlThread.java │ │ └── httpbrower │ │ └── HttpRequest.java └── test │ └── java │ ├── MD5压缩算法 │ └── MD5.java │ └── ThreadLocalMisunderstand.java ├── .idea └── vcs.xml ├── README.md └── pom.xml /src/main/resources/redis-config.properties: -------------------------------------------------------------------------------- 1 | jedis.addr=127.0.0.1 2 | jedis.port=6379 3 | jedis.passwd=6204576387 -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/main/resources/mysql-config.properties: -------------------------------------------------------------------------------- 1 | jdbc.url=jdbc:mysql://localhost:3306/tao_bao?characterEncoding=utf-8&useSSL=true 2 | jdbc.username=root 3 | jdbc.password=6204576387 4 | 
jdbc.driverClassName=com.mysql.cj.jdbc.Driver -------------------------------------------------------------------------------- /src/main/java/parse/GoodsPageUrl.java: -------------------------------------------------------------------------------- 1 | package parse; 2 | 3 | /** 4 | * @Author: spider_hgyi 5 | * @Date: Created in 下午5:19 18-1-31. 6 | * @Modified By: 7 | * @Description: 所有基本商品页的url 8 | */ 9 | public class GoodsPageUrl { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/ipproxypool/operation/IPProxyPool.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.operation; 2 | 3 | /** 4 | * @Author: spider_hgyi 5 | * @Date: Created in 上午11:53 18-1-31. 6 | * @Modified By: 7 | * @Description: 执行IP代理池这个后台线程 8 | */ 9 | public class IPProxyPool { 10 | public static void startExecute(Object lock) { 11 | Thread ipProxyPool = new Thread(new IPProxyPoolThread(lock)); 12 | ipProxyPool.setName("ip-proxy-pool"); 13 | ipProxyPool.start(); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/ipproxypool/operation/IPProxyPoolThread.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.operation; 2 | 3 | import ipproxypool.timer.MyTimer; 4 | 5 | /** 6 | * @Author: spider_hgyi 7 | * @Date: Created in 上午11:43 18-1-31. 
8 | * @Modified By: 9 | * @Description: 创建执行IP代理池的后台线程 10 | */ 11 | public class IPProxyPoolThread implements Runnable { 12 | private final Object lock; 13 | 14 | public IPProxyPoolThread(Object lock) { 15 | this.lock = lock; 16 | } 17 | 18 | @Override 19 | public void run() { 20 | MyTimer.startIPProxyPool(lock); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/ipproxypool/timer/MyTimer.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.timer; 2 | 3 | import java.util.Calendar; 4 | import java.util.Date; 5 | import java.util.Timer; 6 | 7 | /** 8 | * Created by hg_yi on 17-8-11. 9 | * 10 | * @Description: 设定IP代理池的更新时间 11 | */ 12 | public class MyTimer { 13 | public static void startIPProxyPool(Object lock) { 14 | MyTimeJob job = new MyTimeJob(lock); 15 | Timer timer = new Timer(); 16 | 17 | Calendar calendar = Calendar.getInstance(); 18 | Date date = calendar.getTime(); 19 | 20 | // 设置定时任务,从现在开始,每24小时执行一次 21 | timer.schedule(job, date, 24*60*60*1000); 22 | } 23 | } -------------------------------------------------------------------------------- /src/main/java/ipproxypool/jobthread/IPProxyGrabThread.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.jobthread; 2 | 3 | import java.util.List; 4 | import java.util.Queue; 5 | import java.util.concurrent.locks.ReadWriteLock; 6 | 7 | /** 8 | * Created by hg_yi on 17-8-11. 
9 | * 10 | * @Description: 创建IP代理池抓取线程 11 | */ 12 | public class IPProxyGrabThread implements Runnable { 13 | // 所有线程共享任务队列 14 | private Queue urls; 15 | private CreateIPProxyPool createIpProxyPool; 16 | private Object taskLock; 17 | 18 | public IPProxyGrabThread(Queue urls, CreateIPProxyPool createIpProxyPool, Object taskLock) { 19 | this.urls = urls; 20 | this.createIpProxyPool = createIpProxyPool; 21 | this.taskLock = taskLock; 22 | } 23 | 24 | @Override 25 | public void run() { 26 | createIpProxyPool.saveIP(urls, taskLock); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/ipproxypool/ipfilter/IPFilter.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.ipfilter; 2 | 3 | import ipproxypool.ipmodel.IPMessage; 4 | 5 | import java.util.Iterator; 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | import java.util.Queue; 9 | 10 | /** 11 | * Created by hg_yi on 17-8-11. 
12 | * 13 | * @Description: 对xici代理网的ip进行质量筛选 14 | */ 15 | public class IPFilter { 16 | public static List Filter(List ipMessages1) { 17 | List newIPMessages = new LinkedList<>(); 18 | 19 | for (IPMessage ipMessage : ipMessages1) { 20 | String ipType = ipMessage.getIPType(); 21 | String ipSpeed = ipMessage.getIPSpeed(); 22 | 23 | ipSpeed = ipSpeed.substring(0, ipSpeed.indexOf('秒')); 24 | double Speed = Double.parseDouble(ipSpeed); 25 | 26 | if (ipType.equals("HTTPS") && Speed <= 3.0) { 27 | newIPMessages.add(ipMessage); 28 | } 29 | } 30 | 31 | return newIPMessages; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/utilclass/SerializeUtil.java: -------------------------------------------------------------------------------- 1 | package utilclass; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.ByteArrayOutputStream; 5 | import java.io.ObjectInputStream; 6 | import java.io.ObjectOutputStream; 7 | 8 | /** 9 | * Created by hg_yi on 17-8-11. 
10 | * 11 | * @Description: 将对象进行序列化与反序列化 12 | */ 13 | public class SerializeUtil { 14 | public static byte[] serialize(Object object) { 15 | ObjectOutputStream oos; 16 | ByteArrayOutputStream baos; 17 | 18 | try { 19 | // 序列化 20 | baos = new ByteArrayOutputStream(); 21 | oos = new ObjectOutputStream(baos); 22 | oos.writeObject(object); 23 | 24 | byte[] bytes = baos.toByteArray(); 25 | 26 | return bytes; 27 | } catch (Exception e) { 28 | e.printStackTrace(); 29 | } 30 | return null; 31 | } 32 | 33 | public static Object unserialize(byte[] bytes) { 34 | ByteArrayInputStream bais; 35 | ObjectInputStream ois; 36 | 37 | try { 38 | // 反序列化 39 | bais = new ByteArrayInputStream(bytes); 40 | ois = new ObjectInputStream(bais); 41 | 42 | return ois.readObject(); 43 | } catch (Exception e) { 44 | e.printStackTrace(); 45 | } 46 | 47 | return null; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/test/java/MD5压缩算法/MD5.java: -------------------------------------------------------------------------------- 1 | package MD5压缩算法; 2 | 3 | import java.security.MessageDigest; 4 | import java.security.NoSuchAlgorithmException; 5 | 6 | import static java.lang.System.out; 7 | 8 | /** 9 | * Created by hg_yi on 17-6-2. 
* Computes the MD5 digest of a byte array as a lowercase hex string.
*/

public class MD5 {
    // Lookup table mapping a 4-bit value to its hexadecimal character.
    private static final char[] HEX_DIGITS = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            'a', 'b', 'c', 'd', 'e', 'f'};

    /**
     * Returns the MD5 digest of {@code source} as 32 lowercase hexadecimal characters.
     *
     * @param source the bytes to hash
     * @return the hex-encoded digest
     * @throws IllegalStateException if the JVM has no MD5 implementation
     */
    public static String getMD5(byte[] source) {
        MessageDigest md;
        try {
            // The algorithm name must be the standard name "MD5". The previous value
            // "MD5压缩算法.MD5" is not an algorithm name, so the method always returned null.
            md = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("MD5 digest is not available", e);
        }

        // An MD5 digest is 128 bits, i.e. 16 bytes.
        byte[] digest = md.digest(source);

        // Each byte becomes two hex characters.
        char[] hex = new char[digest.length * 2];
        int k = 0;
        for (byte b : digest) {
            // High nibble first; the mask discards the sign-extended bits.
            hex[k++] = HEX_DIGITS[(b >>> 4) & 0xf];
            // Then the low nibble.
            hex[k++] = HEX_DIGITS[b & 0xf];
        }

        return new String(hex);
    }
}
49 | -------------------------------------------------------------------------------- /src/main/java/urlbuild/GoodsUrl.java: -------------------------------------------------------------------------------- 1 | package urlbuild; 2 | 3 | import database.MySQL; 4 | import mythread.GoodsDetailsUrlThread; 5 | import utilclass.BloomFilter; 6 | 7 | import java.util.ArrayList; 8 | import java.util.LinkedList; 9 | import java.util.List; 10 | import java.util.Queue; 11 | 12 | /** 13 | * Created by hg_yi on 17-5-28. 
14 | * 15 | * @Description: 得到商品详情页的url 16 | */ 17 | 18 | public class GoodsUrl { 19 | public static void getGoodsDetailsPageUrl(Object lock, Object tagBasicPageURLsCacheLock) { 20 | MySQL mySQL = new MySQL(); 21 | BloomFilter bloomFilter = new BloomFilter(); 22 | List threads = new ArrayList<>(); 23 | Queue tagBasicPageUrls = mySQL.getTagBasicPageUrlsFromTagsSearchUrl(); 24 | 25 | System.out.println("tagBasicPageUrls-size: " + tagBasicPageUrls.size()); 26 | 27 | // 创建20个线程,用于解析任务队列 28 | for (int i = 0; i < 20; i++) { 29 | Thread thread = new Thread(new GoodsDetailsUrlThread(lock, tagBasicPageURLsCacheLock, tagBasicPageUrls, 30 | bloomFilter)); 31 | thread.setName("thread-GoodsDetailsUrl-" + i); 32 | threads.add(thread); 33 | thread.start(); 34 | } 35 | 36 | for (Thread thread : threads) { 37 | try { 38 | thread.join(); 39 | } catch (InterruptedException e) { 40 | e.printStackTrace(); 41 | } 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /src/test/java/ThreadLocalMisunderstand.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @Author: spider_hgyi 3 | * @Date: Created in 下午6:05 18-1-31. 
* @Modified By:
* @Description: Demo showing that every thread gets its own ThreadLocal value.
*/
public class ThreadLocalMisunderstand {
    /** Plain mutable counter; each thread works on its own instance. */
    static class Index {
        private int num;

        public void increase() {
            num++;
        }

        public int getValue() {
            return num;
        }
    }

    // Thread-local Index; each thread lazily receives a fresh instance.
    public static final ThreadLocal<Index> local = new ThreadLocal<Index>() {
        @Override
        protected Index initialValue() {
            return new Index();
        }
    };

    /** Increments the current thread's Index 10000 times and prints the result. */
    static class Counter implements Runnable {
        @Override
        public void run() {
            // Fetch this thread's own Index and count on it.
            Index num = local.get();
            for (int i = 0; i < 10000; i++) {
                num.increase();
            }
            // Store the (same) instance back, as in the original demo.
            local.set(num);
            System.out.println(Thread.currentThread().getName() + " : " + local.get().getValue());
        }
    }

    /**
     * Starts five counter threads and waits for them to finish.
     *
     * @throws InterruptedException if interrupted while waiting for the workers
     */
    public static void main(String[] args) throws InterruptedException {
        Thread[] threads = new Thread[5];
        for (int i = 0; i < threads.length; i++) {
            threads[i] = new Thread(new Counter(), "CounterThread-[" + i + "]");
        }
        for (Thread thread : threads) {
            thread.start();
        }
        // Join so that main really waits for the workers, as its throws clause implies.
        for (Thread thread : threads) {
            thread.join();
        }
    }
}
-------------------------------------------------------------------------------- /src/main/java/parse/TagsPage.java: -------------------------------------------------------------------------------- 1 | package parse; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | 8 | import java.util.ArrayList; 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | import java.util.Queue; 12 | 13 | /** 14 | * Created by hg_yi on 17-5-23. 
15 | * 16 | * @Description: 负责从淘宝分类页面提取出所有的基本分类名,并构造相关URL 17 | */ 18 | 19 | public class TagsPage { 20 | // 提取所有分类商品的源链接 21 | public static Queue getTagURLs(String html) { 22 | Queue tagUrls = new LinkedList<>(); 23 | 24 | Document document = Jsoup.parse(html); 25 | Elements elements = document.select("div[class=" + 26 | "home-category-list J_Module]"); 27 | 28 | for(Element element : elements) { 29 | //在多个div标签中提取多个li标签 30 | Elements lis = element.select("ul[class=category-list]").first(). 31 | select("li"); 32 | 33 | for(Element li : lis) { 34 | //在多个li标签中提取多个a标签 35 | Elements as = li.select("div[class=category-items]").first(). 36 | select("a[class=category-name]"); 37 | 38 | //将a标签中的关于商品分类的关键字提取出来,并进行url的构造 39 | for(Element a : as) { 40 | String name = a.text().replaceAll(" ", ""); 41 | String url = "https://s.taobao.com/search?q=" + name + "&style=grid"; 42 | 43 | tagUrls.offer(url); 44 | } 45 | } 46 | } 47 | 48 | return tagUrls; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/utilclass/BloomFilter.java: -------------------------------------------------------------------------------- 1 | package utilclass; 2 | 3 | import java.util.BitSet; 4 | 5 | /** 6 | * @Author: spider_hgyi 7 | * @Date: Created in 下午6:38 18-2-5. 
8 | * @Modified By: 9 | * @Description: 布隆过滤器 10 | */ 11 | public class BloomFilter { 12 | // 设置布隆过滤器的容量为2的25次方,也就是此布隆过滤器大概最少可以处理百万级别的数据 13 | private static final int DEFAULT_SIZE = 2 << 24; 14 | // 产生随机数的种子,可产生6个不同的随机数产生器 15 | private static final int[] seeds = new int[]{7, 11, 13, 31, 37, 61}; 16 | // Java中的按位存储的思想,其算法的具体实现(布隆过滤器) 17 | private static BitSet bits = new BitSet(DEFAULT_SIZE); 18 | 19 | // 得到此 value 所产生的六个信息指纹 20 | public int[] getFingerprint(String value) { 21 | int result = 0; 22 | int[] fingerprints = new int[6]; 23 | for (int i = 0; i < seeds.length; i++) { 24 | for (int j = 0; j < value.length(); j++) { 25 | result = seeds[i] * result + value.charAt(j); 26 | } 27 | 28 | result = (DEFAULT_SIZE - 1) & result; 29 | fingerprints[i] = result; 30 | } 31 | 32 | return fingerprints; 33 | } 34 | 35 | // 判断url是否已经存在于布隆过滤器中 36 | public boolean isExist(int[] fingerprints) { 37 | boolean ret = true; 38 | 39 | for (int fingerprint : fingerprints) { 40 | // 只有六个标志位都为true,才能判断这个url在这个集合中(此处存在误判) 41 | ret = ret && bits.get(fingerprint); 42 | } 43 | 44 | return ret; 45 | } 46 | 47 | // 将url存储进布隆过滤器中 48 | public void saveFingerprints(int[] fingerprints) { 49 | for (int fingerprint : fingerprints) { 50 | bits.set(fingerprint); 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 多线程爬虫--抓取淘宝商品详情页URL 2 | 3 | 本项目是一个Java编写的多线程爬虫系统。此系统与我之前开发的[ip-proxy-pools-regularly][1]结合使用,共抓取了淘宝近3000个页面,从中解析到了近9万的商品详情页URL。 4 | 5 | 我并没有直接将这些商品详情页中最具价值的数据(商品信息)提取出来,因为这些富有价值的数据对于目前的我来说并不是特别具有吸引力。开发这个项目当初的本意也只是为了锻炼自己开发多线程应用程序的能力,并且真正的与反爬虫做对抗,最终我成功了~ 6 | 7 | 我会将抓取到的数据(近9万商品详情页URL)提供给大家,如果大家需要真正的商品信息,而你们又没有什么好的办法,那么就花半天时间阅读一下此项目的源码吧,最后只要在这个代码的框架上稍作修改,这个多线程爬虫系统将完全满足你们的需求。 8 | 9 | ## 环境需求 10 | 11 | > - JDK 1.8 12 | > - MySQL 13 | > - Redis 14 | > - IDEA 15 | > - Maven 16 | 17 | ## 实现架构 18 | 19 | 包名 | 功能 20 | ---|--- 21 | database | 
有关MySQL与Redis数据库的配置类及操作类 22 | httpbrower | 发送HTTP请求,接收Response相关类 23 | ipproxypool | IP代理池 24 | mainmethod | Main方法入口 25 | mythread | 项目相关线程类 26 | parse | 网页源码解析类 27 | urlbuild | URL创建类 28 | utilclass | 工具类 29 | 30 | 项目中的每个类我都写了Description,如果你在使用的过程中还有任何疑问,欢迎[与我联系](#联系我)。 31 | 32 | 关于项目实现的技术细节,请戳博客链接:[Java网络爬虫(十四)--多线程爬虫(抓取淘宝商品详情页URL)][2] 33 | 34 | ## 使用说明 35 | MySQL配置文件下载(附带数据): 36 | [![xiyoulinux.sql][3]][4] 37 | 38 | ## TODO 39 | 1. 项目中抓取带有页面参数的商品搜索页URL及商品详情页URL会产生死锁,分别是近2000与近4000数量的待抓取任务,然而每次都会剩余不到10个任务无法成功抓取,目前猜测有可能是死锁,也有可能是由HttpClient包引起的未知bug 40 | 2. 线程调度,任务分配,线程安全这三方面还需要不断优化与完善 41 | 3. 爬虫并不智能,考虑开发自动化智能爬虫 42 | 4. 考虑将此系统设计成一个爬虫框架,可让用户指定任务进行抓取 43 | 5. 可视化处理... ... 44 | 45 | ## 版本说明 46 | ![version 1.0][5] 47 | 48 | ## 联系我 49 | ``` 50 | dhengyi@outlook.com 51 | ``` 52 | 53 | 54 | [1]: https://github.com/championheng/ip-proxy-pools-regularly/tree/master/ip%E4%BB%A3%E7%90%86%E4%B8%8E%E5%AE%9A%E7%82%B9%E7%88%AC%E5%8F%96%28%E9%87%8D%E6%9E%84%29 55 | [2]: http://blog.csdn.net/championhengyi/article/details/79416748 56 | [3]: https://img.shields.io/badge/download-MySQL-brightgreen.svg 57 | [4]: https://pan.baidu.com/s/1Y02g6U1ZDGeZpY674bIskw 58 | [5]: https://img.shields.io/badge/version-1.0-blue.svg 59 | -------------------------------------------------------------------------------- /src/main/java/parse/CommoditySearchPage.java: -------------------------------------------------------------------------------- 1 | package parse; 2 | 3 | import utilclass.BloomFilter; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | 10 | /** 11 | * Created by hg_yi on 17-5-28. 
12 | * 13 | * @Description: 对商品搜索页面进行解析 14 | */ 15 | 16 | public class CommoditySearchPage { 17 | // 得到搜索商品所占的页面总数 18 | public static int getPagesCount(String html) { 19 | int pageCount = 0; 20 | // 使用正则表达式将本商品所占的页数解析出来 21 | Pattern pattern = Pattern.compile("\"totalPage\":[0-9]*?,"); 22 | Matcher matcher = pattern.matcher(html); 23 | 24 | if (matcher.find()) { 25 | // matcher.group()返回匹配到的子字符串 26 | String string = matcher.group(); 27 | int start = string.indexOf(":"); 28 | pageCount = Integer.parseInt(string.substring(start+1, string.length()-1)); 29 | } 30 | 31 | return pageCount; 32 | } 33 | 34 | // 拿到本商品搜索页面中部分商品的id 35 | public static List getGoodsId(String html) { 36 | List goodsDetailsIds = new ArrayList<>(); 37 | 38 | // 使用正则表达式将本页所有商品的id提取出来(JSON数据串) 39 | Pattern pattern = Pattern.compile("\"auctionNids\":\\[.*?\\]"); 40 | Matcher matcher = pattern.matcher(html); 41 | 42 | if (matcher.find()) { 43 | // matcher.group()返回匹配到的子字符串 44 | String string = matcher.group(); 45 | 46 | int start = string.indexOf('['); 47 | String idStr = string.substring(start+1, string.length()-1); 48 | 49 | for (String idStars : idStr.split(",")) { 50 | String singleId = idStars.substring(1, idStars.length()-1); 51 | goodsDetailsIds.add(singleId); 52 | } 53 | } 54 | 55 | return goodsDetailsIds; 56 | } 57 | } -------------------------------------------------------------------------------- /src/main/java/database/MySQLDB.java: -------------------------------------------------------------------------------- 1 | package database; 2 | 3 | import java.sql.*; 4 | import java.util.ResourceBundle; 5 | 6 | /** 7 | * @Author: spider_hgyi 8 | * @Date: Created in 下午5:48 18-2-1. 
9 | * @Modified By: 10 | * @Description: JDBC的资源准备 11 | */ 12 | public class MySQLDB { 13 | private static String URL; 14 | private static String USERNAME; 15 | private static String PASSWORD; 16 | 17 | // 加载配置文件 18 | private static ResourceBundle resourceBundle = ResourceBundle.getBundle("mysql-config"); 19 | 20 | // 静态代码块在加载类时只执行一次 21 | static { 22 | URL = resourceBundle.getString("jdbc.url"); 23 | USERNAME = resourceBundle.getString("jdbc.username"); 24 | PASSWORD = resourceBundle.getString("jdbc.password"); 25 | String driverClassName = resourceBundle.getString("jdbc.driverClassName"); 26 | 27 | try { 28 | Class.forName(driverClassName); 29 | } catch (ClassNotFoundException e) { 30 | e.printStackTrace(); 31 | } 32 | } 33 | 34 | // 得到数据库链接 35 | public static Connection getConnection() { 36 | Connection conn = null; 37 | try { 38 | conn = DriverManager.getConnection(URL, USERNAME, PASSWORD); 39 | } catch (SQLException e) { 40 | e.printStackTrace(); 41 | } 42 | 43 | return conn; 44 | } 45 | 46 | // 关闭数据库链接 47 | public static void closeConnection(ResultSet rs, Statement stat, Connection conn) { 48 | try { 49 | if (rs != null) { 50 | rs.close(); 51 | } 52 | if (stat != null) { 53 | stat.close(); 54 | } 55 | 56 | if (conn != null) { 57 | conn.close(); 58 | } 59 | } catch (SQLException e) { 60 | e.printStackTrace(); 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /src/main/java/mainmethod/MainMethod.java: -------------------------------------------------------------------------------- 1 | package mainmethod; 2 | 3 | import database.MySQL; 4 | import mythread.TagBasicPageURLsCacheThread; 5 | import urlbuild.GoodsUrl; 6 | import urlbuild.MainClassifyUrl; 7 | import ipproxypool.operation.IPProxyPool; 8 | 9 | import java.io.FileNotFoundException; 10 | import java.io.PrintStream; 11 | import java.util.Queue; 12 | 13 | /** 14 | * Created by hg_yi on 17-5-23. 
*
* @Description: Entry point of the multi-threaded Taobao crawler.
*/

public class MainMethod {
    /**
     * Starts the IP proxy pool thread and the cache thread, then runs the goods detail URL
     * crawl on the calling thread.
     */
    public static void main(String[] args) throws FileNotFoundException {
        // Redirect System.out to a file (disabled)
        // PrintStream ps = new PrintStream("/home/hg_yi/temp");
        // System.setOut(ps);

        // Lock object for the wait/notify handshake between the producer (ip-proxy-pool)
        // and the consumers (thread-tagBasicPageURL-i)
        Object lock = new Object();
        // Lock object for the cooperation between the tagBasicPageURLs-cache thread and the
        // thread-GoodsDetailsUrl-i threads
        Object tagBasicPageURLsCacheLock = new Object();

        // Start the ip-proxy-pool thread, which runs the IP proxy pool
        IPProxyPool.startExecute(lock);

        /**
         * Wait/notify is used: if the proxy pool holds no IP yet, the workers wait while
         * the pool produces IPs; once production is complete, all workers are notified to
         * continue.
         */

        // Fetch the source links of the basic Taobao categories (using the local IP) (disabled)
        // Queue tagBasicUrls = MainClassifyUrl.getMainClassifyUrls();
        // System.out.println("共有" + tagBasicUrls.size() + "大小的URL待抓取");

        // Fetch the category links that carry a page parameter and store them in MySQL
        // (using proxy IPs) (disabled)
        // Queue tagBasicPageURLs = MainClassifyUrl.getMainClassifyPageUrlsByProxy(tagBasicUrls, lock);

        // Start the tagBasicPageURLs-cache thread; per the original note, it marks tasks in
        // MySQL as true after every 100 successfully crawled tasks
        TagBasicPageURLsCacheThread.start(tagBasicPageURLsCacheLock);

        // Collect the goods detail page URLs, using the Bloom filter, and persist them to MySQL
        GoodsUrl.getGoodsDetailsPageUrl(lock, tagBasicPageURLsCacheLock);
    }
}
52 | -------------------------------------------------------------------------------- /src/main/java/urlbuild/MainClassifyUrl.java: -------------------------------------------------------------------------------- 1 | package urlbuild; 2 | 3 | import mythread.TagBasicPageCrawlerThread; 4 | import httpbrower.HttpRequest; 5 | import parse.TagsPage; 6 | 7 | import java.util.ArrayList; 8 | import java.util.LinkedList; 9 | import java.util.List; 10 | import java.util.Queue; 11 | 12 | /** 13 | * Created by hg_yi on 17-5-23. 
14 | * 15 | * @Description: 拿到淘宝商品最基本分类中的各个类别的URL 16 | */ 17 | 18 | public class MainClassifyUrl { 19 | // 拿到淘宝所有分类商品的源链接 20 | public static Queue getMainClassifyUrls() { 21 | String url = "https://www.taobao.com/tbhome/page/market-list"; 22 | 23 | // 得到淘宝分类页面的源代码 24 | String html = HttpRequest.getHtml(url); 25 | 26 | // 对得到的源代码进行解析,拿到每个分类页面的源链接 27 | return TagsPage.getTagURLs(html); 28 | } 29 | 30 | // 拿到淘宝所有分类商品带有页面参数的源链接(使用多线程) 31 | public static Queue getMainClassifyPageUrlsByProxy(Queue tagBasicUrls, Object lock) { 32 | // 带页面参数URL解析线程收集器 33 | List threads = new ArrayList<>(); 34 | // 保存所有带有分类商品页面参数源链接的任务队列 35 | Queue tagBasicPageURLs = new LinkedList<>(); 36 | // 创建一个有关任务队列的对象锁 37 | Object taskLock = new Object(); 38 | 39 | for (int i = 0; i < 30; i++) { 40 | Thread thread = new Thread(new TagBasicPageCrawlerThread(tagBasicUrls, lock, tagBasicPageURLs, taskLock)); 41 | thread.setName("thread-tagBasicPageURL-" + i); 42 | 43 | threads.add(thread); 44 | thread.start(); 45 | } 46 | 47 | for (Thread thread1 : threads) { 48 | try { 49 | thread1.join(); 50 | System.out.println("当前线程:" + thread1.getName() + ", 已完成抓取任务"); 51 | } catch (InterruptedException e) { 52 | e.printStackTrace(); 53 | } 54 | } 55 | 56 | return tagBasicPageURLs; 57 | } 58 | } -------------------------------------------------------------------------------- /src/main/java/mythread/TagBasicPageURLsCacheThread.java: -------------------------------------------------------------------------------- 1 | package mythread; 2 | 3 | import database.MyRedis; 4 | import database.MySQL; 5 | 6 | import java.util.List; 7 | 8 | import static java.lang.Thread.MAX_PRIORITY; 9 | 10 | /** 11 | * @Author: spider_hgyi 12 | * @Date: Created in 上午11:51 18-2-6. 
* @Modified By:
* @Description: Cache-handling thread; marks the URLs found in tag-basic-page-urls as done
* in the MySQL database.
*/
public class TagBasicPageURLsCacheThread implements Runnable {
    // Lock object that run() synchronizes on.
    private final Object tagBasicPageURLsCacheLock;

    public TagBasicPageURLsCacheThread(Object tagBasicPageURLsCacheLock) {
        this.tagBasicPageURLsCacheLock = tagBasicPageURLsCacheLock;
    }

    /**
     * Creates and starts a thread named {@code tagBasicPageURLs-cache} running this runnable.
     *
     * @param tagBasicPageURLsCacheLock lock object that {@link #run()} synchronizes on
     */
    public static void start(Object tagBasicPageURLsCacheLock) {
        Thread thread = new Thread(new TagBasicPageURLsCacheThread(tagBasicPageURLsCacheLock));
        thread.setName("tagBasicPageURLs-cache");
        thread.setPriority(MAX_PRIORITY); // Give this thread the highest priority; some inaccuracy is tolerated
        thread.start();
    }

    /**
     * Loops forever: while Redis reports the cache as ready, reads the cached URLs and flags
     * them in MySQL.
     */
    @Override
    public void run() {
        MyRedis myRedis = new MyRedis();
        MySQL mySQL = new MySQL();

        // NOTE(review): the outer loop never waits or sleeps; it re-acquires the lock and
        // polls Redis continuously, and the inner loop runs while holding the lock. Whether
        // other threads rely on this lock for wait/notify is not visible here — confirm.
        while (true) {
            synchronized (tagBasicPageURLsCacheLock) {
                while (myRedis.tagBasicPageURLsCacheIsOk()) {
                    System.out.println("当前线程:" + Thread.currentThread().getName() + ", " +
                            "准备开始将 tag-basic-page-urls-cache 中的url在MySQL中进行标记");

                    List tagBasicPageURLs = myRedis.getTagBasicPageURLsFromCache();
                    System.out.println("tagBasicPageURLs-size: " + tagBasicPageURLs.size());

                    // Set the flag of the matching URLs in MySQL to true
                    mySQL.setFlagFromTagsSearchUrl(tagBasicPageURLs);
                }
            }
        }
    }
}
51 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.HowYialways 8 | distribute_crawler 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 16 | 1.7 17 | 1.7 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | org.apache.httpcomponents 27 | httpclient 28 | 4.5.3 29 | 30 | 31 | 32 | 33 | redis.clients 34 | jedis 35 | 2.9.0 36 | 37 | 38 | 39 | 40 | org.jsoup 41 | jsoup 42 | 1.8.3 43 | 44 | 45 | 46 | 47 | mysql 48 | mysql-connector-java 49 | 6.0.6 50 | 51 | 52 | 53 | 
-------------------------------------------------------------------------------- /src/main/java/database/RedisDB.java: -------------------------------------------------------------------------------- 1 | package database; 2 | 3 | import redis.clients.jedis.Jedis; 4 | import redis.clients.jedis.JedisPoolConfig; 5 | 6 | import java.util.ResourceBundle; 7 | 
/**
 * Created by paranoid on 17-4-12.
 *
 * @Description: Jedis resource setup.
 */
public class RedisDB {
    private static final String addr;
    private static final int port;
    private static final String passwd;

    // Loads redis-config.properties from the classpath
    private static ResourceBundle rb = ResourceBundle.getBundle("redis-config");

    // Reads the connection settings once, when the class is loaded
    static {
        addr = rb.getString("jedis.addr");
        port = Integer.parseInt(rb.getString("jedis.port"));
        passwd = rb.getString("jedis.passwd");

        // NOTE(review): the JedisPoolConfig built below is a local variable that is never
        // passed to a pool, so none of these settings affect getJedis() — confirm intent.
        try {
            // Configure the Redis pool parameters first
            JedisPoolConfig config = new JedisPoolConfig();
            // Whether to block when the pool is exhausted (original note: false throws
            // immediately; default true blocks and throws after the wait times out)
            config.setBlockWhenExhausted(true);
            // Class name of the eviction policy
            config.setEvictionPolicyClassName("org.apache.commons.pool2." +
                    "impl.DefaultEvictionPolicy");
            // Whether to enable JMX management of the pool (original note: default true)
            config.setJmxEnabled(true);
            // Maximum number of idle Jedis instances in the pool (original note: default 8)
            config.setMaxIdle(60);
            // Maximum number of connections
            config.setMaxTotal(100);
            // Maximum time to wait when borrowing an instance; an exception is thrown afterwards
            config.setMaxWaitMillis(1000*10);
            // Whether to validate the connection (ping()) when borrowing an instance
            config.setTestOnBorrow(true);
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a new, authenticated Jedis connection to the configured host and port.
     *
     * @return a freshly created Jedis client (a new instance on every call)
     */
    public synchronized static Jedis getJedis() {
        // Connect to the configured Redis server
        Jedis jedis = new Jedis(addr, port);
        // Authenticate
        jedis.auth(passwd);

        return jedis;
    }

    /**
     * Closes the given Jedis client if it is not null.
     *
     * @param jedis client to close; may be null
     */
    public static void close(final Jedis jedis) {
        if (jedis != null) {
            jedis.close();
        }
    }
}
-------------------------------------------------------------------------------- /src/main/java/ipproxypool/ipmodel/IPMessage.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.ipmodel; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * Created by hg_yi on 17-8-11. 
*
* @Description: IPMessage JavaBean describing one proxy (address, port, type, speed) plus
* a use counter.
*/
public class IPMessage implements Serializable {
    private static final long serialVersionUID = 1L;
    private String IPAddress;
    private String IPPort;
    private String IPType;
    private String IPSpeed;
    // Use counter; per the original note, an IP that fails three times in a row is removed
    // from the proxy pool.
    private int useCount;

    /** Creates an empty message with a use count of 0. */
    public IPMessage() {
        this.useCount = 0;
    }

    /** Creates a fully populated message with a use count of 0. */
    public IPMessage(String IPAddress, String IPPort, String IPType, String IPSpeed) {
        this.useCount = 0;
        this.IPSpeed = IPSpeed;
        this.IPType = IPType;
        this.IPPort = IPPort;
        this.IPAddress = IPAddress;
    }

    public String getIPAddress() {
        return this.IPAddress;
    }

    public void setIPAddress(String IPAddress) {
        this.IPAddress = IPAddress;
    }

    public String getIPPort() {
        return this.IPPort;
    }

    public void setIPPort(String IPPort) {
        this.IPPort = IPPort;
    }

    public String getIPType() {
        return this.IPType;
    }

    public void setIPType(String IPType) {
        this.IPType = IPType;
    }

    public String getIPSpeed() {
        return this.IPSpeed;
    }

    public void setIPSpeed(String IPSpeed) {
        this.IPSpeed = IPSpeed;
    }

    public int getUseCount() {
        return this.useCount;
    }

    /** Increments the use counter by one (despite the setter-style name). */
    public void setUseCount() {
        this.useCount += 1;
    }

    /** Resets the use counter to 0. */
    public void initCount() {
        this.useCount = 0;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("IPMessage{");
        text.append("IPAddress='").append(IPAddress).append('\'');
        text.append(", IPPort='").append(IPPort).append('\'');
        text.append(", IPType='").append(IPType).append('\'');
        text.append(", IPSpeed='").append(IPSpeed).append('\'');
        text.append(", useCount='").append(useCount).append('\'');
        text.append('}');
        return text.toString();
    }
}
83 | -------------------------------------------------------------------------------- /src/main/java/utilclass/MD5.java: -------------------------------------------------------------------------------- 1 | package utilclass; 2 | 3 | import 
java.security.MessageDigest; 4 | import java.security.NoSuchAlgorithmException; 5 | import java.util.HashSet; 6 | import java.util.Set; 7 | 
/**
 * Created by hg_yi on 17-6-2.
 *
 * @Description: MD5 digest helpers for sets of URLs.
 */

public class MD5 {
    // Lookup table mapping a 4-bit value to its hexadecimal character.
    private static final char[] HEX_DIGITS = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            'a', 'b', 'c', 'd', 'e', 'f'};

    /**
     * Replaces every URL by the lowercase hex form of its MD5 digest.
     *
     * <p>URLs are encoded as UTF-8 before hashing, so the result no longer depends on the
     * platform default charset.
     *
     * @param preUrls URLs to hash; may be null
     * @return a new set with one 32-character digest per distinct URL
     * @throws IllegalStateException if the JVM has no MD5 implementation
     */
    public static Set<String> getMD5Urls(Set<String> preUrls) {
        Set<String> lastUrls = new HashSet<>();
        if (preUrls == null) {
            return lastUrls;
        }

        MessageDigest md;
        try {
            md = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("MD5 digest is not available", e);
        }

        for (String url : preUrls) {
            // digest(byte[]) hashes the input and resets the instance for the next URL.
            byte[] digest = md.digest(url.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            lastUrls.add(toHex(digest));
        }

        return lastUrls;
    }

    /**
     * XORs every byte of every string with {@code 't'} and returns the resulting strings.
     *
     * <p>This is a self-inverse byte scramble; it does not reverse an MD5 digest.
     *
     * @param urlsMD5 strings to transform
     * @return a new set with the transformed strings
     */
    public static Set<String> getUrls(Set<String> urlsMD5) {
        Set<String> urls = new HashSet<>();

        for (String url : urlsMD5) {
            byte[] bytes = url.getBytes(java.nio.charset.StandardCharsets.UTF_8);

            for (int i = 0; i < bytes.length; i++) {
                bytes[i] = (byte) (bytes[i] ^ 't');
            }

            urls.add(new String(bytes, java.nio.charset.StandardCharsets.UTF_8));
        }

        return urls;
    }

    /** Encodes bytes as lowercase hex, two characters per byte, high nibble first. */
    private static String toHex(byte[] bytes) {
        char[] hex = new char[bytes.length * 2];
        int k = 0;
        for (byte b : bytes) {
            // The mask discards the sign-extended bits of the shifted value.
            hex[k++] = HEX_DIGITS[(b >>> 4) & 0xf];
            hex[k++] = HEX_DIGITS[b & 0xf];
        }
        return new String(hex);
    }
}
-------------------------------------------------------------------------------- /src/main/java/ipproxypool/grabutils/URLFecter.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.grabutils; 2 | 3 | 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.nodes.Document; 6 | import org.jsoup.select.Elements; 7 | 
import ipproxypool.ipmodel.IPMessage; 8 | 9 | import java.util.List; 10 | import java.util.Queue; 11 | 12 | 13 | /** 14 | * Created by paranoid on 17-4-10. 15 | * 16 | * @Description: 对xici代理网的html源码进行解析,提取出其中的代理ip 17 | */ 18 | 19 | public class URLFecter { 20 | // 使用本机IP爬取xici代理网站的第一页 21 | public static void urlParse(List ipMessages) { 22 | String url = "http://www.xicidaili.com/nn/1"; 23 | String html = MyHttpResponse.getHtml(url); 24 | 25 | // 将html解析成DOM结构 26 | Document document = Jsoup.parse(html); 27 | 28 | // 提取所需要的数据 29 | Elements trs = document.select("table[id=ip_list]").select("tbody").select("tr"); 30 | getIPMessages(ipMessages, trs); 31 | } 32 | 33 | // 使用代理进行爬取 34 | public static boolean urlParse(String url, String ip, String port, 35 | List ipMessages1) { 36 | String html = MyHttpResponse.getHtml(url, ip, port); 37 | 38 | if(html != null) { 39 | Document document = Jsoup.parse(html); 40 | Elements trs = document.select("table[id=ip_list]").select("tbody").select("tr"); 41 | 42 | getIPMessages(ipMessages1, trs); 43 | 44 | return true; 45 | } 46 | 47 | return false; 48 | } 49 | 50 | public static void getIPMessages(List ipMessages, Elements trs) { 51 | for (int i = 1; i < trs.size(); i++) { 52 | IPMessage ipMessage = new IPMessage(); 53 | 54 | String ipAddress = trs.get(i).select("td").get(1).text(); 55 | String ipPort = trs.get(i).select("td").get(2).text(); 56 | String ipType = trs.get(i).select("td").get(5).text(); 57 | String ipSpeed = trs.get(i).select("td").get(6).select("div[class=bar]"). 
58 | attr("title"); 59 | 60 | ipMessage.setIPAddress(ipAddress); 61 | ipMessage.setIPPort(ipPort); 62 | ipMessage.setIPType(ipType); 63 | ipMessage.setIPSpeed(ipSpeed); 64 | 65 | ipMessages.add(ipMessage); 66 | } 67 | } 68 | } -------------------------------------------------------------------------------- /src/main/java/ipproxypool/jobthread/CreateIPProxyPool.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.jobthread; 2 | 3 | import ipproxypool.grabutils.URLFecter; 4 | import ipproxypool.ipfilter.IPFilter; 5 | import ipproxypool.ipmodel.IPMessage; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Queue; 10 | import java.util.concurrent.locks.ReadWriteLock; 11 | import java.util.concurrent.locks.ReentrantReadWriteLock; 12 | 13 | /** 14 | * Created by hg_yi on 17-8-11. 15 | * 16 | * @Description: 抓取xici代理网的分配线程 17 | * 抓取不同页面的xici代理网的html源码,就使用不同的代理IP,在对IP进行过滤之后进行合并 18 | */ 19 | public class CreateIPProxyPool { 20 | // 成员变量(非线程安全) 21 | private List ipMessages; 22 | // 创建供上述变量使用的读写锁 23 | private ReadWriteLock readWriteLock = new ReentrantReadWriteLock(); 24 | 25 | public CreateIPProxyPool(List ipMessages) { 26 | this.ipMessages = ipMessages; 27 | } 28 | 29 | public void saveIP(Queue urls, Object taskLock) { 30 | int rand = 0; 31 | readWriteLock.writeLock().lock(); 32 | String ipAddress = ipMessages.get(rand).getIPAddress(); 33 | String ipPort = ipMessages.get(rand).getIPPort(); 34 | readWriteLock.writeLock().unlock(); 35 | 36 | while (true) { 37 | /** 38 | * 随机挑选代理IP(本步骤由于其他线程有可能在位置确定之后对ipMessages数量进行 39 | * 增加,虽说不会改变已经选择的ip代理的位置,但合情合理还是在对共享变量进行读写的时候要保证 40 | * 其原子性,否则极易发生脏读) 41 | */ 42 | // 每个线程先将自己抓取下来的ip保存下来并进行过滤 43 | List ipMessages1 = new ArrayList<>(); 44 | String url; 45 | 46 | // 任务队列是共享变量,对其的读写必须进行正确的同步 47 | synchronized (taskLock) { 48 | if (urls.isEmpty()) { 49 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 发现任务队列已空"); 50 | break; 51 | } 52 | 
url = urls.poll(); 53 | } 54 | 55 | boolean success = URLFecter.urlParse(url, ipAddress, ipPort, ipMessages1); 56 | // 如果ip代理池里面的ip不能用,或本页抓取失败,则切换下一个IP对本页进行重新抓取 57 | if (!success) { 58 | // 当抓取失败的时候重新拿取代理ip 59 | readWriteLock.writeLock().lock(); 60 | rand = (int) (Math.random() * ipMessages.size()); 61 | ipAddress = ipMessages.get(rand).getIPAddress(); 62 | ipPort = ipMessages.get(rand).getIPPort(); 63 | readWriteLock.writeLock().unlock(); 64 | 65 | synchronized (taskLock) { 66 | urls.offer(url); 67 | } 68 | continue; 69 | } 70 | 71 | // 对ip重新进行过滤,只要速度在三秒以内的并且类型为HTTPS的 72 | ipMessages1 = IPFilter.Filter(ipMessages1); 73 | 74 | // 将质量合格的ip合并到共享变量ipMessages中,进行合并的时候保证原子性 75 | readWriteLock.writeLock().lock(); 76 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 已进入合并区, " + 77 | "待合并大小 ipMessages1:" + ipMessages1.size()); 78 | ipMessages.addAll(ipMessages1); 79 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 已成功合并, " + 80 | "合并后ipMessage大小:" + ipMessages.size()); 81 | readWriteLock.writeLock().unlock(); 82 | } 83 | } 84 | } -------------------------------------------------------------------------------- /src/main/java/ipproxypool/timer/MyTimeJob.java: -------------------------------------------------------------------------------- 1 | package ipproxypool.timer; 2 | 3 | import ipproxypool.grabutils.URLFecter; 4 | import ipproxypool.ipfilter.IPFilter; 5 | import ipproxypool.ipmodel.IPMessage; 6 | import ipproxypool.jobthread.CreateIPProxyPool; 7 | import ipproxypool.jobthread.IPProxyGrabThread; 8 | import database.MyRedis; 9 | 10 | import java.util.*; 11 | 12 | /** 13 | * Created by hg_yi on 17-8-11. 
14 | * 15 | * @Description: IP代理池的整体构建逻辑 16 | */ 17 | public class MyTimeJob extends TimerTask { 18 | // IP代理池线程是生产者,此锁用来实现等待/通知机制,实现生产者与消费者模型 19 | private final Object lock; 20 | 21 | MyTimeJob(Object lock) { 22 | this.lock = lock; 23 | } 24 | 25 | @Override 26 | public void run() { 27 | MyRedis myRedis = new MyRedis(); 28 | // 创建一个有关任务队列的读写锁 29 | Object taskLock = new Object(); 30 | 31 | // 如果IP代理池中没有ip信息,则IP代理池进行工作 32 | while (true) { 33 | while (myRedis.isEmpty()) { 34 | synchronized (lock) { 35 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 开始更新IP代理池"); 36 | // 存放爬取下来的ip信息 37 | List ipMessages = new LinkedList<>(); 38 | // 创建任务队列 39 | Queue urls = new LinkedList<>(); 40 | // 对创建的子线程进行收集 41 | List threads = new ArrayList<>(); 42 | 43 | // 首先使用本机ip爬取xici代理网第一页 44 | URLFecter.urlParse(ipMessages); 45 | // 对得到的IP进行筛选,将IP速度在三秒以内的并且类型是https的留下,其余删除 46 | ipMessages = IPFilter.Filter(ipMessages); 47 | 48 | for (IPMessage ipMessage : ipMessages) { 49 | System.out.println(ipMessage.toString()); 50 | } 51 | 52 | // 构造种子url(2000条ip) 53 | for (int i = 2; i <= 21; i++) { 54 | urls.offer("http://www.xicidaili.com/nn/" + i); 55 | } 56 | 57 | // 使用多线程对urls进行解析并过滤,拿到所有目标IP,将所有的IP存储进ipMessages这个共享变量中 58 | CreateIPProxyPool createIpProxyPool = new CreateIPProxyPool(ipMessages); 59 | for (int i = 0; i < 20; i++) { 60 | IPProxyGrabThread IPProxyGrabThread = new IPProxyGrabThread(urls, createIpProxyPool, taskLock); 61 | Thread thread = new Thread(IPProxyGrabThread); 62 | thread.setName("ip-proxy-pool-thread-" + i); 63 | threads.add(thread); 64 | thread.start(); 65 | } 66 | 67 | for (Thread thread : threads) { 68 | try { 69 | thread.join(); 70 | } catch (InterruptedException e) { 71 | e.printStackTrace(); 72 | } 73 | } 74 | 75 | // 将爬取下来的ip信息写进Redis数据库中(List集合) 76 | myRedis.setIPToList(ipMessages); 77 | 78 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", IP代理池已经更新完毕"); 79 | 80 | lock.notifyAll(); 81 | } 82 | } 83 | } 84 | } 85 | } 
-------------------------------------------------------------------------------- /src/main/java/database/MySQL.java: -------------------------------------------------------------------------------- 1 | package database; 2 | 3 | import java.sql.Connection; 4 | import java.sql.PreparedStatement; 5 | import java.sql.ResultSet; 6 | import java.sql.SQLException; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | import java.util.Queue; 10 | 11 | /** 12 | * @Author: spider_hgyi 13 | * @Date: Created in 下午7:37 18-2-1. 14 | * @Modified By: 15 | * @Description: 集成对MySQL数据库的操作 16 | */ 17 | public class MySQL { 18 | private Connection connection = MySQLDB.getConnection(); 19 | 20 | private static final String SAVE_TAG_BASIC_PAGE_URLS = "INSERT INTO tags_search_url(url) VALUES (?)"; 21 | 22 | private static final String GET_TAG_BASIC_PAGE_URLS = "SELECT url FROM tags_search_url"; 23 | 24 | private static final String SET_FLAG_FROM_TAGS_SEARCH_URL = "UPDATE tags_search_url SET flag = TRUE WHERE url = ?"; 25 | 26 | private static final String SAVE_GOODS_DETAILS_URLS = "INSERT INTO goods_details_url(url) VALUES (?)"; 27 | 28 | // 将抓取下来的带有页面参数的主分类商品URL存储进MySQL数据库中 29 | public void saveTagBasicPageUrlsToTagsSearchUrl(Queue tagBasicPageUrls) { 30 | PreparedStatement statement; 31 | 32 | for (String tagBasicPageUrl : tagBasicPageUrls) { 33 | try { 34 | statement = connection.prepareStatement(SAVE_TAG_BASIC_PAGE_URLS); 35 | 36 | statement.setString(1, tagBasicPageUrl); 37 | statement.execute(); 38 | } catch (SQLException e) { 39 | e.printStackTrace(); 40 | } 41 | } 42 | } 43 | 44 | // 将抓取下来的带有页面参数的主分类商品URL取出來 45 | public Queue getTagBasicPageUrlsFromTagsSearchUrl() { 46 | Queue tagBasicPageUrls = new LinkedList<>(); 47 | 48 | try { 49 | PreparedStatement statement = connection.prepareStatement(GET_TAG_BASIC_PAGE_URLS); 50 | ResultSet resultSet = statement.executeQuery(); 51 | 52 | while (resultSet.next()) { 53 | tagBasicPageUrls.offer(resultSet.getString(1)); 54 | } 55 | 
} catch (SQLException e) { 56 | e.printStackTrace(); 57 | } 58 | 59 | return tagBasicPageUrls; 60 | } 61 | 62 | // 将 tags_search_url 表中对应的url标志位置换为true 63 | public void setFlagFromTagsSearchUrl(List tagBasicPageUrls) { 64 | PreparedStatement statement; 65 | 66 | for (String tagBasicPageUrl : tagBasicPageUrls) { 67 | try { 68 | statement = connection.prepareStatement(SET_FLAG_FROM_TAGS_SEARCH_URL); 69 | statement.setString(1, tagBasicPageUrl); 70 | statement.execute(); 71 | } catch (SQLException e) { 72 | e.printStackTrace(); 73 | } 74 | } 75 | } 76 | 77 | // 将商品详情页的url放进 goods_details_url 78 | public void saveGoodsDetailsUrlsToGoodsDetailsUrl(List goodsDetailsUrls) { 79 | PreparedStatement statement; 80 | 81 | for (String goodsDetailsUrl : goodsDetailsUrls) { 82 | try { 83 | statement = connection.prepareStatement(SAVE_GOODS_DETAILS_URLS); 84 | statement.setString(1, goodsDetailsUrl); 85 | statement.execute(); 86 | } catch (SQLException e) { 87 | e.printStackTrace(); 88 | } 89 | } 90 | } 91 | } -------------------------------------------------------------------------------- /src/main/java/database/MyRedis.java: -------------------------------------------------------------------------------- 1 | package database; 2 | 3 | import ipproxypool.ipmodel.IPMessage; 4 | import utilclass.SerializeUtil; 5 | import redis.clients.jedis.Jedis; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.concurrent.locks.ReadWriteLock; 10 | import java.util.concurrent.locks.ReentrantReadWriteLock; 11 | 12 | /** 13 | * Created by hg_yi on 17-8-9. 
14 | * 15 | * @Description: 集成对Redis数据库的操作 16 | * 17 | * 争取将MyRedis设计成一个线程安全的类 18 | */ 19 | public class MyRedis { 20 | private final Jedis jedis = RedisDB.getJedis(); 21 | // 创建一个读写锁 22 | private static ReadWriteLock readWriteLock = new ReentrantReadWriteLock(); 23 | // 创建一个关于 tagBasicPageURLs-cache 的锁 24 | private static ReadWriteLock tagBasicPageURLsCacheReadWriteLock = new ReentrantReadWriteLock(); 25 | 26 | // 将单个ip信息保存在Redis列表中 27 | public void setIPToList(IPMessage ipMessage) { 28 | // 首先将ipMessage进行序列化 29 | byte[] bytes = SerializeUtil.serialize(ipMessage); 30 | 31 | readWriteLock.writeLock().lock(); 32 | jedis.rpush("ip-proxy-pool".getBytes(), bytes); 33 | readWriteLock.writeLock().unlock(); 34 | } 35 | 36 | // 将多个ip信息保存在Redis列表中 37 | public void setIPToList(List ipMessages) { 38 | for (IPMessage ipMessage : ipMessages) { 39 | // 首先将ipMessage进行序列化 40 | byte[] bytes = SerializeUtil.serialize(ipMessage); 41 | 42 | readWriteLock.writeLock().lock(); 43 | jedis.rpush("ip-proxy-pool".getBytes(), bytes); 44 | readWriteLock.writeLock().unlock(); 45 | } 46 | } 47 | 48 | // 将Redis中保存的对象进行反序列化 49 | public IPMessage getIPByList() { 50 | readWriteLock.writeLock().lock(); 51 | Object o = SerializeUtil.unserialize(jedis.lpop("ip-proxy-pool".getBytes())); 52 | readWriteLock.writeLock().unlock(); 53 | 54 | return (IPMessage) o; 55 | } 56 | 57 | // 判断IP代理池是否为空 58 | public boolean isEmpty() { 59 | readWriteLock.readLock().lock(); 60 | Long flag = jedis.llen("ip-proxy-pool".getBytes()); 61 | readWriteLock.readLock().unlock(); 62 | 63 | return flag <= 0; 64 | } 65 | 66 | // 将url存储到 tagBasicPageURLs-cache 中 67 | public void setTagBasicPageURLToCache(String tagBasicPageURL) { 68 | tagBasicPageURLsCacheReadWriteLock.writeLock().lock(); 69 | jedis.rpush("tag-basic-page-urls-cache", tagBasicPageURL); 70 | tagBasicPageURLsCacheReadWriteLock.writeLock().unlock(); 71 | } 72 | 73 | // 判断 tagBasicPageURLs-cache 中的url数量是否达到100条 74 | public boolean tagBasicPageURLsCacheIsOk() { 75 | 
tagBasicPageURLsCacheReadWriteLock.readLock().lock(); 76 | Long flag = jedis.llen("tag-basic-page-urls-cache"); 77 | tagBasicPageURLsCacheReadWriteLock.readLock().unlock(); 78 | 79 | return flag >= 100; 80 | } 81 | 82 | // 从 tagBasicPageURLs-cache 中将url取出 83 | public List getTagBasicPageURLsFromCache() { 84 | List tagBasicPageURLs = new ArrayList<>(); 85 | 86 | tagBasicPageURLsCacheReadWriteLock.writeLock().lock(); 87 | Long flag = jedis.llen("tag-basic-page-urls-cache"); 88 | 89 | for (int i = 0; i < flag; i++) { 90 | String o = jedis.lpop("tag-basic-page-urls-cache"); 91 | tagBasicPageURLs.add(o); 92 | } 93 | tagBasicPageURLsCacheReadWriteLock.writeLock().unlock(); 94 | 95 | return tagBasicPageURLs; 96 | } 97 | 98 | // 释放Redis资源 99 | public void close() { 100 | RedisDB.close(jedis); 101 | } 102 | } -------------------------------------------------------------------------------- /src/main/java/mythread/TagBasicPageCrawlerThread.java: -------------------------------------------------------------------------------- 1 | package mythread; 2 | 3 | import database.MySQL; 4 | import httpbrower.HttpRequest; 5 | import ipproxypool.ipmodel.IPMessage; 6 | import database.MyRedis; 7 | import parse.CommoditySearchPage; 8 | 9 | import java.util.LinkedList; 10 | import java.util.Queue; 11 | 12 | /** 13 | * @Author: spider_hgyi 14 | * @Date: Created in 下午1:01 18-2-1. 
15 | * @Modified By: 16 | * @Description: 得到带有分页参数的主分类搜索页面的URL 17 | */ 18 | public class TagBasicPageCrawlerThread implements Runnable { 19 | private final Object lock; // 有关生产者、消费者的锁 20 | private Queue tagBasicUrls; // 任务队列(共享变量) 21 | private final Object taskLock; // 有关任务队列的锁 22 | private Queue tagBasicPageUrls; // 存放所有线程抓取结果的共享队列(共享变量) 23 | 24 | public TagBasicPageCrawlerThread(Queue tagBasicUrls, Object lock, Queue tagBasicPageUrls, 25 | Object taskLock) { 26 | this.tagBasicUrls = tagBasicUrls; 27 | this.lock = lock; 28 | this.tagBasicPageUrls = tagBasicPageUrls; 29 | this.taskLock = taskLock; 30 | } 31 | 32 | @Override 33 | public void run() { 34 | MyRedis myRedis = new MyRedis(); 35 | MySQL mySQL = new MySQL(); 36 | 37 | String tagBasicUrl; 38 | IPMessage ipMessage = null; 39 | // 此flag用于--->如果ip可以进行抓取,则一直使用此ip,不在ip代理池中重新拿取新ip的逻辑判断 40 | boolean flag = true; 41 | 42 | // 每个URL用单独的代理IP进行分析 43 | while (true) { 44 | if (flag) { 45 | synchronized (lock) { 46 | while (myRedis.isEmpty()) { 47 | try { 48 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", " + 49 | "发现ip-proxy-pool已空, 开始进行等待... 
..."); 50 | lock.wait(); 51 | } catch (InterruptedException e) { 52 | e.printStackTrace(); 53 | } 54 | } 55 | 56 | ipMessage = myRedis.getIPByList(); 57 | } 58 | } 59 | 60 | if (ipMessage.getUseCount() >= 30) { 61 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 发现此ip:" + 62 | ipMessage.getIPAddress() + ":" + ipMessage.getIPPort() + ", 已经连续30次不能使用, 进行舍弃"); 63 | continue; 64 | } 65 | 66 | // 任务队列是一个共享变量,必须对其进行正确的同步 67 | synchronized (taskLock) { 68 | if (tagBasicUrls.isEmpty()) { 69 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 发现任务队列已空"); 70 | break; 71 | } 72 | 73 | tagBasicUrl = tagBasicUrls.poll(); 74 | } 75 | 76 | String html = HttpRequest.getHtmlByProxy(tagBasicUrl, ipMessage, lock); 77 | if (html != null) { 78 | int pageCount = CommoditySearchPage.getPagesCount(html); 79 | // 抓取的带页面参数的主分类搜索页面URL暂存器 80 | Queue tempUrlsStorage = new LinkedList<>(); 81 | tempUrlsStorage.offer(tagBasicUrl); 82 | if (pageCount >= 2) { 83 | tempUrlsStorage.offer(tagBasicUrl + "&s=44"); 84 | } 85 | 86 | // 将本次解析出来的页面,直接写入MySQL数据库 87 | synchronized (tagBasicPageUrls) { 88 | mySQL.saveTagBasicPageUrlsToTagsSearchUrl(tempUrlsStorage); 89 | } 90 | 91 | flag = false; 92 | } else { 93 | synchronized (taskLock) { 94 | tagBasicUrls.offer(tagBasicUrl); 95 | } 96 | flag = true; 97 | } 98 | } 99 | 100 | // 对此线程抓取下来的带有页面参数的URL进行合并 101 | // synchronized (tagBasicPageUrls) { 102 | // System.out.println("当前线程:" + Thread.currentThread().getName() + ", 已进入合并区, " + 103 | // "待合并大小 tempUrlStorage:" + tempUrlsStorage.size()); 104 | // tagBasicPageUrls.addAll(tempUrlsStorage); 105 | // System.out.println("当前线程:" + Thread.currentThread().getName() + ", 已成功合并, " + 106 | // "合并后tagBasicPageUrls大小:" + tagBasicPageUrls.size()); 107 | // } 108 | 109 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 已完成任务,将要退出"); 110 | } 111 | } -------------------------------------------------------------------------------- 
/src/main/java/ipproxypool/grabutils/MyHttpResponse.java:
--------------------------------------------------------------------------------
package ipproxypool.grabutils;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Created by hg_yi on 17-8-11.
 *
 * @Description: Fetches the HTML source of the xici proxy site
 */
public class MyHttpResponse {
    /**
     * Fetches {@code url} with the local IP.
     *
     * @param url page to fetch
     * @return the page body decoded as UTF-8, or null on a non-200 status or an I/O error
     */
    public static String getHtml(String url) {
        String entity = null;
        CloseableHttpResponse httpResponse = null;
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Connect timeout and socket (read) timeout, both in milliseconds.
        RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).
                setSocketTimeout(3000).build();
        HttpGet httpGet = newBrowserLikeGet(url, config);

        try {
            httpResponse = httpClient.execute(httpGet);

            int httpStatus = httpResponse.getStatusLine().getStatusCode();
            if (httpStatus == 200) {
                entity = EntityUtils.toString(httpResponse.getEntity(), "utf-8");
            } else {
                System.out.println("本机IP抓取xici代理网第一页IP返回状态码:" + httpStatus);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(httpResponse, httpClient);
        }

        return entity;
    }

    /**
     * Fetches {@code url} through the given proxy.
     *
     * @param url  page to fetch
     * @param ip   proxy host
     * @param port proxy port as decimal text
     * @return the page body decoded as UTF-8, or null on a non-200 status, an I/O error
     *         or a port that is not a number
     */
    public static String getHtml(String url, String ip, String port) {
        // The port text comes from scraped pages; treat an unparsable one as a failed fetch
        // instead of letting NumberFormatException escape.
        HttpHost proxy;
        try {
            proxy = new HttpHost(ip, Integer.parseInt(port));
        } catch (NumberFormatException e) {
            return null;
        }

        String entity = null;
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse httpResponse = null;

        RequestConfig config = RequestConfig.custom().setProxy(proxy).setConnectTimeout(1000).
                setSocketTimeout(1000).build();
        HttpGet httpGet = newBrowserLikeGet(url, config);

        try {
            httpResponse = httpClient.execute(httpGet);

            int httpStatus = httpResponse.getStatusLine().getStatusCode();
            if (httpStatus == 200) {
                entity = EntityUtils.toString(httpResponse.getEntity(), "utf-8");
                System.out.println("当前线程:" + Thread.currentThread().getName() + ", 使用的代理IP:" +
                        ip + ":" + port + ", 成功抓取xici代理网:" + url);
            } else {
                System.out.println("当前线程:" + Thread.currentThread().getName() + ", 使用的代理IP:" +
                        ip + ":" + port + ", 抓取xici代理网:" + url + ", 返回状态码:" + httpStatus);
            }
        } catch (IOException ignored) {
            // Deliberately silent: the failure is reported through the null return value.
            entity = null;
        } finally {
            close(httpResponse, httpClient);
        }

        return entity;
    }

    // Builds a GET request carrying the given config and the browser-like headers used for xici.
    private static HttpGet newBrowserLikeGet(String url, RequestConfig config) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(config);

        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;" +
                "q=0.9,image/webp,*/*;q=0.8");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate, sdch");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
        httpGet.setHeader("Cache-Control", "no-cache");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Host", "www.xicidaili.com");
        httpGet.setHeader("Pragma", "no-cache");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " +
                "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
        return httpGet;
    }

    // Closes the response (when present) and then the client, reporting any close failure.
    private static void close(CloseableHttpResponse httpResponse, CloseableHttpClient httpClient) {
        try {
            if (httpResponse != null) {
                httpResponse.close();
            }
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/mythread/GoodsDetailsUrlThread.java:
--------------------------------------------------------------------------------
package mythread;

import database.MyRedis;
import database.MySQL;
import
httpbrower.HttpRequest; 6 | import ipproxypool.ipmodel.IPMessage; 7 | import parse.CommoditySearchPage; 8 | import utilclass.BloomFilter; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Queue; 13 | import java.util.concurrent.locks.ReadWriteLock; 14 | import java.util.concurrent.locks.ReentrantReadWriteLock; 15 | 16 | /** 17 | * @Author: spider_hgyi 18 | * @Date: Created in 下午4:25 18-2-6. 19 | * @Modified By: 20 | * @Description: 负责解析带有页面参数的商品搜索页url,得到本页面中的商品id 21 | */ 22 | public class GoodsDetailsUrlThread implements Runnable { 23 | private final Object lock; // 用于与 ip-proxy-pool 进行协作的锁 24 | private final Object tagBasicPageURLsCacheLock; // 与 tagBasicPageURLs-cache 线程进行协作的锁 25 | private Queue tagBasicPageUrls; // 任务队列 26 | private BloomFilter bloomFilter; // 布隆过滤器 27 | // 有关布隆过滤器的读写锁 28 | private static ReadWriteLock bloomFilterReadWriteLock = new ReentrantReadWriteLock(); 29 | // 关于MySQL数据库的锁 30 | private static final Object mySQLLock = new Object(); 31 | 32 | public GoodsDetailsUrlThread(Object lock, Object tagBasicPageURLsCacheLock, Queue tagBasicPageUrls, 33 | BloomFilter bloomFilter) { 34 | this.lock = lock; 35 | this.tagBasicPageURLsCacheLock = tagBasicPageURLsCacheLock; 36 | this.tagBasicPageUrls = tagBasicPageUrls; 37 | this.bloomFilter = bloomFilter; 38 | } 39 | 40 | @Override 41 | public void run() { 42 | MyRedis myRedis = new MyRedis(); 43 | MySQL mySQL = new MySQL(); 44 | 45 | IPMessage ipMessage = null; 46 | String tagBasicPageUrl; 47 | boolean flag = true; 48 | 49 | while (true) { 50 | if (flag) { 51 | synchronized (lock) { 52 | while (myRedis.isEmpty()) { 53 | try { 54 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", " + 55 | "发现ip-proxy-pool已空, 开始进行等待... 
..."); 56 | lock.wait(); 57 | } catch (InterruptedException e) { 58 | e.printStackTrace(); 59 | } 60 | } 61 | 62 | ipMessage = myRedis.getIPByList(); 63 | } 64 | } 65 | 66 | if (ipMessage.getUseCount() >= 30) { 67 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 发现此ip:" + 68 | ipMessage.getIPAddress() + ":" + ipMessage.getIPPort() + ", 已经连续30次不能使用, 进行舍弃"); 69 | continue; 70 | } 71 | 72 | synchronized (tagBasicPageUrls) { 73 | if (!tagBasicPageUrls.isEmpty()) { 74 | tagBasicPageUrl = tagBasicPageUrls.poll(); 75 | } else { 76 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 发现任务队列已空"); 77 | break; 78 | } 79 | } 80 | 81 | String html = HttpRequest.getHtmlByProxy(tagBasicPageUrl, ipMessage, lock); 82 | if (html != null) { 83 | List goodsDetailsIds = CommoditySearchPage.getGoodsId(html); 84 | List goodsDetailsUrls = new ArrayList<>(); 85 | 86 | // 使用布隆过滤器对得到的商品id进行判重 87 | for (String goodsDetailsId : goodsDetailsIds) { 88 | bloomFilterReadWriteLock.readLock().lock(); 89 | int[] fingerprints = bloomFilter.getFingerprint(goodsDetailsId); 90 | boolean exist = bloomFilter.isExist(fingerprints); 91 | bloomFilterReadWriteLock.readLock().unlock(); 92 | 93 | if (!exist) { 94 | bloomFilterReadWriteLock.writeLock().lock(); 95 | bloomFilter.saveFingerprints(fingerprints); 96 | bloomFilterReadWriteLock.writeLock().unlock(); 97 | } else { 98 | continue; 99 | } 100 | 101 | String goodsDetailsUrl = "https://item.taobao.com/item.htm?id=" + goodsDetailsId; 102 | goodsDetailsUrls.add(goodsDetailsUrl); 103 | } 104 | 105 | // 将goodsDetailsUrls写进MySQL数据库 106 | synchronized (mySQLLock) { 107 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", " + 108 | "准备将goodsDetailsUrls写进MySQL数据库, goodsDetailsUrls-size:" + goodsDetailsUrls.size()); 109 | mySQL.saveGoodsDetailsUrlsToGoodsDetailsUrl(goodsDetailsUrls); 110 | } 111 | 112 | // 将tagBasicPageUrl写进Redis数据库 113 | synchronized (tagBasicPageURLsCacheLock) { 114 | System.out.println("当前线程:" + 
Thread.currentThread().getName() + ", " + 115 | "准备将tagBasicPageUrl写进Redis数据库, tagBasicPageUrl:" + tagBasicPageUrl); 116 | myRedis.setTagBasicPageURLToCache(tagBasicPageUrl); 117 | } 118 | 119 | flag = false; 120 | } else { 121 | synchronized (tagBasicPageUrls) { 122 | tagBasicPageUrls.offer(tagBasicPageUrl); 123 | } 124 | flag = true; 125 | } 126 | } 127 | } 128 | } -------------------------------------------------------------------------------- /src/main/java/httpbrower/HttpRequest.java: -------------------------------------------------------------------------------- 1 | package httpbrower; 2 | 3 | import ipproxypool.ipmodel.IPMessage; 4 | import database.MyRedis; 5 | import org.apache.http.HttpHost; 6 | import org.apache.http.client.config.RequestConfig; 7 | import org.apache.http.client.methods.CloseableHttpResponse; 8 | import org.apache.http.client.methods.HttpGet; 9 | import org.apache.http.impl.client.CloseableHttpClient; 10 | import org.apache.http.impl.client.HttpClients; 11 | import org.apache.http.util.EntityUtils; 12 | 13 | import java.io.IOException; 14 | 15 | /** 16 | * Created by hg_yi on 17-5-23. 17 | * 18 | * @Description: 对淘宝页面的请求,得到页面的源码 19 | * setConnectTimeout:设置连接超时时间,单位毫秒. 
20 | * setSocketTimeout:请求获取数据的超时时间,单位毫秒.如果访问一个接口, 21 | * 多少时间内无法返回数据,就直接放弃此次调用。 22 | */ 23 | public class HttpRequest { 24 | // 成功抓取淘宝页面计数器 25 | public static int pageCount = 0; 26 | 27 | // 请求淘宝商品分类页面,返回页面实体(使用本机IP) 28 | public static String getHtml(String requestUrl) { 29 | String html = null; 30 | 31 | // 创建客户端 32 | CloseableHttpClient closeableHttpClient = HttpClients.createDefault(); 33 | CloseableHttpResponse closeableHttpResponse = null; 34 | 35 | // 创建请求Get实例 36 | HttpGet httpGet = new HttpGet(requestUrl); 37 | 38 | // 设置头部信息进行浏览器模拟行为 39 | httpGet.setHeader("Accept", "text/html,application/xhtml+xml," + 40 | "application/xml;q=0.9,image/webp,*/*;q=0.8"); 41 | httpGet.setHeader("Accept-Encoding", "gzip, deflate, sdch, br"); 42 | httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8"); 43 | httpGet.setHeader("Connection", "keep-alive"); 44 | httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) " + 45 | "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 " + 46 | "Safari/537.36"); 47 | 48 | try { 49 | // 客户端执行httpGet方法,返回响应 50 | closeableHttpResponse = closeableHttpClient. 
51 | execute(httpGet); 52 | 53 | // 得到服务响应状态码 54 | if (closeableHttpResponse.getStatusLine().getStatusCode() == 200) { 55 | // 得到响应实体 56 | html = EntityUtils.toString(closeableHttpResponse.getEntity(), 57 | "utf-8"); 58 | } else { 59 | System.out.println(closeableHttpResponse.getStatusLine().getStatusCode()); 60 | } 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } finally { 64 | try { 65 | if (closeableHttpResponse != null) { 66 | closeableHttpResponse.close(); 67 | } 68 | closeableHttpClient.close(); 69 | } catch (IOException e) { 70 | e.printStackTrace(); 71 | } 72 | } 73 | return html; 74 | } 75 | 76 | // 使用代理IP进行网页的获取 77 | public static String getHtmlByProxy(String requestUrl, IPMessage ipMessage, Object lock) { 78 | MyRedis myRedis = new MyRedis(); 79 | String html = null; 80 | CloseableHttpResponse httpResponse = null; 81 | 82 | // 创建客户端 83 | CloseableHttpClient closeableHttpClient = HttpClients.createDefault(); 84 | 85 | // 设置代理访问和超时处理 86 | HttpHost proxy = new HttpHost(ipMessage.getIPAddress(), Integer.parseInt(ipMessage.getIPPort())); 87 | RequestConfig config = RequestConfig.custom().setProxy(proxy). 
88 | setConnectTimeout(1000).setSocketTimeout(1000).build(); 89 | 90 | // 创建请求Get实例 91 | HttpGet httpGet = new HttpGet(requestUrl); 92 | httpGet.setConfig(config); 93 | 94 | // 设置头部信息进行浏览器模拟行为 95 | httpGet.setHeader("Accept", "text/html,application/xhtml+xml," + 96 | "application/xml;q=0.9,image/webp,*/*;q=0.8"); 97 | httpGet.setHeader("Accept-Encoding", "gzip, deflate, sdch, br"); 98 | httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8"); 99 | httpGet.setHeader("Connection", "keep-alive"); 100 | httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) " + 101 | "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 " + 102 | "Safari/537.36"); 103 | 104 | try { 105 | // 客户端执行httpGet方法,返回响应 106 | httpResponse = closeableHttpClient.execute(httpGet); 107 | int statusCode = httpResponse.getStatusLine().getStatusCode(); 108 | 109 | // 得到服务响应状态码 110 | if (statusCode == 200) { 111 | // 得到响应实体 112 | html = EntityUtils.toString(httpResponse.getEntity(), "utf-8"); 113 | synchronized (HttpRequest.class) { 114 | pageCount++; 115 | } 116 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 使用的代理IP:" + 117 | ipMessage.getIPAddress() + ":" + ipMessage.getIPPort() + ", 成功抓取_淘宝_:" + requestUrl + 118 | ", 页面计数器:" + pageCount); 119 | } else { 120 | System.out.println("当前线程:" + Thread.currentThread().getName() + ", 使用的代理IP:" + 121 | ipMessage.getIPAddress() + ":" + ipMessage.getIPPort() + ", 抓取_淘宝_:" + requestUrl + 122 | ", 返回状态码:" + statusCode); 123 | } 124 | 125 | ipMessage.initCount(); 126 | } catch (IOException e) { 127 | html = null; 128 | ipMessage.setUseCount(); 129 | synchronized (lock) { 130 | myRedis.setIPToList(ipMessage); 131 | } 132 | } finally { 133 | try { 134 | if (httpResponse != null) { 135 | httpResponse.close(); 136 | } 137 | closeableHttpClient.close(); 138 | } catch (IOException e) { 139 | e.printStackTrace(); 140 | } 141 | } 142 | 143 | return html; 144 | } 145 | } 146 | 
--------------------------------------------------------------------------------