├── .gitignore ├── src ├── main │ ├── java │ │ └── com │ │ │ └── zyy │ │ │ ├── wechat │ │ │ └── mq │ │ │ │ └── spider │ │ │ │ ├── annotation │ │ │ │ └── ServiceLog.java │ │ │ │ ├── service │ │ │ │ ├── WechatMqService.java │ │ │ │ ├── SpiderQueueService.java │ │ │ │ └── ArticleService.java │ │ │ │ ├── dao │ │ │ │ ├── WechatMqRepository.java │ │ │ │ ├── ArticleRepository.java │ │ │ │ └── SpiderQueueRepository.java │ │ │ │ ├── config │ │ │ │ ├── SpiderConfiguration.java │ │ │ │ └── SpiderProperties.java │ │ │ │ ├── entity │ │ │ │ ├── SpiderConfig.java │ │ │ │ ├── SpiderQueue.java │ │ │ │ ├── WechatMq.java │ │ │ │ └── Article.java │ │ │ │ ├── task │ │ │ │ └── ImageDownloadTask.java │ │ │ │ ├── utils │ │ │ │ ├── StringUtils.java │ │ │ │ └── QiniuUtil.java │ │ │ │ ├── aspectj │ │ │ │ └── ServiceLogAspect.java │ │ │ │ └── controller │ │ │ │ └── SpiderController.java │ │ │ └── WechatMqSpiderApplication.java │ └── resources │ │ └── application.properties └── test │ └── java │ └── com │ └── zyy │ └── WechatMqSpiderApplicationTests.java ├── LICENSE ├── pom.xml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | 12 | ### IntelliJ IDEA ### 13 | .idea 14 | *.iws 15 | *.iml 16 | *.ipr 17 | 18 | ### NetBeans ### 19 | nbproject/private/ 20 | build/ 21 | nbbuild/ 22 | dist/ 23 | nbdist/ 24 | .nb-gradle/ 25 | 26 | .mvn 27 | mvnw 28 | *.cmd 29 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/annotation/ServiceLog.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.annotation; 2 | 3 | import java.lang.annotation.*; 4 | 5 | /** 6 | * Created by akinoneko on 2017/4/13. 
7 | */ 8 | @Target({ElementType.PARAMETER, ElementType.METHOD}) 9 | @Retention(RetentionPolicy.RUNTIME) 10 | @Documented 11 | public @interface ServiceLog { 12 | 13 | String description() default ""; 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/service/WechatMqService.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.service; 2 | 3 | import com.zyy.wechat.mq.spider.dao.WechatMqRepository; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | 8 | /** 9 | * Created by akinoneko on 2017/3/25. 10 | */ 11 | public class WechatMqService { 12 | 13 | private final static Logger LOGGER = LoggerFactory.getLogger(WechatMqService.class); 14 | @Autowired 15 | private WechatMqRepository wechatMqRepository; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/WechatMqSpiderApplication.java: -------------------------------------------------------------------------------- 1 | package com.zyy; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | import org.springframework.context.annotation.ComponentScan; 6 | import org.springframework.scheduling.annotation.EnableScheduling; 7 | 8 | @SpringBootApplication 9 | @ComponentScan 10 | @EnableScheduling //开启定时任务 11 | public class WechatMqSpiderApplication { 12 | 13 | public static void main(String[] args) { 14 | SpringApplication.run(WechatMqSpiderApplication.class, args); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | spring.datasource.url=jdbc:mysql://localhost:3306/wechat 2 | 
spring.datasource.username=root 3 | spring.datasource.password=kurimu 4 | spring.datasource.driver-class-name=com.mysql.jdbc.Driver 5 | spring.jpa.hibernate.ddl-auto=update 6 | spring.jpa.database=mysql 7 | spring.jpa.generate-ddl=true 8 | spring.jpa.hibernate.naming.strategy=org.hibernate.cfg.ImprovedNamingStrategy 9 | spring.jpa.show-sql=false 10 | logging.level.root=INFO 11 | logging.level.com.zyy=DEBUG 12 | #系统配置 13 | wechat.mq.spider.imgUrlDomain=ongpq066j.bkt.clouddn.com 14 | wechat.mq.spider.accessKey=GxVgt4Ifg1rZWuM543M-dhMGCs-caNtTkWA15yUF 15 | wechat.mq.spider.secretKey=sbcKlioM5kSzhVIXyLVyvGDPqLcX2HJJBX5oVoNk 16 | wechat.mq.spider.bucketName=wechat 17 | 18 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/dao/WechatMqRepository.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.dao; 2 | 3 | import com.zyy.wechat.mq.spider.entity.WechatMq; 4 | import org.springframework.data.domain.Page; 5 | import org.springframework.data.domain.Pageable; 6 | import org.springframework.data.jpa.repository.Query; 7 | import org.springframework.data.repository.CrudRepository; 8 | import org.springframework.data.repository.query.Param; 9 | 10 | /** 11 | * Created by akinoneko on 2017/3/25. 
12 | */ 13 | public interface WechatMqRepository extends CrudRepository { 14 | 15 | @Query("select w from WechatMq w where biz=:biz") 16 | public WechatMq findByBiz(@Param("biz") String biz); 17 | 18 | @Query("select w from WechatMq w") 19 | public Page findByPage(Pageable pageable); 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/dao/ArticleRepository.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.dao; 2 | 3 | import com.zyy.wechat.mq.spider.entity.Article; 4 | import org.springframework.data.jpa.repository.Query; 5 | import org.springframework.data.repository.CrudRepository; 6 | import org.springframework.data.repository.query.Param; 7 | 8 | /** 9 | * Created by akinoneko on 2017/3/25. 10 | */ 11 | public interface ArticleRepository extends CrudRepository { 12 | 13 | @Query("select a from Article a where contentUrl = :contentUrl") 14 | public Article findOneByContentUrl(@Param("contentUrl") String contentUrl); 15 | 16 | @Query("select a from Article a where biz=:biz and contentUrl like CONCAT('%',:sn,'%')") 17 | public Article findBySnAndBiz(@Param("sn")String sn,@Param("biz")String biz); 18 | } 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Yuyang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright 
notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/dao/SpiderQueueRepository.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.dao; 2 | 3 | import com.zyy.wechat.mq.spider.entity.SpiderQueue; 4 | import org.springframework.data.domain.Page; 5 | import org.springframework.data.domain.Pageable; 6 | import org.springframework.data.jpa.repository.Modifying; 7 | import org.springframework.data.jpa.repository.Query; 8 | import org.springframework.data.repository.CrudRepository; 9 | import org.springframework.data.repository.query.Param; 10 | 11 | /** 12 | * Created by akinoneko on 2017/3/25. 
13 | */ 14 | public interface SpiderQueueRepository extends CrudRepository { 15 | 16 | @Modifying 17 | @Query("delete from SpiderQueue where loading = 1") 18 | public void deleteLoading(); 19 | 20 | /** 21 | * 分页查询队列 22 | * 23 | * @param pageable 分页参数 24 | * @return 25 | */ 26 | @Query("select s from SpiderQueue s") 27 | public Page findByPage(Pageable pageable); 28 | 29 | @Query("select s from SpiderQueue s where contentUrl like CONCAT('%',:sn,'%')") 30 | public void deleteBySn(@Param("sn") String sn); 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/config/SpiderConfiguration.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.config; 2 | 3 | import com.zyy.wechat.mq.spider.entity.SpiderConfig; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.boot.context.properties.EnableConfigurationProperties; 6 | import org.springframework.context.annotation.Bean; 7 | import org.springframework.context.annotation.Configuration; 8 | 9 | /** 10 | * Created by akinoneko on 2017/3/28. 
11 | */ 12 | @Configuration 13 | @EnableConfigurationProperties(SpiderProperties.class) 14 | public class SpiderConfiguration { 15 | 16 | @Autowired 17 | private SpiderProperties spiderProperties; 18 | 19 | @Bean 20 | public SpiderConfig spiderConfig() { 21 | SpiderConfig spiderConfig = new SpiderConfig(); 22 | spiderConfig.setImgUrlDomain(spiderProperties.getImgUrlDomain()); 23 | spiderConfig.setAccessKey(spiderProperties.getAccessKey()); 24 | spiderConfig.setSecretKey(spiderProperties.getSecretKey()); 25 | spiderConfig.setBucketName(spiderProperties.getBucketName()); 26 | return spiderConfig; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/entity/SpiderConfig.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.entity; 2 | 3 | /** 4 | * Created by akinoneko on 2017/3/28. 5 | */ 6 | public class SpiderConfig { 7 | 8 | private String imgUrlDomain; 9 | 10 | private String accessKey; 11 | 12 | private String secretKey; 13 | 14 | private String bucketName; 15 | 16 | public String getImgUrlDomain() { 17 | return imgUrlDomain; 18 | } 19 | 20 | public void setImgUrlDomain(String imgUrlDomain) { 21 | this.imgUrlDomain = imgUrlDomain; 22 | } 23 | 24 | public String getAccessKey() { 25 | return accessKey; 26 | } 27 | 28 | public void setAccessKey(String accessKey) { 29 | this.accessKey = accessKey; 30 | } 31 | 32 | public String getSecretKey() { 33 | return secretKey; 34 | } 35 | 36 | public void setSecretKey(String secretKey) { 37 | this.secretKey = secretKey; 38 | } 39 | 40 | public String getBucketName() { 41 | return bucketName; 42 | } 43 | 44 | public void setBucketName(String bucketName) { 45 | this.bucketName = bucketName; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/test/java/com/zyy/WechatMqSpiderApplicationTests.java: 
-------------------------------------------------------------------------------- 1 | package com.zyy; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.zyy.wechat.mq.spider.dao.ArticleRepository; 5 | import com.zyy.wechat.mq.spider.entity.Article; 6 | import com.zyy.wechat.mq.spider.service.ArticleService; 7 | import com.zyy.wechat.mq.spider.utils.QiniuUtil; 8 | import org.jsoup.Jsoup; 9 | import org.jsoup.nodes.Document; 10 | import org.jsoup.nodes.Element; 11 | import org.junit.Test; 12 | import org.junit.runner.RunWith; 13 | import org.springframework.beans.factory.annotation.Autowired; 14 | import org.springframework.boot.test.context.SpringBootTest; 15 | import org.springframework.test.context.junit4.SpringRunner; 16 | 17 | import java.io.IOException; 18 | import java.util.List; 19 | 20 | @RunWith(SpringRunner.class) 21 | @SpringBootTest 22 | public class WechatMqSpiderApplicationTests { 23 | 24 | @Autowired 25 | private ArticleService articleService; 26 | @Autowired 27 | private ArticleRepository articleRepository; 28 | 29 | @Test 30 | public void contextLoads() { 31 | Article article = new Article(); 32 | article.setContent("123123"); 33 | article.setBiz("4321"); 34 | article.setFieldId(12321412L); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/entity/SpiderQueue.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.entity; 2 | 3 | import javax.persistence.*; 4 | 5 | /** 6 | * Created by akinoneko on 2017/3/25. 
7 | * 采集队列 8 | */ 9 | @Entity 10 | @Table(name = "spider_queue") 11 | public class SpiderQueue { 12 | 13 | @Id 14 | @GeneratedValue(strategy = GenerationType.AUTO) 15 | private Integer id; 16 | 17 | //文章地址 18 | private String contentUrl; 19 | 20 | //读取中标志 21 | private Integer loading = 0; 22 | 23 | private Long datetime; 24 | 25 | public Integer getId() { 26 | return id; 27 | } 28 | 29 | public void setId(Integer id) { 30 | this.id = id; 31 | } 32 | 33 | public String getContentUrl() { 34 | return contentUrl; 35 | } 36 | 37 | public void setContentUrl(String contentUrl) { 38 | this.contentUrl = contentUrl; 39 | } 40 | 41 | public Integer getLoading() { 42 | return loading; 43 | } 44 | 45 | public void setLoading(Integer loading) { 46 | this.loading = loading; 47 | } 48 | 49 | public Long getDatetime() { 50 | return datetime; 51 | } 52 | 53 | public void setDatetime(Long datetime) { 54 | this.datetime = datetime; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/config/SpiderProperties.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.config; 2 | 3 | import org.springframework.boot.context.properties.ConfigurationProperties; 4 | 5 | /** 6 | * Created by akinoneko on 2017/3/27. 
7 | */ 8 | @ConfigurationProperties(prefix = "wechat.mq.spider") 9 | public class SpiderProperties { 10 | 11 | /** 12 | * 图片存储域名 13 | */ 14 | private String imgUrlDomain; 15 | 16 | private String accessKey; 17 | 18 | private String secretKey; 19 | 20 | private String bucketName; 21 | 22 | public String getImgUrlDomain() { 23 | return imgUrlDomain; 24 | } 25 | 26 | public void setImgUrlDomain(String imgUrlDomain) { 27 | this.imgUrlDomain = imgUrlDomain; 28 | } 29 | 30 | public String getAccessKey() { 31 | return accessKey; 32 | } 33 | 34 | public void setAccessKey(String accessKey) { 35 | this.accessKey = accessKey; 36 | } 37 | 38 | public String getSecretKey() { 39 | return secretKey; 40 | } 41 | 42 | public void setSecretKey(String secretKey) { 43 | this.secretKey = secretKey; 44 | } 45 | 46 | public String getBucketName() { 47 | return bucketName; 48 | } 49 | 50 | public void setBucketName(String bucketName) { 51 | this.bucketName = bucketName; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/entity/WechatMq.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.entity; 2 | 3 | import javax.persistence.*; 4 | 5 | /** 6 | * Created by akinoneko on 2017/3/25. 
7 | * 微信公众号列表 8 | */ 9 | @Entity 10 | @Table(name = "wechat_mq") 11 | public class WechatMq { 12 | 13 | @Id 14 | @GeneratedValue(strategy = GenerationType.AUTO) 15 | private Integer id; 16 | 17 | //公众号唯一标识biz 18 | @Column 19 | private String biz = ""; 20 | 21 | //记录采集时间的时间戳 22 | @Column(length = 11) 23 | 24 | //公众号昵称 25 | private String name; 26 | 27 | //公众号头像 28 | private String icon; 29 | 30 | public String getName() { 31 | return name; 32 | } 33 | 34 | public void setName(String name) { 35 | this.name = name; 36 | } 37 | 38 | public String getIcon() { 39 | return icon; 40 | } 41 | 42 | public void setIcon(String icon) { 43 | this.icon = icon; 44 | } 45 | 46 | private Long collect = 1L; 47 | 48 | public Integer getId() { 49 | return id; 50 | } 51 | 52 | public void setId(Integer id) { 53 | this.id = id; 54 | } 55 | 56 | public String getBiz() { 57 | return biz; 58 | } 59 | 60 | public void setBiz(String biz) { 61 | this.biz = biz; 62 | } 63 | 64 | public Long getCollect() { 65 | return collect; 66 | } 67 | 68 | public void setCollect(Long collect) { 69 | this.collect = collect; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/task/ImageDownloadTask.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.task; 2 | 3 | import com.qiniu.common.QiniuException; 4 | import com.qiniu.common.Zone; 5 | import com.qiniu.http.Response; 6 | import com.qiniu.storage.BucketManager; 7 | import com.qiniu.storage.Configuration; 8 | import com.qiniu.util.Auth; 9 | import com.zyy.wechat.mq.spider.entity.SpiderConfig; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.springframework.beans.factory.annotation.Autowired; 13 | import org.springframework.scheduling.annotation.Scheduled; 14 | import org.springframework.stereotype.Component; 15 | 16 | import java.util.LinkedList; 17 | import 
java.util.List; 18 | import java.util.Queue; 19 | 20 | /** 21 | * Created by akinoneko on 2017/3/29. 22 | * 定时检查队列,下载图片 23 | */ 24 | @Component 25 | public class ImageDownloadTask { 26 | 27 | private final static Logger LOGGER = LoggerFactory.getLogger(ImageDownloadTask.class); 28 | 29 | private static Queue imageDownloadQueue = new LinkedList<>(); 30 | 31 | @Autowired 32 | private SpiderConfig spiderConfig; 33 | 34 | //五秒钟下载一次队列中的图片 35 | @Scheduled(fixedDelay = 5000L) 36 | public void downloadImage() { 37 | //图片抓取 38 | LOGGER.info("抓取图片到云存储服务器,队列长度" + imageDownloadQueue.size()); 39 | Zone zone = Zone.autoZone(); 40 | Configuration configuration = new Configuration(zone); 41 | Auth auth = Auth.create(spiderConfig.getAccessKey(), spiderConfig.getSecretKey()); 42 | BucketManager bucketManager = new BucketManager(auth, configuration); 43 | String[] urlKV = null; 44 | while ((urlKV = imageDownloadQueue.poll()) != null && urlKV.length == 2) { 45 | try { 46 | bucketManager.fetch(urlKV[0], spiderConfig.getBucketName(), urlKV[1]); 47 | } catch (QiniuException e) { 48 | Response response = e.response; 49 | LOGGER.error(response.toString()); 50 | } 51 | } 52 | } 53 | 54 | public static void addImageUrlToQueue(String[] url) { 55 | imageDownloadQueue.add(url); 56 | } 57 | 58 | 59 | public static void addImageUrlToQueue(List urls) { 60 | imageDownloadQueue.addAll(urls); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/utils/StringUtils.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.utils; 2 | 3 | /** 4 | * Created by akinoneko on 2017/4/13. 
5 | */ 6 | public class StringUtils extends org.springframework.util.StringUtils{ 7 | 8 | public static String filterEmoji(String source) { 9 | if (!containsEmoji(source)) { 10 | return source; 11 | } 12 | //含有emoji表情,过滤 13 | StringBuffer sb = null; 14 | int len = source.length(); 15 | for (int i = 0; i < len; i++) { 16 | char codePoint = source.charAt(i); 17 | if (isEmojiCharacter(codePoint)) { 18 | if (sb == null) { 19 | sb = new StringBuffer(source.length()); 20 | } 21 | sb.append(codePoint); 22 | } 23 | } 24 | 25 | if (sb == null) { 26 | return source; 27 | } else { 28 | if (sb.length() == len) { //优化 29 | sb = null; 30 | return source; 31 | } else { 32 | return sb.toString(); 33 | } 34 | } 35 | } 36 | 37 | private static boolean containsEmoji(String source) { 38 | if (isEmpty(source)) { 39 | return false; 40 | } 41 | 42 | int len = source.length(); 43 | 44 | for (int i = 0; i < len; i++) { 45 | char codePoint = source.charAt(i); 46 | if (isEmojiCharacter(codePoint)) { 47 | return true; 48 | } 49 | } 50 | return false; 51 | } 52 | 53 | private static boolean isEmojiCharacter(char codePoint) { 54 | return (codePoint == 0x0) || 55 | (codePoint == 0x9) || 56 | (codePoint == 0xA) || 57 | (codePoint == 0xD) || 58 | ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) || 59 | ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) || 60 | ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF)); 61 | } 62 | 63 | public static void main(String[] args) { 64 | String testStr = "\uD83D\uDE42\uD83D\uDE2D这是有🐩很多的emoji表情,\uD83D\uDE08\uD83D\uDE01你好,我是一个✋👂."; 65 | String res = StringUtils.filterEmoji(testStr); 66 | System.out.println(testStr); 67 | System.out.println(res); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/utils/QiniuUtil.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.utils; 2 | 3 | import 
com.qiniu.common.QiniuException; 4 | import com.qiniu.common.Zone; 5 | import com.qiniu.http.Response; 6 | import com.qiniu.storage.BucketManager; 7 | import com.qiniu.storage.Configuration; 8 | import com.qiniu.util.Auth; 9 | import com.zyy.wechat.mq.spider.entity.SpiderConfig; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.springframework.beans.factory.annotation.Autowired; 13 | import org.springframework.stereotype.Component; 14 | 15 | import javax.annotation.PostConstruct; 16 | import java.util.Map; 17 | 18 | /** 19 | * Created by akinoneko on 2017/3/28. 20 | */ 21 | @Component 22 | public class QiniuUtil { 23 | 24 | @Autowired 25 | private SpiderConfig spiderConfigAutoWired; 26 | 27 | private static SpiderConfig spiderConfig; 28 | 29 | private final static Logger LOGGER = LoggerFactory.getLogger(QiniuUtil.class); 30 | 31 | private static Zone zone = Zone.autoZone(); 32 | 33 | private static Auth auth; 34 | 35 | @PostConstruct 36 | private void init() { 37 | spiderConfig = spiderConfigAutoWired; 38 | auth = Auth.create(spiderConfig.getAccessKey(), spiderConfig.getSecretKey()); 39 | } 40 | 41 | public static void asyncDownloadImage(Map urls) { 42 | //异步下载图片 43 | new Thread(new DownloadImage(urls)).start(); 44 | 45 | 46 | } 47 | 48 | private static class DownloadImage implements Runnable { 49 | private Map urls; 50 | 51 | private DownloadImage(Map urls) { 52 | this.urls = urls; 53 | } 54 | 55 | @Override 56 | public void run() { 57 | //图片抓取 58 | LOGGER.debug("抓取图片到云存储服务器"); 59 | Configuration configuration = new Configuration(zone); 60 | BucketManager bucketManager = new BucketManager(auth, configuration); 61 | for (Map.Entry entry : urls.entrySet()) { 62 | try { 63 | bucketManager.fetch(entry.getKey(), spiderConfig.getBucketName(), entry.getValue()); 64 | } catch (QiniuException e) { 65 | Response response = e.response; 66 | LOGGER.error(response.toString()); 67 | } 68 | } 69 | } 70 | } 71 | } 72 | 
-------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | com.zyy 7 | wechat-mq-spider 8 | 0.0.1-SNAPSHOT 9 | jar 10 | 11 | wechat-mq-spider 12 | Wechat MQ Spider 13 | 14 | 15 | org.springframework.boot 16 | spring-boot-starter-parent 17 | 1.5.2.RELEASE 18 | 19 | 20 | 21 | 22 | UTF-8 23 | UTF-8 24 | 1.8 25 | 26 | 27 | 28 | 29 | org.springframework.boot 30 | spring-boot-starter-data-jpa 31 | 32 | 33 | org.springframework.boot 34 | spring-boot-starter-web 35 | 36 | 37 | org.springframework.boot 38 | spring-boot-starter-aop 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-configuration-processor 43 | true 44 | 45 | 46 | 47 | mysql 48 | mysql-connector-java 49 | runtime 50 | 51 | 52 | org.springframework.boot 53 | spring-boot-starter-test 54 | test 55 | 56 | 57 | com.alibaba 58 | fastjson 59 | 1.2.21 60 | 61 | 62 | 63 | 64 | org.jsoup 65 | jsoup 66 | 1.10.2 67 | 68 | 69 | 70 | 71 | com.qiniu 72 | qiniu-java-sdk 73 | 7.2.6 74 | 75 | 76 | 77 | 78 | 79 | 80 | org.springframework.boot 81 | spring-boot-maven-plugin 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/aspectj/ServiceLogAspect.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.aspectj; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import com.zyy.wechat.mq.spider.annotation.ServiceLog; 5 | import org.aspectj.lang.JoinPoint; 6 | import org.aspectj.lang.ProceedingJoinPoint; 7 | import org.aspectj.lang.annotation.*; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.stereotype.Component; 11 | 12 | import java.lang.reflect.Method; 13 | 14 | /** 15 | * Created by akinoneko on 2017/4/13. 
16 | */ 17 | @Aspect 18 | @Component 19 | public class ServiceLogAspect { 20 | 21 | private static final Logger LOGGER = LoggerFactory.getLogger(ServiceLogAspect.class); 22 | 23 | //service调用日志记录点 24 | @Pointcut("@annotation(com.zyy.wechat.mq.spider.annotation.ServiceLog)") 25 | public void serviceMethodLog() { 26 | } 27 | 28 | @Before("serviceMethodLog()") 29 | public void doServiceBefore(JoinPoint joinPoint) { 30 | 31 | } 32 | 33 | @Around("serviceMethodLog()") 34 | public void doServiceAround(ProceedingJoinPoint joinPoint) throws Throwable { 35 | String targetName = joinPoint.getTarget().getClass().getName(); 36 | String methodName = targetName + "." + joinPoint.getSignature().getName(); 37 | long start = System.currentTimeMillis(); 38 | joinPoint.proceed(); 39 | long executeTime = System.currentTimeMillis() - start; 40 | LOGGER.info("方法<{}>执行时间{}ms", methodName, executeTime); 41 | } 42 | 43 | @After("serviceMethodLog()") 44 | public void doServiceAfter(JoinPoint joinPoint) { 45 | 46 | } 47 | 48 | @AfterThrowing(pointcut = "serviceMethodLog()", throwing = "e") 49 | public void doServiceThrowing(JoinPoint joinPoint, Throwable e) throws Exception { 50 | String params = JSON.toJSONString(joinPoint.getArgs()); 51 | String targetName = joinPoint.getTarget().getClass().getName(); 52 | String methodName = targetName + "." 
+ joinPoint.getSignature().getName(); 53 | String message = "\r\n=====SERVICE异常记录开始=====" + 54 | "\r\n异常方法:" + methodName + 55 | "\r\n异常信息:" + e.getMessage() + 56 | "\r\n异常参数:" + params + 57 | "\r\n方法描述:" + getServiceMethodDescription(joinPoint) + 58 | "\r\n=====SERVICE异常记录结束====="; 59 | LOGGER.error("发生异常,异常信息{} {}", e.getMessage(), message); 60 | } 61 | 62 | /** 63 | * 解析注解中对于方法的描述信息 64 | * 65 | * @param joinPoint 切点 66 | * @return 方法描述 67 | * @throws Exception 68 | */ 69 | public static String getServiceMethodDescription(JoinPoint joinPoint) throws Exception { 70 | String targetName = joinPoint.getTarget().getClass().getName(); 71 | String methodName = joinPoint.getSignature().getName(); 72 | Object[] args = joinPoint.getArgs(); 73 | Class targetClass = Class.forName(targetName); 74 | Method[] methods = targetClass.getMethods(); 75 | String description = ""; 76 | for (Method method : methods) { 77 | if (method.getName().equals(methodName)) { 78 | Class[] clazzs = method.getParameterTypes(); 79 | if (clazzs.length == args.length) { 80 | description = method.getAnnotation(ServiceLog.class).description(); 81 | break; 82 | } 83 | } 84 | } 85 | return description; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/controller/SpiderController.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.controller; 2 | 3 | import com.zyy.wechat.mq.spider.service.ArticleService; 4 | import com.zyy.wechat.mq.spider.service.SpiderQueueService; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.web.bind.annotation.RequestMapping; 9 | import org.springframework.web.bind.annotation.RestController; 10 | import org.springframework.web.util.HtmlUtils; 11 | 12 | import java.io.IOException; 13 | import 
java.io.UnsupportedEncodingException; 14 | import java.net.URLDecoder; 15 | 16 | /** 17 | * Created by akinoneko on 2017/3/24. 18 | */ 19 | @RestController 20 | @RequestMapping("spider") 21 | public class SpiderController { 22 | 23 | private final static Logger LOGGER = LoggerFactory.getLogger(SpiderController.class); 24 | 25 | @Autowired 26 | private ArticleService articleService; 27 | @Autowired 28 | private SpiderQueueService spiderQueueService; 29 | 30 | @RequestMapping(value = "getWechatHistory") 31 | public Object getWechatHistory() throws IOException { 32 | LOGGER.debug("获取微信公众号历史消息"); 33 | LOGGER.debug("输出代理的内容"); 34 | String url = spiderQueueService.getHistoryPageNextUrl(); 35 | //注入js脚本到微信的网页中 36 | return ""; 37 | // return ""; 38 | } 39 | 40 | @RequestMapping(value = "getWechatMsgJson") 41 | public Object getWechatMsgJson(String str, String url) throws IOException { 42 | LOGGER.debug("获取微信公众号JSON消息"); 43 | LOGGER.debug("原始消息:" + str); 44 | url = URLDecoder.decode(url, "UTF-8"); 45 | if (url.contains("action=home")) { //首页内容是html转义内容需要转换一下 46 | str = HtmlUtils.htmlUnescape(URLDecoder.decode(str, "UTF-8")); 47 | } else { //其余页面均是JSON格式 48 | str = URLDecoder.decode(str, "UTF-8"); 49 | } 50 | articleService.parseWechatMqHistory(str, url); 51 | return "getWechatMsgJson request success"; 52 | } 53 | 54 | @RequestMapping(value = "getWechatMsgExt") 55 | public Object getWechatMsgExt(String str, String url) throws IOException { 56 | LOGGER.debug("获取微信公众号阅读量数据"); 57 | str = URLDecoder.decode(str, "UTF-8"); 58 | url = URLDecoder.decode(url, "UTF-8"); 59 | articleService.updateArticleReadNumAndLikeNum(str, url); 60 | return "getWechatMsgExt request success"; 61 | } 62 | 63 | @RequestMapping(value = "getWechatPost") 64 | public Object getWechatPost() throws IOException { 65 | LOGGER.debug("获取微信公众号下一跳地址"); 66 | String url = spiderQueueService.getArticlePageNextUrl(); 67 | //注入js脚本到微信的网页中 68 | if (url == null) { 69 | return ""; 70 | } else { 71 | return ""; 72 | } 73 | 
} 74 | 75 | @RequestMapping(value = "saveWechatArticle") 76 | public Object saveWechatArticle(String str, String url) throws UnsupportedEncodingException { 77 | LOGGER.debug("抓取文章内容与图片"); 78 | try { 79 | str = HtmlUtils.htmlUnescape(URLDecoder.decode(str, "UTF-8")); 80 | url = URLDecoder.decode(url, "UTF-8"); 81 | articleService.saveArticlePage(str, url); 82 | } catch (Exception e) { 83 | LOGGER.error("保存文章异常!url->" + url); 84 | } 85 | return "saveWechatArticle request success"; 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/service/SpiderQueueService.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.service; 2 | 3 | import com.zyy.wechat.mq.spider.annotation.ServiceLog; 4 | import com.zyy.wechat.mq.spider.dao.SpiderQueueRepository; 5 | import com.zyy.wechat.mq.spider.dao.WechatMqRepository; 6 | import com.zyy.wechat.mq.spider.entity.SpiderQueue; 7 | import com.zyy.wechat.mq.spider.entity.WechatMq; 8 | import org.springframework.beans.factory.annotation.Autowired; 9 | import org.springframework.data.domain.Page; 10 | import org.springframework.data.domain.PageRequest; 11 | import org.springframework.data.domain.Pageable; 12 | import org.springframework.data.domain.Sort; 13 | import org.springframework.stereotype.Service; 14 | 15 | import javax.transaction.Transactional; 16 | import java.util.List; 17 | 18 | /** 19 | * Created by akinoneko on 2017/3/25. 
20 | */ 21 | @Service 22 | public class SpiderQueueService { 23 | 24 | @Autowired 25 | private SpiderQueueRepository spiderQueueRepository; 26 | @Autowired 27 | private WechatMqRepository wechatMqRepository; 28 | 29 | @ServiceLog 30 | @Transactional 31 | public String getHistoryPageNextUrl() { 32 | String url = null; 33 | //删除loading为1的记录 34 | spiderQueueRepository.deleteLoading(); 35 | //取出队列中最早的一条记录 36 | Pageable pageable = new PageRequest(0, 10, Sort.Direction.ASC, "datetime"); 37 | Page spiderQueuePage = spiderQueueRepository.findByPage(pageable); 38 | List spiderQueueList = spiderQueuePage.getContent(); 39 | if (spiderQueueList.size() > 0) { 40 | SpiderQueue spiderQueue = spiderQueueList.get(0); 41 | spiderQueue.setLoading(1); 42 | spiderQueueRepository.save(spiderQueue); //更新loading为1 43 | url = spiderQueue.getContentUrl(); 44 | } else { 45 | //队列为空 46 | pageable = new PageRequest(0, 1, Sort.Direction.ASC, "collect"); 47 | Page wechatMqPage = wechatMqRepository.findByPage(pageable); 48 | List wechatMqList = wechatMqPage.getContent(); 49 | if (wechatMqList.size() > 0) { 50 | WechatMq wechatMq = wechatMqList.get(0); 51 | url = "http://mp.weixin.qq.com/mp/getmasssendmsg?__biz=" + wechatMq.getBiz() 52 | + "#wechat_webview_type=1&wechat_redirect"; 53 | wechatMq.setCollect(System.currentTimeMillis()); 54 | wechatMqRepository.save(wechatMq); 55 | } 56 | } 57 | return url; 58 | } 59 | 60 | @ServiceLog 61 | @Transactional 62 | public String getArticlePageNextUrl() { 63 | String url = null; 64 | spiderQueueRepository.deleteLoading(); 65 | Pageable pageable = new PageRequest(0, 10, Sort.Direction.ASC, "id"); 66 | Page spiderQueuePage = spiderQueueRepository.findByPage(pageable); 67 | List spiderQueueList = spiderQueuePage.getContent(); 68 | if (spiderQueueList.size() > 1) { //当队列还剩下一条的时候,从存储的公众号biz表中取出一个biz 69 | SpiderQueue spiderQueue = spiderQueueList.get(0); 70 | spiderQueue.setLoading(1); 71 | spiderQueueRepository.save(spiderQueue); //更新loading为1 72 | url = 
spiderQueue.getContentUrl(); 73 | } else { 74 | pageable = new PageRequest(0, 1, Sort.Direction.ASC, "collect"); 75 | Page wechatMqPage = wechatMqRepository.findByPage(pageable); 76 | List wechatMqList = wechatMqPage.getContent(); 77 | WechatMq wechatMq = wechatMqList.get(0); 78 | if (System.currentTimeMillis()-wechatMq.getCollect()<86400000L){ 79 | return null; 80 | } 81 | // url = "http://mp.weixin.qq.com/mp/getmasssendmsg?__biz=" + wechatMq.getBiz() 82 | // + "#wechat_webview_type=1&wechat_redirect"; 83 | url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=" + wechatMq.getBiz() 84 | + "&scene=124#wechat_redirect";//拼接公众号历史消息url地址(第二种页面形式) 85 | wechatMq.setCollect(System.currentTimeMillis()); 86 | wechatMqRepository.save(wechatMq); 87 | } 88 | return url; 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/entity/Article.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.entity; 2 | 3 | import javax.persistence.*; 4 | 5 | /** 6 | * Created by akinoneko on 2017/3/25. 
7 | * 公众号文章 8 | */ 9 | @Entity 10 | @Table(name = "wechat_article") 11 | public class Article { 12 | 13 | @Id 14 | @GeneratedValue(strategy = GenerationType.AUTO) 15 | private Integer id; 16 | 17 | //文章对应的公众号biz 18 | @Column(nullable = false) 19 | private String biz; 20 | 21 | //微信定义的一个id,每条文章唯一 22 | @Column(nullable = false) 23 | private Long fieldId; 24 | 25 | //文章标题 26 | private String title = ""; 27 | 28 | //文章编码,防止文章出现emoji 29 | private String title_encode; 30 | 31 | //文章摘要 32 | private String digest = ""; 33 | 34 | //文章地址 35 | private String contentUrl; 36 | 37 | //阅读原文地址 38 | private String sourceUrl; 39 | 40 | //封面图片 41 | private String cover; 42 | 43 | //是否多图文 44 | private Integer isMulti; 45 | 46 | //是否头条 47 | private Integer isTop; 48 | 49 | //文章时间戳 50 | private Long datetime; 51 | 52 | //文章阅读量 53 | private Integer readNum = 1; 54 | 55 | //文章点赞量 56 | private Integer likeNum = 0; 57 | 58 | private String content; 59 | 60 | private String author; 61 | 62 | private Long createTime; 63 | 64 | private Long updateTime; 65 | 66 | public Integer getId() { 67 | return id; 68 | } 69 | 70 | public void setId(Integer id) { 71 | this.id = id; 72 | } 73 | 74 | public String getBiz() { 75 | return biz; 76 | } 77 | 78 | public void setBiz(String biz) { 79 | this.biz = biz; 80 | } 81 | 82 | public Long getFieldId() { 83 | return fieldId; 84 | } 85 | 86 | public void setFieldId(Long fieldId) { 87 | this.fieldId = fieldId; 88 | } 89 | 90 | public String getTitle() { 91 | return title; 92 | } 93 | 94 | public void setTitle(String title) { 95 | this.title = title; 96 | } 97 | 98 | public String getTitle_encode() { 99 | return title_encode; 100 | } 101 | 102 | public void setTitle_encode(String title_encode) { 103 | this.title_encode = title_encode; 104 | } 105 | 106 | public String getDigest() { 107 | return digest; 108 | } 109 | 110 | public void setDigest(String digest) { 111 | this.digest = digest; 112 | } 113 | 114 | public String getContentUrl() { 115 | return 
contentUrl; 116 | } 117 | 118 | public void setContentUrl(String contentUrl) { 119 | this.contentUrl = contentUrl; 120 | } 121 | 122 | public String getSourceUrl() { 123 | return sourceUrl; 124 | } 125 | 126 | public void setSourceUrl(String sourceUrl) { 127 | this.sourceUrl = sourceUrl; 128 | } 129 | 130 | public String getCover() { 131 | return cover; 132 | } 133 | 134 | public void setCover(String cover) { 135 | this.cover = cover; 136 | } 137 | 138 | public Integer getIsMulti() { 139 | return isMulti; 140 | } 141 | 142 | public void setIsMulti(Integer isMulti) { 143 | this.isMulti = isMulti; 144 | } 145 | 146 | public Integer getIsTop() { 147 | return isTop; 148 | } 149 | 150 | public void setIsTop(Integer isTop) { 151 | this.isTop = isTop; 152 | } 153 | 154 | public Long getDatetime() { 155 | return datetime; 156 | } 157 | 158 | public void setDatetime(Long datetime) { 159 | this.datetime = datetime; 160 | } 161 | 162 | public Integer getReadNum() { 163 | return readNum; 164 | } 165 | 166 | public void setReadNum(Integer readNum) { 167 | this.readNum = readNum; 168 | } 169 | 170 | public Integer getLikeNum() { 171 | return likeNum; 172 | } 173 | 174 | public void setLikeNum(Integer likeNum) { 175 | this.likeNum = likeNum; 176 | } 177 | 178 | public String getContent() { 179 | return content; 180 | } 181 | 182 | public void setContent(String content) { 183 | this.content = content; 184 | } 185 | 186 | public String getAuthor() { 187 | return author; 188 | } 189 | 190 | public void setAuthor(String author) { 191 | this.author = author; 192 | } 193 | 194 | public Long getCreateTime() { 195 | return createTime; 196 | } 197 | 198 | public void setCreateTime(Long createTime) { 199 | this.createTime = createTime; 200 | } 201 | 202 | public Long getUpdateTime() { 203 | return updateTime; 204 | } 205 | 206 | public void setUpdateTime(Long updateTime) { 207 | this.updateTime = updateTime; 208 | } 209 | } 210 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### 爬虫配置项说明

##### 1. 代理服务器配置

使用Anyproxy作为代理服务器,需要安装Node环境,安装anyproxy

```shell
npm -g install anyproxy
```

修改anyproxy的规则文件,macOS下为`/usr/local/lib/node_modules/anyproxy/lib/`,修改rule_default.js中的replaceServerResDataAsync方法

```javascript
// Body of replaceServerResDataAsync. `req`, `serverResData` and `callback` are that method's
// arguments; HttpPost is the function added at the bottom of rule_default.js (see below).
var replied = false;

// Answer the proxied request exactly once.
function reply(data) {
    if (!replied) {
        replied = true;
        callback(data);
    }
}

// Ask the spider server (127.0.0.1:8080) for the snippet to inject, then reply with it joined
// to the page: before the page when `prepend` is true, after it otherwise.
function injectNextHop(path, prepend) {
    var chunks = [];
    require('http').get('http://127.0.0.1:8080' + path, function(res) {
        res.on('data', function(chunk) {
            chunks.push(chunk);
        });
        // Reply on 'end', not on 'data': an empty body emits no 'data' event at all, which
        // would leave the proxied request unanswered, and a long body emits several.
        res.on('end', function() {
            var snippet = Buffer.concat(chunks).toString();
            reply(prepend ? snippet + serverResData : serverResData + snippet);
        });
    }).on('error', function(e) {
        // Spider server unreachable: pass the page through unchanged instead of crashing.
        console.log('problem with request: ' + e.message);
        reply(serverResData);
    });
}

// Later history pages are JSON; forward their general_msg_list to the spider server.
function forwardMsgListJson() {
    try {
        var json = JSON.parse(serverResData.toString());
        if (json.general_msg_list) {
            HttpPost(json.general_msg_list, req.url, "/spider/getWechatMsgJson.do");
        }
    } catch (e) {
        console.log(e);
    }
}

if (/mp\/getmasssendmsg/i.test(req.url)) { // account history page (first page form)
    var body = serverResData.toString();
    var firstPage = /msgList = (.*?);/.exec(body); // the first page is HTML with an embedded msgList
    if (firstPage) {
        try {
            HttpPost(firstPage[1], req.url, "/spider/getWechatMsgJson.do"); // send the history JSON to the spider server
            injectNextHop("/spider/getWechatHistory.do", true); // insert the returned code into the page
        } catch (e) {
            console.log(e);
            reply(serverResData);
        }
    } else {
        // No msgList: this is probably a page loaded while scrolling down, which is JSON.
        if (body !== "") {
            forwardMsgListJson();
        }
        reply(serverResData);
    }
} else if (/mp\/profile_ext\?action=home/i.test(req.url)) { // account history page (second page form)
    var homePage = /var msgList = \'(.*?)\';/.exec(serverResData.toString()); // differs from the first form's pattern
    if (homePage) {
        try {
            HttpPost(homePage[1], req.url, "/spider/getWechatMsgJson.do");
            injectNextHop("/spider/getWechatHistory.do", false);
        } catch (e) {
            console.log(e);
            reply(serverResData);
        }
    } else {
        reply(serverResData);
    }
} else if (/mp\/profile_ext\?action=getmsg/i.test(req.url)) { // JSON returned when scrolling the second page form
    forwardMsgListJson();
    reply(serverResData);
} else if (/mp\/getappmsgext/i.test(req.url)) { // read count / like count of an article
    try {
        HttpPost(serverResData, req.url, "/spider/getWechatMsgExt.do");
    } catch (e) {
        console.log(e);
    }
    reply(serverResData);
} else if (/s\?__biz/i.test(req.url) || /mp\/rumor/i.test(req.url)) { // article page (rumor: the article was flagged as a rumour)
    try {
        HttpPost(serverResData.toString(), req.url, "/spider/saveWechatArticle.do"); // send the article HTML to the spider server
        injectNextHop("/spider/getWechatPost.do", false);
    } catch (e) {
        console.log(e);
        reply(serverResData);
    }
} else {
    reply(serverResData);
}
```

在rule_default.js文件的底部加上

| ```javascript 104 | function HttpPost(str, url, path) { //将json发送到服务器,str为json内容,url为历史消息页面地址,path是接收程序的路径和文件名 105 | var http = require('http'); 106 | var data = { 107 | str: encodeURIComponent(str), 108 | url: encodeURIComponent(url) 109 | }; 110 | content = require('querystring').stringify(data); 111 | var options = { 112 | method: "POST", 113 | host: "www.domain.com", 114 | //注意没有http://,这是服务器的域名。 115 | port: 80, 116 | path: path, 117 | //接收程序的路径和文件名 118 | headers: { 119 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 120 | "Content-Length": content.length 121 | } 122 | }; 123 | var req = http.request(options, 124 | function(res) { 125 | res.setEncoding('utf8'); 126 | res.on('data', 127 | function(chunk) { 128 | console.log('BODY: ' + chunk); 129 | }); 130 | }); 131 | req.on('error', 132 | function(e) { 133 | console.log('problem with request: ' + e.message); 134 | }); 135 | req.write(content); 136 | req.end(); 137 | } 138 | ``` 139 | 140 | `sudo anyproxy --root`生成证书后,使用`sudo anyproxy -i`启动程序,在终端上设置WiFi的代理地址为代理服务器的地址,端口8001,终端访问[http://localhost:8002/fetchCrtFile]([http://localhost:8002/fetchCrtFile)安装证书 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /src/main/java/com/zyy/wechat/mq/spider/service/ArticleService.java: -------------------------------------------------------------------------------- 1 | package com.zyy.wechat.mq.spider.service; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.alibaba.fastjson.JSONObject; 5 | import com.zyy.wechat.mq.spider.annotation.ServiceLog; 6 | import com.zyy.wechat.mq.spider.dao.ArticleRepository; 7 | import com.zyy.wechat.mq.spider.dao.SpiderQueueRepository; 8 | import com.zyy.wechat.mq.spider.dao.WechatMqRepository; 9 | import com.zyy.wechat.mq.spider.entity.Article; 10 | import com.zyy.wechat.mq.spider.entity.SpiderConfig; 11 | import com.zyy.wechat.mq.spider.entity.SpiderQueue; 12 | import 
com.zyy.wechat.mq.spider.entity.WechatMq;
import com.zyy.wechat.mq.spider.task.ImageDownloadTask;
import com.zyy.wechat.mq.spider.utils.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.dao.DataIntegrityViolationException;
import org.springframework.stereotype.Service;
import org.springframework.web.util.HtmlUtils;

import javax.transaction.Transactional;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Stores what the crawler intercepts: article metadata from account history pages, read/like
 * counts, and article bodies together with the images they reference.
 *
 * Created by akinoneko on 2017/3/25.
 */
@Service
public class ArticleService {

    private static final Logger LOGGER = LoggerFactory.getLogger(ArticleService.class);

    /** {@code comm_msg_info.type} value of an article (image-and-text) message. */
    private static final int MSG_TYPE_ARTICLE = 49;

    /** Matches the account avatar URL embedded in an article page's script. */
    private static final Pattern HEAD_IMG_PATTERN =
            Pattern.compile("var ori_head_img_url = \"(.*?)\";");

    /** Matches the account nickname embedded in an article page's script. */
    private static final Pattern NICKNAME_PATTERN = Pattern.compile("var nickname = \"(.*?)\";");

    @Autowired
    private ArticleRepository articleRepository;
    @Autowired
    private SpiderQueueRepository spiderQueueRepository;
    @Autowired
    private WechatMqRepository wechatMqRepository;

    @Autowired
    private SpiderConfig spiderConfig;

    /**
     * Parses one page of an account's message history, registers the account if it is unknown,
     * and stores every article not seen before together with a crawl-queue entry for it.
     *
     * @param str JSON document with a {@code list} array of messages
     * @param url address of the history request; its {@code __biz} parameter names the account
     * @throws UnsupportedEncodingException if UTF-8 is not supported when encoding titles
     */
    @ServiceLog
    public void parseWechatMqHistory(String str, String url) throws UnsupportedEncodingException {
        String biz = queryParam(url, "__biz");
        if (biz == null) {
            // Without a biz neither the account nor its articles can be stored.
            LOGGER.warn("No __biz parameter in url {}", url);
            return;
        }
        // Make sure the official account is registered.
        if (wechatMqRepository.findByBiz(biz) == null) {
            WechatMq wechatMq = new WechatMq();
            wechatMq.setBiz(biz);
            wechatMq.setCollect(System.currentTimeMillis());
            wechatMqRepository.save(wechatMq);
        }
        JSONArray list;
        try {
            JSONObject json = JSONObject.parseObject(str);
            list = json == null ? null : json.getJSONArray("list");
        } catch (Exception e) {
            LOGGER.error("JSON数据解析失败", e);
            return;
        }
        if (list == null) {
            return;
        }
        for (Object object : list) {
            JSONObject item = (JSONObject) object;
            JSONObject commMsgInfo = item.getJSONObject("comm_msg_info");
            JSONObject appMsgExtInfo = item.getJSONObject("app_msg_ext_info");
            if (commMsgInfo == null || appMsgExtInfo == null
                    || commMsgInfo.getIntValue("type") != MSG_TYPE_ARTICLE) {
                continue; // only article messages are stored
            }
            int isMulti = appMsgExtInfo.getIntValue("is_multi");
            long datetime = commMsgInfo.getLongValue("datetime");
            saveArticleIfNew(appMsgExtInfo, biz, isMulti, datetime, true);
            if (isMulti > 0) {
                JSONArray subItems = appMsgExtInfo.getJSONArray("multi_app_msg_item_list");
                if (subItems != null) {
                    for (Object subItem : subItems) {
                        saveArticleIfNew((JSONObject) subItem, biz, isMulti, datetime, false);
                    }
                }
            }
        }
    }

    /**
     * Updates the read and like counts of the article identified by the {@code sn} and
     * {@code __biz} parameters of {@code url}, then removes the article from the crawl queue.
     * Failures are logged and not propagated.
     *
     * @param str JSON document containing an {@code appmsgstat} object
     * @param url address of the intercepted statistics request
     */
    @ServiceLog
    @Transactional
    public void updateArticleReadNumAndLikeNum(String str, String url) {
        String biz = queryParam(url, "__biz");
        String sn = queryParam(url, "sn");
        try {
            JSONObject json = JSONObject.parseObject(str);
            JSONObject appMsgStat = json == null ? null : json.getJSONObject("appmsgstat");
            if (appMsgStat == null) {
                LOGGER.warn("No appmsgstat in response for url {}", url);
                return;
            }
            int readNum = appMsgStat.getIntValue("read_num");
            int likeNum = appMsgStat.getIntValue("like_num");
            Article article = articleRepository.findBySnAndBiz(sn, biz);
            if (article != null) {
                article.setReadNum(readNum);
                article.setLikeNum(likeNum);
                article.setUpdateTime(System.currentTimeMillis());
                articleRepository.save(article);
                // The article has been visited: remove it from the crawl queue.
                spiderQueueRepository.deleteBySn(sn);
            }
        } catch (Exception e) {
            LOGGER.error("SN=" + sn + "\nBIZ=" + biz + "\nSTR=" + str + "\nURL=" + url, e);
        }
    }

    /**
     * Stores the body and author of an article page, refreshes the account's name and avatar,
     * and queues the referenced images for download the first time the body is stored.
     * Does nothing when the page has no {@code js_content} element.
     *
     * @param str HTML of the article page
     * @param url address of the article; its {@code sn} and {@code __biz} parameters identify it
     */
    @ServiceLog
    @Transactional
    public void saveArticlePage(String str, String url) {
        String biz = queryParam(url, "__biz");
        String sn = queryParam(url, "sn");
        Document page = Jsoup.parse(str);
        Element contentElement = page.getElementById("js_content");
        if (contentElement == null) {
            return; // the page carries no article body
        }
        Element postUserElement = page.getElementById("post-user");
        String postUser = postUserElement == null
                ? null
                : StringUtils.filterEmoji(postUserElement.text().trim());

        // Each entry is {original address, new name}.
        List<String[]> urls = new ArrayList<>();
        // Rewrite the images in the DOM rather than by searching the serialised HTML:
        // the search failed on empty, repeated or '&'-containing addresses.
        for (Element img : contentElement.getElementsByTag("img")) {
            String dataSrc = img.attr("data-src");
            if (dataSrc.isEmpty()) {
                continue;
            }
            int query = dataSrc.lastIndexOf('?');
            String originSrc = query < 0 ? dataSrc : dataSrc.substring(0, query);
            String newSrc = "origin=" + originSrc;
            img.attr("data-src", "http://" + spiderConfig.getImgUrlDomain() + "/" + newSrc);
            urls.add(new String[]{originSrc, newSrc});
        }
        // Switch video and image addresses to their displayable form.
        String html = contentElement.toString()
                .replace("preview.html", "player.html")
                .replace("data-src", "src");
        String content = Jsoup.parse(html).getElementById("js_content").html().trim();

        boolean imgDown = false;
        Article article = articleRepository.findBySnAndBiz(sn, biz);
        if (article != null) {
            imgDown = article.getContent() == null; // download images only on the first save
            article.setAuthor(postUser);
            article.setContent(content);
            article.setUpdateTime(System.currentTimeMillis());
            articleRepository.save(article);
            LOGGER.info("更新文章内容和图片,标题:" + article.getTitle());
        }

        // Account nickname and avatar, taken from the page's script variables.
        String pageHtml = page.toString();
        String icon = lastMatch(HEAD_IMG_PATTERN, pageHtml);
        if (icon != null) {
            int http = icon.indexOf("http");
            if (http > 0) {
                icon = icon.substring(http);
            } else if (icon.isEmpty()) {
                icon = null;
            }
        }
        String name = lastMatch(NICKNAME_PATTERN, pageHtml);
        if (name != null && icon != null) {
            urls.add(new String[]{icon, "origin=" + icon}); // download the avatar as well
            WechatMq wechatMq = wechatMqRepository.findByBiz(biz);
            if (wechatMq != null
                    && (!name.equals(wechatMq.getName()) || !icon.equals(wechatMq.getIcon()))) {
                wechatMq.setName(name);
                // NOTE(review): unlike the image addresses above, no "http://" prefix is added
                // here, and the stored value is compared with the raw icon URL, so the account
                // is re-saved on every article - confirm whether this is intended.
                wechatMq.setIcon(spiderConfig.getImgUrlDomain() + "/origin=" + icon);
                wechatMqRepository.save(wechatMq);
            }
        }
        if (imgDown) {
            ImageDownloadTask.addImageUrlToQueue(urls);
        }
    }

    /**
     * Builds and saves an article plus its crawl-queue entry, unless an article with the same
     * content URL is already stored. Save failures are logged and not propagated.
     *
     * @param info     JSON object describing the article
     * @param headline {@code true} for the headline of a message, {@code false} for a sub-article
     */
    private void saveArticleIfNew(JSONObject info, String biz, int isMulti, long datetime,
                                  boolean headline) throws UnsupportedEncodingException {
        String contentUrl = unescapeUrl(info.getString("content_url"));
        if (contentUrl == null || articleRepository.findOneByContentUrl(contentUrl) != null) {
            return;
        }
        String rawTitle = info.getString("title");
        if (rawTitle == null) {
            rawTitle = "";
        }
        Article article = new Article();
        article.setBiz(biz);
        article.setIsMulti(isMulti);
        article.setDatetime(datetime);
        article.setContentUrl(contentUrl);
        article.setFieldId(info.getLong("fileid"));
        // Titles are unescaped for sub-articles too, as already done for their digests.
        article.setTitle(StringUtils.filterEmoji(HtmlUtils.htmlUnescape(rawTitle)));
        // NOTE(review): confirm whether the replaced character was meant to be &nbsp;.
        article.setTitle_encode(URLEncoder.encode(rawTitle.replace(" ", ""), "UTF-8"));
        article.setDigest(HtmlUtils.htmlUnescape(info.getString("digest")));
        article.setSourceUrl(unescapeUrl(info.getString("source_url")));
        article.setCover(unescapeUrl(info.getString("cover")));
        if (headline) {
            article.setIsTop(1);
        }
        long now = System.currentTimeMillis();
        article.setCreateTime(now);
        article.setUpdateTime(now);
        try {
            SpiderQueue spiderQueue = new SpiderQueue();
            spiderQueue.setContentUrl(contentUrl);
            spiderQueue.setDatetime(System.currentTimeMillis());
            spiderQueueRepository.save(spiderQueue);
            articleRepository.save(article);
            LOGGER.info((headline ? "头条标题:" : "标题:") + article.getTitle());
        } catch (DataIntegrityViolationException e) {
            LOGGER.debug("数据库已经存在该记录,插入失败!");
        } catch (Exception e) {
            LOGGER.error("文章保存失败", e);
        }
    }

    /** HTML-unescapes {@code raw} and strips backslashes; returns {@code null} for {@code null}. */
    private static String unescapeUrl(String raw) {
        return raw == null ? null : HtmlUtils.htmlUnescape(raw).replace("\\", "");
    }

    /**
     * Returns the value of the first query parameter called {@code key} in {@code url},
     * or {@code null} when it is absent. The value is everything after the first {@code '='}.
     */
    private static String queryParam(String url, String key) {
        for (String param : url.substring(url.indexOf("?") + 1).split("&")) {
            if (key.equals(param.split("=")[0])) {
                return param.substring(param.indexOf("=") + 1);
            }
        }
        return null;
    }

    /** Returns group 1 of the last match of {@code pattern} in {@code text}, or {@code null}. */
    private static String lastMatch(Pattern pattern, String text) {
        String found = null;
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            found = matcher.group(1);
        }
        return found;
    }

}
--------------------------------------------------------------------------------