├── .gitignore ├── modules ├── src │ └── main │ │ └── java │ │ └── com │ │ └── fullstackyang │ │ └── crawler │ │ └── weibo │ │ ├── parser │ │ ├── WeiboBaseHandler.java │ │ ├── feed │ │ │ ├── WeiboFeedHandler.java │ │ │ ├── FeedMidHandler.java │ │ │ ├── FeedPubTimeHandler.java │ │ │ ├── FeedNicknameHandler.java │ │ │ ├── FeedOriginPubTimeHandler.java │ │ │ ├── FeedOriginMidlHandler.java │ │ │ ├── FeedUrlHandler.java │ │ │ ├── FeedOriginUrlHandler.java │ │ │ ├── FeedOidHandler.java │ │ │ ├── FeedOriginForwardCountHandler.java │ │ │ ├── FeedForwardCountHandler.java │ │ │ ├── FeedOriginContentHandler.java │ │ │ ├── FeedContentHandler.java │ │ │ ├── FeedHasOriginHandler.java │ │ │ └── WeiboFeedParser.java │ │ ├── user │ │ │ ├── WeiboUserHandler.java │ │ │ ├── ForwardUserHandler.java │ │ │ └── WeiboForwardParser.java │ │ ├── detail │ │ │ ├── WeiboDetailHandler.java │ │ │ ├── DetailContentHandler.java │ │ │ └── WeiboDetailParser.java │ │ └── WeiboBaseParser.java │ │ ├── dto │ │ ├── OriginWeiboFeed.java │ │ ├── WeiboDetail.java │ │ ├── WeiboUser.java │ │ ├── NormalWeiboFeed.java │ │ ├── WeiboFeed.java │ │ ├── converter │ │ │ └── DetailFeedConverter.java │ │ └── AbstractDTO.java │ │ └── client │ │ └── WeiboClient.java └── pom.xml ├── commons ├── src │ └── main │ │ └── java │ │ └── com │ │ └── fullstackyang │ │ └── crawler │ │ └── weibo │ │ └── utils │ │ ├── EncodeConvertor.java │ │ └── DateConvertor.java └── pom.xml └── pom.xml /.gitignore: -------------------------------------------------------------------------------- 1 | #IDEA 2 | .idea/ 3 | *.iml 4 | 5 | #java 6 | *.jar 7 | *.war 8 | *.ear 9 | 10 | #log 11 | logs/ 12 | *log* 13 | 14 | #maven 15 | target/ 16 | 17 | #property files 18 | config/ 19 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/parser/WeiboBaseHandler.java: -------------------------------------------------------------------------------- 1 | package 
com.fullstackyang.crawler.weibo.parser;

import org.jsoup.nodes.Element;

/**
 * Contract for a handler that fills one piece of a DTO from an HTML element.
 *
 * @param <T> type of the DTO the handler populates
 */
public interface WeiboBaseHandler<T> {

    /**
     * @return the field name this handler is registered under; the parsers look handlers up
     *         by the names of the DTO's declared fields
     */
    String getFieldName();

    /**
     * Reads data from {@code element} and stores it on {@code t}.
     *
     * @param t       DTO to populate
     * @param element HTML element to read from
     */
    void parse(T t, Element element);
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/WeiboFeedHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import com.fullstackyang.crawler.weibo.parser.WeiboBaseHandler;

/**
 * Marker interface for handlers that populate a {@link WeiboFeed}.
 */
public interface WeiboFeedHandler extends WeiboBaseHandler<WeiboFeed> {

}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/user/WeiboUserHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.user;

import com.fullstackyang.crawler.weibo.dto.WeiboUser;
import com.fullstackyang.crawler.weibo.parser.WeiboBaseHandler;

/**
 * Marker interface for handlers that populate a {@link WeiboUser}.
 */
public interface WeiboUserHandler extends WeiboBaseHandler<WeiboUser> {

}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/detail/WeiboDetailHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.detail;

import com.fullstackyang.crawler.weibo.dto.WeiboDetail;
import com.fullstackyang.crawler.weibo.parser.WeiboBaseHandler;

/**
 * Marker interface for handlers that populate a {@link WeiboDetail}.
 */
public interface WeiboDetailHandler extends WeiboBaseHandler<WeiboDetail> {

}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/dto/OriginWeiboFeed.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.dto;

import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;

/**
 * Feed DTO for the original post that a forwarded weibo refers to.
 *
 * <p>Equality delegates to {@link WeiboFeed} (mid and url); this class declares no fields of
 * its own, so without {@code callSuper} every instance would compare equal.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class OriginWeiboFeed extends WeiboFeed {

}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/dto/WeiboDetail.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.dto;

import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;

/**
 * Feed DTO carrying the text extracted from a weibo detail page.
 *
 * <p>Equality combines the {@link WeiboFeed} identity (mid and url) with {@code detail}.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class WeiboDetail extends WeiboFeed {

    private String detail;
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/dto/WeiboUser.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.dto;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;

/**
 * DTO describing a weibo user.
 */
@Data
@EqualsAndHashCode(callSuper = false) // AbstractDTO holds no state; stated explicitly like the sibling DTOs
@NoArgsConstructor
@AllArgsConstructor
public class WeiboUser extends AbstractDTO {

    private String nickname;

    private String url;

    private String pageid;

    /**
     * @return the home page URL built from {@code pageid}
     */
    public String getHomeUrl() {
        return "http://www.weibo.com/p/" + pageid + "/home";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/dto/NormalWeiboFeed.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.dto;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;

/**
 * Feed DTO for an entry of a user's feed list.
 *
 * <p>Equality combines the {@link WeiboFeed} identity (mid and url) with the fields below.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
@AllArgsConstructor
public class NormalWeiboFeed extends WeiboFeed {

    private String oid; // pid of the user who published the weibo

    private boolean isSticky; // pinned to the top
    private boolean isOrigin; // whether the post is original (not a forward)

    private OriginWeiboFeed originWeibo;
    private boolean originRemoved; // the source post may have been deleted

}
--------------------------------------------------------------------------------
/commons/src/main/java/com/fullstackyang/crawler/weibo/utils/EncodeConvertor.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.utils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Decodes textual Unicode escapes (a backslash, the letter u and four hex digits) into the
 * characters they denote.
 */
public class EncodeConvertor {

    /** Group 1 is the whole escape sequence, group 2 its four hex digits. */
    private static final Pattern pattern = Pattern.compile("(\\\\u(\\p{XDigit}{4}))");

    /**
     * Replaces every escape sequence in {@code str} with the character it encodes.
     *
     * <p>The input is scanned exactly once, so a character produced by decoding is never
     * combined with the following text and decoded a second time.
     *
     * @param str text that may contain escape sequences; may be {@code null}
     * @return the decoded text, or {@code str} itself when it is {@code null} or empty
     */
    public static String unicode2String(String str) {
        if (str == null || str.isEmpty()) {
            return str;
        }
        Matcher matcher = pattern.matcher(str);
        // StringBuffer: the StringBuilder overload of appendReplacement needs Java 9+.
        StringBuffer decoded = new StringBuffer(str.length());
        while (matcher.find()) {
            char ch = (char) Integer.parseInt(matcher.group(2), 16);
            // quoteReplacement: the decoded character may itself be '\' or '$'.
            matcher.appendReplacement(decoded, Matcher.quoteReplacement(String.valueOf(ch)));
        }
        matcher.appendTail(decoded);
        return decoded.toString();
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/user/ForwardUserHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.user;

import com.fullstackyang.crawler.weibo.dto.WeiboUser;
import org.jsoup.nodes.Element;

/**
 * Extracts the profile URL of the user shown in a forward-list item.
 */
public class ForwardUserHandler implements WeiboUserHandler {

    @Override
    public String getFieldName() {
        return "url";
    }

    /**
     * Sets {@code url} from the first {@code div.WB_face a[usercard]} link; does nothing when
     * no such link exists.
     *
     * @param weiboUser user DTO to populate
     * @param element   forward-list item element
     */
    @Override
    public void parse(WeiboUser weiboUser, Element element) {
        Element userElement = element.select("div.WB_face a[usercard]").first();
        if (userElement == null) {
            return;
        }
        String href = userElement.attr("href");
        String url;
        if (href.startsWith("http://") || href.startsWith("https://")) {
            url = href; // already absolute, do not prepend the host again
        } else if (href.startsWith("//")) {
            url = "http:" + href; // protocol-relative link
        } else {
            url = "http://weibo.com" + href;
        }
        weiboUser.setUrl(url);
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/detail/DetailContentHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.detail;

import com.fullstackyang.crawler.weibo.dto.WeiboDetail;
import org.jsoup.nodes.Element;

/**
 * Extracts the text of a weibo from its detail page.
 */
public class DetailContentHandler implements WeiboDetailHandler {

    /**
     * Sets {@code detail} to the trimmed text of the first
     * {@code div.WB_feed_detail div[node-type=feed_list_content]} element, if present.
     *
     * @param weiboDetail detail DTO to populate
     * @param element     detail page element
     */
    @Override
    public void parse(WeiboDetail weiboDetail, Element element) {
        Element contentElement = element.select("div.WB_feed_detail div[node-type=feed_list_content]").first();
        if (contentElement != null) {
            weiboDetail.setDetail(contentElement.text().trim());
        }
    }

    @Override
    public String getFieldName() {
        return "detail";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedMidHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import org.jsoup.nodes.Element;

/**
 * Extracts the {@code mid} attribute of a feed item.
 */
public class FeedMidHandler implements WeiboFeedHandler {

    /**
     * The mid is the parameter required when requesting a weibo's comments and forwards.
     *
     * @param weiboFeed feed DTO to populate
     * @param element   feed item element
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element midElement = element.select("div[action-type=feed_list_item]").first();
        if (midElement != null) {
            weiboFeed.setMid(midElement.attr("mid"));
        }
    }

    @Override
    public String getFieldName() {
        return "mid";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedPubTimeHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import com.fullstackyang.crawler.weibo.utils.DateConvertor;
import org.jsoup.nodes.Element;

import java.time.LocalDateTime;

/**
 * Extracts the publication time of a feed item.
 */
public class FeedPubTimeHandler implements WeiboFeedHandler {

    /**
     * Converts the text of the first {@code a[node-type=feed_list_item_date]} link with
     * {@link DateConvertor#convert} and stores it as {@code pubTime}. Does nothing when the
     * link is missing, matching the other feed handlers.
     *
     * @param weiboFeed feed DTO to populate
     * @param element   feed item element
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element timeElement = element.select("a[node-type=feed_list_item_date]").first();
        if (timeElement == null) {
            return;
        }
        LocalDateTime datetime = DateConvertor.convert(timeElement.text().trim());
        weiboFeed.setPubTime(datetime);
    }

    @Override
    public String getFieldName() {
        return "pubTime";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedNicknameHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import org.jsoup.nodes.Element;

/**
 * Extracts the author's nickname of a feed item.
 */
public class FeedNicknameHandler implements WeiboFeedHandler {

    /**
     * Stores the text of the first {@code div.WB_info a[suda-uatrack]} link as the nickname,
     * dropping a leading {@code @} if there is one. Does nothing when the link is missing.
     *
     * @param weiboFeed feed DTO to populate
     * @param element   feed item element
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element nicknameElement = element.select("div.WB_info a[suda-uatrack]").first();
        if (nicknameElement == null) {
            return;
        }
        String nickname = nicknameElement.text();
        if (nickname.startsWith("@")) {
            nickname = nickname.substring(1);
        }
        weiboFeed.setNickname(nickname);
    }

    @Override
    public String getFieldName() {
        return "nickname";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedOriginPubTimeHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import com.fullstackyang.crawler.weibo.utils.DateConvertor;
import org.jsoup.nodes.Element;

/**
 * Extracts the publication time of the original post embedded in a forwarded feed item.
 */
public class FeedOriginPubTimeHandler implements WeiboFeedHandler {

    /**
     * Converts the trimmed text of the first
     * {@code .WB_func .WB_from a[node-type=feed_list_item_date]} link and stores it as
     * {@code pubTime}; leaves the feed untouched when the link is absent.
     *
     * @param weiboFeed feed DTO to populate
     * @param element   element of the forwarded (original) content
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element dateLink = element.select(".WB_func .WB_from a[node-type=feed_list_item_date]").first();
        if (dateLink == null) {
            return;
        }
        String rawDate = dateLink.text().trim();
        weiboFeed.setPubTime(DateConvertor.convert(rawDate));
    }

    @Override
    public String getFieldName() {
        return "pubTime";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedOriginMidlHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import org.jsoup.nodes.Element;

/**
 * Extracts the mid of the original post embedded in a forwarded feed item.
 */
public class FeedOriginMidlHandler implements WeiboFeedHandler {

    /**
     * Reads the {@code suda-uatrack} attribute of the first {@code div.WB_info a[suda-uatrack]}
     * link and stores everything after its first colon as the mid. Nothing is set when the
     * link is missing or the attribute contains no colon.
     *
     * @param weiboFeed feed DTO to populate
     * @param element   element of the forwarded (original) content
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element trackedLink = element.select("div.WB_info a[suda-uatrack]").first();
        if (trackedLink == null) {
            return;
        }
        String track = trackedLink.attr("suda-uatrack");
        int colon = track.indexOf(":");
        if (colon < 0) {
            return;
        }
        weiboFeed.setMid(track.substring(colon + 1));
    }

    @Override
    public String getFieldName() {
        return "mid";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/dto/WeiboFeed.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.dto;

import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data;
import lombok.EqualsAndHashCode;

import java.time.LocalDateTime;

/**
 * Base DTO for a weibo post; two feeds are equal when their {@code mid} and {@code url} match.
 */
@Data
@EqualsAndHashCode(callSuper = false, of = {"mid", "url"})
public abstract class WeiboFeed extends AbstractDTO {

    protected String nickname;
    protected String mid; // unique identifier of the feed
    protected String url;
    protected String content; // text content
    @JsonIgnore
    protected LocalDateTime pubTime;

    protected int forwardCount = 0;

    protected boolean hasMore; // the text is truncated behind an "expand" link

    /**
     * @return the publication time as {@code <date> <time>}, or {@code null} when no
     *         publication time has been set (the handlers skip it when the page lacks one)
     */
    public String getDisplayPubTime() {
        if (this.pubTime == null) {
            return null;
        }
        return this.pubTime.toLocalDate().toString() + " " + this.pubTime.toLocalTime().toString();
    }

}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedUrlHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import com.google.common.base.Strings;
import org.jsoup.nodes.Element;

/**
 * Extracts the permalink of a feed item from its date link.
 */
public class FeedUrlHandler implements WeiboFeedHandler {

    /**
     * Takes the {@code href} of the first {@code a[node-type=feed_list_item_date]} link, drops
     * its query string if it has one, makes it absolute and stores it as {@code url}. Does
     * nothing when the link is missing.
     *
     * @param weiboFeed feed DTO to populate
     * @param element   feed item element
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element timeElement = element.select("a[node-type=feed_list_item_date]").first();
        if (timeElement == null) {
            return;
        }
        String attr = timeElement.attr("href");
        int queryStart = attr.indexOf("?");
        if (queryStart >= 0) { // an href without a query string is kept whole
            attr = attr.substring(0, queryStart);
        }
        weiboFeed.setUrl(toAbsoluteUrl(attr));
    }

    /** Prefixes site-relative and protocol-relative hrefs; empty and absolute hrefs pass through. */
    private static String toAbsoluteUrl(String href) {
        if (Strings.isNullOrEmpty(href) || href.startsWith("http:") || href.startsWith("https:")) {
            return href;
        }
        return href.startsWith("//") ? "http:" + href : "http://weibo.com" + href;
    }

    @Override
    public String getFieldName() {
        return "url";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedOriginUrlHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import com.google.common.base.Strings;
import org.jsoup.nodes.Element;

/**
 * Extracts the permalink of the original post embedded in a forwarded feed item.
 */
public class FeedOriginUrlHandler implements WeiboFeedHandler {

    /**
     * Takes the {@code href} of the first {@code a[node-type=feed_list_item_date]} link, makes
     * it absolute and stores it as {@code url}; the query string is kept. Does nothing when
     * the link is missing.
     *
     * @param weiboFeed feed DTO to populate
     * @param element   element of the forwarded (original) content
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element timeElement = element.select("a[node-type=feed_list_item_date]").first();
        if (timeElement != null) {
            weiboFeed.setUrl(toAbsoluteUrl(timeElement.attr("href")));
        }
    }

    /** Prefixes site-relative and protocol-relative hrefs; empty and absolute hrefs pass through. */
    private static String toAbsoluteUrl(String href) {
        if (Strings.isNullOrEmpty(href) || href.startsWith("http:") || href.startsWith("https:")) {
            return href;
        }
        return href.startsWith("//") ? "http:" + href : "http://weibo.com" + href;
    }

    @Override
    public String getFieldName() {
        return "url";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedOidHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.NormalWeiboFeed;
import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import org.jsoup.nodes.Element;

/**
 * Extracts the id of the user who originally published a feed item.
 */
public class FeedOidHandler implements WeiboFeedHandler {

    private static final String OUID_KEY = "ouid=";

    /**
     * A weibo appearing in a feed is not necessarily published by the account being crawled,
     * for example when that account liked someone else's weibo. The oid records the original
     * author of the weibo.
     *
     * <p>The value is read from the {@code ouid} pair of the element's {@code tbinfo}
     * attribute, whose pairs are separated by {@code &}. Nothing is set when the attribute has
     * no such pair.
     *
     * @param weiboFeed feed DTO to populate; must be a {@link NormalWeiboFeed}
     * @param element   feed item element
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        String tbinfo = element.attr("tbinfo");
        // Match whole pairs so that a key merely ending in "ouid" (e.g. "rouid") is not mistaken
        // for it, and so that the pair may appear at any position.
        for (String pair : tbinfo.split("&")) {
            if (pair.startsWith(OUID_KEY)) {
                ((NormalWeiboFeed) weiboFeed).setOid(pair.substring(OUID_KEY.length()));
                return;
            }
        }
    }

    @Override
    public String getFieldName() {
        return "oid";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedOriginForwardCountHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;

/**
 * Extracts the forward count of the original post embedded in a forwarded feed item.
 */
public class FeedOriginForwardCountHandler implements WeiboFeedHandler {

    /**
     * Reads the element following the first {@code .WB_func ul li a em.ficon_forward} icon and
     * stores its text as the forward count when it consists of digits only. The count is left
     * unchanged when the icon or its sibling is missing or the text is not numeric (for
     * example the bare "转发" label).
     *
     * @param weiboFeed feed DTO to populate
     * @param element   element of the forwarded (original) content
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element forwardElement = element.select(".WB_func ul li a em.ficon_forward").first();
        if (forwardElement == null) {
            return;
        }
        Element sibling = forwardElement.nextElementSibling();
        if (sibling == null) {
            return;
        }
        String text = sibling.text().trim();
        if (StringUtils.isNumeric(text)) {
            weiboFeed.setForwardCount(Integer.parseInt(text));
        }
    }

    @Override
    public String getFieldName() {
        return "forwardCount";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedForwardCountHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;

/**
 * Extracts the forward count of a feed item.
 */
public class FeedForwardCountHandler implements WeiboFeedHandler {

    /**
     * Reads the last {@code em} inside the first
     * {@code .WB_feed_handle ul li a[action-type=fl_forward]} link and stores its text as the
     * forward count when it consists of digits only. The count is left unchanged when the
     * link or the {@code em} is missing or the text is not numeric (for example the bare
     * "转发" label).
     *
     * @param weiboFeed feed DTO to populate
     * @param element   feed item element
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element forwardElement = element.select(".WB_feed_handle ul li a[action-type=fl_forward]").first();
        if (forwardElement == null) {
            return;
        }
        Element em = forwardElement.select("em").last();
        if (em == null) {
            return;
        }
        String text = em.text().trim();
        if (StringUtils.isNumeric(text)) {
            weiboFeed.setForwardCount(Integer.parseInt(text));
        }
    }

    @Override
    public String getFieldName() {
        return "forwardCount";
    }
}
--------------------------------------------------------------------------------
/modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedOriginContentHandler.java:
--------------------------------------------------------------------------------
package com.fullstackyang.crawler.weibo.parser.feed;

import com.fullstackyang.crawler.weibo.dto.WeiboFeed;
import org.jsoup.nodes.Element;

/**
 * Extracts the text of the original post embedded in a forwarded feed item.
 */
public class FeedOriginContentHandler implements WeiboFeedHandler {

    /**
     * Stores the text of the first {@code .WB_text} element as {@code content}, after removing
     * its {@code a[action-type=feed_list_url]} links from the document and stripping runs of
     * four question marks. Also sets {@code hasMore} according to whether an unfold link
     * exists inside the forwarded content.
     *
     * @param weiboFeed feed DTO to populate
     * @param element   element of the forwarded (original) content
     */
    @Override
    public void parse(WeiboFeed weiboFeed, Element element) {
        Element contentElement = element.select(".WB_text").first();
        if (contentElement != null) {
            contentElement.select("a[action-type=feed_list_url]").remove();
            String content = contentElement.text().replaceAll("\\?\\?\\?\\?", "");
            weiboFeed.setContent(content);
        }
        boolean more = element.select("div[node-type=feed_list_forwardContent] a[action-type=fl_unfold]")
                .isEmpty();
        weiboFeed.setHasMore(!more);
    }

    @Override
    public String getFieldName() {
        return "content";
    }
}
--------------------------------------------------------------------------------
/commons/pom.xml:
--------------------------------------------------------------------------------
1 | 2 | 5 | 6 | 7 |
com.fullstackyang.crawler 8 | weibo-crawler 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 4.0.0 13 | commons 14 | jar 15 | the commons code libraries subproject 16 | 17 | 18 | 19 | org.apache.commons 20 | commons-text 21 | 1.1 22 | 23 | 24 | commons-beanutils 25 | commons-beanutils 26 | 1.9.3 27 | 28 | 29 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedContentHandler.java: -------------------------------------------------------------------------------- 1 | package com.fullstackyang.crawler.weibo.parser.feed; 2 | 3 | import com.fullstackyang.crawler.weibo.dto.NormalWeiboFeed; 4 | import com.fullstackyang.crawler.weibo.dto.WeiboFeed; 5 | import org.jsoup.nodes.Element; 6 | 7 | public class FeedContentHandler implements WeiboFeedHandler { 8 | 9 | @Override 10 | public void parse(WeiboFeed weiboFeed, Element element) { 11 | Element contentElement = element.select("div[node-type=feed_list_content]").first(); 12 | contentElement.select("a[action-type=feed_list_url]").remove(); 13 | String content = contentElement.text().replaceAll("\\?\\?\\?\\?", ""); 14 | if (content.startsWith("置顶")) { 15 | content=content.replace("置顶","").trim(); 16 | ((NormalWeiboFeed)weiboFeed).setSticky(true); 17 | } 18 | weiboFeed.setContent(content); 19 | boolean more = element.select("div[node-type=feed_list_content] a[action-type=fl_unfold]").isEmpty(); 20 | weiboFeed.setHasMore(!more);//正文不完整,需要点击展开 21 | } 22 | 23 | @Override 24 | public String getFieldName() { 25 | return "content"; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/dto/converter/DetailFeedConverter.java: -------------------------------------------------------------------------------- 1 | package com.fullstackyang.crawler.weibo.dto.converter; 2 | 3 | import com.fullstackyang.crawler.weibo.dto.NormalWeiboFeed; 4 | import 
com.fullstackyang.crawler.weibo.dto.WeiboDetail; 5 | import com.fullstackyang.crawler.weibo.dto.WeiboFeed; 6 | import com.google.common.base.Converter; 7 | import org.apache.commons.beanutils.BeanUtilsBean; 8 | import org.apache.commons.beanutils.converters.DateConverter; 9 | import org.springframework.beans.BeanUtils; 10 | 11 | import java.time.LocalDateTime; 12 | 13 | public class DetailFeedConverter extends Converter { 14 | @Override 15 | public WeiboDetail doForward(WeiboFeed weiboFeed) { 16 | WeiboDetail weiboDetail = new WeiboDetail(); 17 | BeanUtilsBean beanUtilsBean=BeanUtilsBean.getInstance(); 18 | beanUtilsBean.getConvertUtils().register(new DateConverter(null),LocalDateTime.class); 19 | BeanUtils.copyProperties(weiboFeed, weiboDetail); 20 | 21 | return weiboDetail; 22 | } 23 | 24 | @Override 25 | public WeiboFeed doBackward(WeiboDetail weiboDetail) { 26 | WeiboFeed weiboFeed = new NormalWeiboFeed(); 27 | BeanUtils.copyProperties( weiboDetail,weiboFeed); 28 | return weiboFeed; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/dto/AbstractDTO.java: -------------------------------------------------------------------------------- 1 | package com.fullstackyang.crawler.weibo.dto; 2 | 3 | import com.fasterxml.jackson.annotation.JsonIgnore; 4 | import com.fasterxml.jackson.core.JsonParser; 5 | import com.fasterxml.jackson.core.JsonProcessingException; 6 | import com.fasterxml.jackson.databind.ObjectMapper; 7 | import com.google.common.collect.Lists; 8 | import org.json.JSONObject; 9 | 10 | import java.lang.reflect.Field; 11 | import java.util.Collections; 12 | import java.util.List; 13 | 14 | public class AbstractDTO { 15 | 16 | public JSONObject toJSON(){ 17 | ObjectMapper objectMapper=new ObjectMapper(); 18 | objectMapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, true) ; 19 | 20 | try { 21 | String json = 
objectMapper.writeValueAsString(this); 22 | return new JSONObject(json); 23 | } catch (JsonProcessingException e) { 24 | e.printStackTrace(); 25 | } 26 | return null; 27 | } 28 | 29 | @JsonIgnore 30 | public List getAllFields() { 31 | List result = Lists.newArrayList(); 32 | for (Class clazz = this.getClass(); clazz != null && clazz != Object.class; clazz = clazz.getSuperclass()) { 33 | Collections.addAll(result, clazz.getDeclaredFields()); 34 | } 35 | return result; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/parser/detail/WeiboDetailParser.java: -------------------------------------------------------------------------------- 1 | package com.fullstackyang.crawler.weibo.parser.detail; 2 | 3 | import com.fullstackyang.crawler.weibo.dto.WeiboDetail; 4 | import com.fullstackyang.crawler.weibo.parser.WeiboBaseParser; 5 | import com.google.common.base.Strings; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | 10 | import java.util.Map; 11 | import java.util.Objects; 12 | 13 | public class WeiboDetailParser extends WeiboBaseParser { 14 | 15 | private Map htmlMap; 16 | 17 | public WeiboDetailParser(final String url,final String html) { 18 | super(url,html, new DetailContentHandler()); 19 | init(); 20 | } 21 | 22 | private void init() { 23 | if (Strings.isNullOrEmpty(html)) 24 | return; 25 | this.htmlMap = getPLHTML(html, Pl_OFFICIAL_WEIBODETAIL); 26 | } 27 | 28 | public WeiboDetail parse() { 29 | String html = htmlMap.get(Pl_OFFICIAL_WEIBODETAIL); 30 | Document document = Jsoup.parse(html); 31 | return toWeiboDetail(new WeiboDetail(), document); 32 | } 33 | 34 | public WeiboDetail toWeiboDetail(WeiboDetail weiboDetail, Element element) { 35 | weiboDetail.getAllFields().stream().map(f -> 36 | getHanlder(f.getName())).filter(Objects::nonNull).forEach(p -> ((WeiboDetailHandler) p).parse(weiboDetail, element)); 
37 | return weiboDetail; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/parser/user/WeiboForwardParser.java: -------------------------------------------------------------------------------- 1 | package com.fullstackyang.crawler.weibo.parser.user; 2 | 3 | import com.fullstackyang.crawler.weibo.dto.WeiboUser; 4 | import com.fullstackyang.crawler.weibo.parser.WeiboBaseParser; 5 | import com.fullstackyang.crawler.weibo.parser.feed.WeiboFeedHandler; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.Objects; 14 | 15 | import static java.util.stream.Collectors.toList; 16 | 17 | public class WeiboForwardParser extends WeiboBaseParser { 18 | 19 | private Map htmlMap; 20 | 21 | public WeiboForwardParser(final String url, final String html) { 22 | super(url, html, new ForwardUserHandler()); 23 | } 24 | 25 | public WeiboForwardParser(final String url, final String html, WeiboFeedHandler... 
handlers) { 26 | super(url, html, handlers); 27 | } 28 | 29 | 30 | public List parse() { 31 | Document document = Jsoup.parse(html); 32 | Elements feedItems = document.select("div[action-type=feed_list_item]"); 33 | return feedItems.stream().map(e -> toWeiboUser(new WeiboUser(), e)).collect(toList()); 34 | } 35 | 36 | public WeiboUser toWeiboUser(WeiboUser weiboUser, Element element) { 37 | weiboUser.getAllFields().stream().map(f -> 38 | getHanlder(f.getName())).filter(Objects::nonNull).forEach(p -> ((WeiboUserHandler) p).parse(weiboUser, element)); 39 | return weiboUser; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/FeedHasOriginHandler.java: -------------------------------------------------------------------------------- 1 | package com.fullstackyang.crawler.weibo.parser.feed; 2 | 3 | import com.fullstackyang.crawler.weibo.dto.NormalWeiboFeed; 4 | import com.fullstackyang.crawler.weibo.dto.OriginWeiboFeed; 5 | import com.fullstackyang.crawler.weibo.dto.WeiboFeed; 6 | import org.jsoup.nodes.Element; 7 | 8 | public class FeedHasOriginHandler implements WeiboFeedHandler { 9 | 10 | private WeiboFeedParser originParser = new WeiboFeedParser(null,null, 11 | new FeedOriginContentHandler(), 12 | new FeedOriginMidlHandler(), 13 | new FeedOriginPubTimeHandler(), 14 | new FeedOriginUrlHandler(), 15 | new FeedOriginForwardCountHandler(), 16 | new FeedNicknameHandler()); 17 | 18 | @Override 19 | public void parse(WeiboFeed weiboFeed, Element element) { 20 | NormalWeiboFeed normalWeiboFeed = (NormalWeiboFeed) weiboFeed; 21 | normalWeiboFeed.setOrigin(!(element.hasAttr("isforward") && element.attr("isforward").equals("1"))); 22 | if (!normalWeiboFeed.isOrigin()) { 23 | Element originElement = element.select("div[node-type=feed_list_forwardContent]").first(); 24 | if (originElement != null) { 25 | Element originDateElement = 
originElement.select("a[node-type=feed_list_item_date]").first(); 26 | if (originDateElement == null) { 27 | normalWeiboFeed.setOriginRemoved(true); 28 | } else { 29 | OriginWeiboFeed originWeibo = (OriginWeiboFeed) originParser.toWeiboFeed(new OriginWeiboFeed(), originElement); 30 | normalWeiboFeed.setOriginWeibo(originWeibo); 31 | } 32 | } 33 | 34 | } 35 | } 36 | 37 | @Override 38 | public String getFieldName() { 39 | return "isOrigin"; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/parser/feed/WeiboFeedParser.java: -------------------------------------------------------------------------------- 1 | package com.fullstackyang.crawler.weibo.parser.feed; 2 | 3 | import com.fullstackyang.crawler.weibo.dto.NormalWeiboFeed; 4 | import com.fullstackyang.crawler.weibo.dto.WeiboFeed; 5 | import com.fullstackyang.crawler.weibo.parser.WeiboBaseParser; 6 | import com.google.common.base.Strings; 7 | import com.google.common.collect.Lists; 8 | import lombok.extern.slf4j.Slf4j; 9 | import org.jsoup.Jsoup; 10 | import org.jsoup.nodes.Document; 11 | import org.jsoup.nodes.Element; 12 | import org.jsoup.select.Elements; 13 | 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.Objects; 17 | 18 | import static java.util.stream.Collectors.toList; 19 | 20 | @Slf4j 21 | public class WeiboFeedParser extends WeiboBaseParser { 22 | 23 | private Map htmlMap; 24 | 25 | public WeiboFeedParser(final String url, final String html) { 26 | super(url, html, new FeedContentHandler(), 27 | new FeedPubTimeHandler(), 28 | new FeedOidHandler(), 29 | new FeedMidHandler(), 30 | new FeedUrlHandler(), 31 | new FeedHasOriginHandler(), 32 | new FeedForwardCountHandler(), 33 | new FeedNicknameHandler()); 34 | init(); 35 | } 36 | 37 | public WeiboFeedParser(final String url, final String html, WeiboFeedHandler... 
handlers) { 38 | super(url, html, handlers); 39 | init(); 40 | } 41 | 42 | private void init() { 43 | if (Strings.isNullOrEmpty(html)) 44 | return; 45 | this.htmlMap = getPLHTML(html, Pl_OFFICIAL_MyProfileFeed); 46 | } 47 | 48 | public List parse() { 49 | String html = htmlMap.get(Pl_OFFICIAL_MyProfileFeed); 50 | if (html == null) { 51 | log.error(this.url+" cannot get html!"); 52 | return Lists.newArrayList(); 53 | } 54 | Document document = Jsoup.parse(html); 55 | Elements feedItems = document.select("div[action-type=feed_list_item]"); 56 | return feedItems.stream().map(e -> toWeiboFeed(new NormalWeiboFeed(), e)).collect(toList()); 57 | } 58 | 59 | public WeiboFeed toWeiboFeed(WeiboFeed weiboFeed, Element element) { 60 | weiboFeed.getAllFields().stream().map(f -> 61 | getHanlder(f.getName())).filter(Objects::nonNull).forEach(p -> ((WeiboFeedHandler) p).parse(weiboFeed, element)); 62 | return weiboFeed; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /modules/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.fullstackyang.crawler 7 | weibo-crawler 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 4.0.0 12 | weibo-crawler-modules 13 | jar 14 | weibo crawler reusable components 15 | 16 | 17 | 18 | com.fullstackyang 19 | fullstackyang-httpclient 20 | 1.0-SNAPSHOT 21 | 22 | 23 | org.slf4j 24 | slf4j-simple 25 | 26 | 27 | 28 | 29 | com.fullstackyang.crawler 30 | commons 31 | 1.0-SNAPSHOT 32 | 33 | 34 | com.fasterxml.jackson.core 35 | jackson-databind 36 | ${jackson.version} 37 | 38 | 39 | com.fasterxml.jackson.core 40 | jackson-annotations 41 | 42 | 43 | 44 | 45 | com.fasterxml.jackson.core 46 | jackson-annotations 47 | ${jackson.version} 48 | 49 | 50 | org.apache.commons 51 | commons-text 52 | 1.1 53 | 54 | 55 | commons-beanutils 56 | commons-beanutils 57 | 1.9.3 58 | 59 | 60 | org.springframework 61 | spring-context-support 62 | provided 63 | 64 | 65 | org.slf4j 66 | 
/**
 * Converts date/time captions into {@link LocalDateTime} values.
 *
 * <p>Supported forms (after normalisation):
 * <ul>
 *   <li>{@code 刚刚} ("just now")</li>
 *   <li>{@code N秒前}, {@code N分钟前}, {@code N小时前}, {@code N天前}, {@code N月前}
 *       (N seconds / minutes / hours / days / months ago)</li>
 *   <li>{@code yyyy-M-d}, optionally followed by {@code H:m} or {@code H:m:s}</li>
 *   <li>{@code 今天 H:m} / {@code 昨天 H:m} (today / yesterday), seconds optional</li>
 * </ul>
 * {@code 年} and {@code /} are accepted as date separators, and {@code M月d日} captions are
 * rewritten to {@code M-d}; when such a caption carries no year, the year of the reference
 * time is used.
 */
public class DateConvertor {

    private static final String PATTERN_1 = "^刚刚$";
    private static final String PATTERN_2 = "^\\d{1,2}秒前$";
    private static final String PATTERN_3 = "^\\d{1,2}分钟前$";
    private static final String PATTERN_4 = "^\\d{1,2}小时前$";
    private static final String PATTERN_5 = "^\\d+天前$";
    private static final String PATTERN_6 = "^\\d+月前$";
    private static final String PATTERN_7 = "^\\d{4}(\\-)\\d{1,2}(\\-)\\d{1,2}$";
    private static final String PATTERN_8 = "^\\d{4}(\\-)\\d{1,2}(\\-)\\d{1,2}(\\s+)\\d{1,2}:\\d{1,2}(:\\d{1,2})?$";
    // "[今昨]" rather than "[今|昨]": inside a character class '|' is a literal, not alternation.
    private static final String PATTERN_9 = "^[今昨]天(\\s*)\\d{1,2}:\\d{1,2}(:\\d{1,2})?$";

    /**
     * The rules, compiled once and kept in a fixed order; the rule number passed to
     * {@link #calculate} is the list index plus one.
     */
    private static final List<Pattern> RULES = Arrays.asList(
            Pattern.compile(PATTERN_1),
            Pattern.compile(PATTERN_2),
            Pattern.compile(PATTERN_3),
            Pattern.compile(PATTERN_4),
            Pattern.compile(PATTERN_5),
            Pattern.compile(PATTERN_6),
            Pattern.compile(PATTERN_7),
            Pattern.compile(PATTERN_8),
            Pattern.compile(PATTERN_9));

    /**
     * Converts a caption using the current time as the reference.
     *
     * @param date caption to convert
     * @return the converted time, or the current time when the caption is not recognised
     */
    public static LocalDateTime convert(String date) {
        return convert(date, LocalDateTime.now());
    }

    /**
     * Converts a caption relative to a reference time.
     *
     * @param dateTimeStr caption to convert; may be {@code null}
     * @param dateTime    reference time for relative captions; also the fallback result
     * @return the converted time, or {@code dateTime} when the caption is null, empty or not
     *         recognised
     * @throws java.time.format.DateTimeParseException if the caption has a recognised shape
     *         but holds impossible values (for example month 13)
     */
    public static LocalDateTime convert(final String dateTimeStr, LocalDateTime dateTime) {
        if (dateTimeStr == null || dateTimeStr.isEmpty()) {
            return dateTime;
        }
        String temp = dateTimeStr.trim().replaceAll("年|/", "-");
        if (dateTimeStr.contains("月") && dateTimeStr.contains("日")) {
            // "日" becomes a blank so that "8月1日12:30" still separates date from time.
            temp = temp.replace("月", "-").replace("日", " ");
            if (!dateTimeStr.contains("年")) {
                // A caption without a year belongs to the year of the reference time.
                LocalDateTime base = dateTime != null ? dateTime : LocalDateTime.now();
                temp = base.getYear() + "-" + temp;
            }
        }
        // Collapse every whitespace run (tabs included) to one blank: rule 8 splits on it.
        temp = temp.trim().replaceAll("\\s+", " ");

        for (int i = 0; i < RULES.size(); i++) {
            if (RULES.get(i).matcher(temp).find()) {
                return calculate(i + 1, temp, dateTime);
            }
        }
        return dateTime;
    }

    /**
     * Applies the rule with the given number to an already normalised caption.
     *
     * @param key         rule number, 1 to 9, matching {@code PATTERN_1} to {@code PATTERN_9}
     * @param dateTimeStr normalised caption that matched the rule
     * @param dateTime    reference time
     * @return the resulting time
     */
    private static LocalDateTime calculate(int key, String dateTimeStr, LocalDateTime dateTime) {
        switch (key) {
            case 1: // just now
                return dateTime;
            case 2: // N seconds ago
                return dateTime.minusSeconds(amount(dateTimeStr, "秒前"));
            case 3: // N minutes ago
                return dateTime.minusMinutes(amount(dateTimeStr, "分钟前"));
            case 4: // N hours ago
                return dateTime.minusHours(amount(dateTimeStr, "小时前"));
            case 5: // N days ago
                return dateTime.minusDays(amount(dateTimeStr, "天前"));
            case 6: // N months ago
                return dateTime.minusMonths(amount(dateTimeStr, "月前"));
            case 7: // yyyy-M-d
                // LocalDateTime.parse needs a time part, so a bare date means midnight.
                return LocalDateTime.parse(padding(dateTimeStr, "-") + "T00:00:00");
            case 8: { // yyyy-M-d H:m[:s]
                String[] parts = dateTimeStr.split(" ");
                return LocalDateTime.parse(padding(parts[0], "-") + "T" + padding(parts[1], ":"));
            }
            case 9: { // today / yesterday H:m[:s]
                String time = padding(dateTimeStr.replaceAll("[今昨]天(\\s*)", ""), ":");
                LocalDateTime day = dateTimeStr.contains("昨") ? dateTime.minusDays(1) : dateTime;
                return day.toLocalDate().atTime(LocalTime.parse(time));
            }
            default:
                return dateTime;
        }
    }

    /** Returns the number that precedes {@code unit} in a relative caption such as "5分钟前". */
    private static long amount(String text, String unit) {
        return Long.parseLong(text.replace(unit, ""));
    }

    /** Left-pads every {@code delimiter}-separated component to two digits ("8-1" to "08-01"). */
    private static String padding(String dateTimeStr, String delimiter) {
        List<String> list = Arrays.stream(dateTimeStr.split(delimiter))
                .map(DateConvertor::padTwo)
                .collect(toList());
        return String.join(delimiter, list);
    }

    /** Left-pads {@code s} with zeros up to a length of two. */
    private static String padTwo(String s) {
        String padded = s;
        while (padded.length() < 2) {
            padded = "0" + padded;
        }
        return padded;
    }

}
org.jsoup.select.Elements; 12 | 13 | import java.util.List; 14 | import java.util.Map; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | 18 | @Slf4j 19 | public class WeiboBaseParser { 20 | 21 | public static final String PL_CORE_USER_INFO = "Pl_Core_UserInfo__"; 22 | 23 | public static final String PL_CORE_T8_CUSTOM_TRI_COLUMN = "Pl_Core_T8CustomTriColumn__"; 24 | 25 | public static final String PL_OFFICIAL_HEADERV6 = "Pl_Official_Headerv6__"; 26 | 27 | public static final String PL_OFFICIAL_RIGHT_GROW_NEW = "Pl_Official_RightGrowNew__"; 28 | 29 | public static final String PL_OFFICIAL_PERSONAL_INFO = "Pl_Official_PersonalInfo__"; 30 | 31 | public static final String Pl_OFFICIAL_MyProfileFeed = "Pl_Official_MyProfileFeed__"; 32 | 33 | public static final String Pl_OFFICIAL_WEIBODETAIL = "Pl_Official_WeiboDetail__"; 34 | 35 | private final Map handlerMap; 36 | 37 | protected String url; 38 | protected String html; 39 | 40 | @Getter 41 | protected JSONObject configObject; 42 | 43 | public WeiboBaseParser(final String url, final String html, final WeiboBaseHandler... 
handlers) { 44 | this.url = url; 45 | setHtml(html); 46 | this.handlerMap = Maps.newConcurrentMap(); 47 | for (final WeiboBaseHandler handler : handlers) { 48 | handlerMap.put(handler.getFieldName(), handler); 49 | } 50 | } 51 | 52 | public void setHtml(String html) { 53 | if (html != null) { 54 | this.html = escape(html); 55 | this.configObject = getHTMLConfig(this.html); 56 | } 57 | } 58 | 59 | private String escape(String html) { 60 | return html.replaceAll("\\\\\"", "\"") 61 | .replaceAll("\\\\/", "/") 62 | .replaceAll("\\\\r|\\\\n|\\\\t|\\u200B|\\u200b", ""); 63 | 64 | } 65 | 66 | public void showConfig() { 67 | log.info(configObject.toString(2)); 68 | } 69 | 70 | public boolean checkConfig() { 71 | return html != null && !configObject.isNull("page_id") && configObject.has("page_id"); 72 | } 73 | 74 | /** 75 | * 每一个微博页面基本上都有CONFIG,其中包含了page_id,oid等基本信息 76 | * 77 | * @param html 78 | * @return 79 | */ 80 | public JSONObject getHTMLConfig(String html) { 81 | Document document = Jsoup.parse(html); 82 | Elements elements = document.select("script[type=text/javascript]"); 83 | JSONObject jsonObject = new JSONObject(); 84 | if (elements.isEmpty()) 85 | return jsonObject; 86 | 87 | String text = ""; 88 | for (Element element : elements) { 89 | String temp = element.html(); 90 | if (temp.contains("$CONFIG")) { 91 | text = temp; 92 | break; 93 | } 94 | } 95 | String[] parts = text.split(";"); 96 | for (String str : parts) { 97 | if (!str.contains("=")) 98 | continue; 99 | String[] strings = str.trim().split("="); 100 | String key = strings[0].replaceAll("\\$CONFIG|\\[|\\]|'", ""); 101 | String value = strings[1].replaceAll("'", "").trim(); 102 | jsonObject.put(key, value + ""); 103 | } 104 | return jsonObject; 105 | } 106 | 107 | /** 108 | * 抽取微博页面底部pl开头的各种子页面 109 | * 110 | * @param html 111 | * @param pl 112 | * @return 113 | */ 114 | public static Map getPLHTML(String html, String... 
pl) { 115 | Map map = Maps.newHashMap(); 116 | List pls = Lists.newArrayList(pl); 117 | Document document = Jsoup.parse(html); 118 | Elements elements = document.select("script"); 119 | if (elements.isEmpty()) 120 | return map; 121 | for (Element element : elements) { 122 | String text = element.html(); 123 | if (!text.startsWith("FM.view")) 124 | continue; 125 | for (String domid : pls) { 126 | String str = text.substring(text.indexOf("(") + 1, text.lastIndexOf(")")); 127 | 128 | if (str.contains("\"domid\":\"" + domid)) { 129 | Pattern pattern = Pattern.compile("\"domid\":\"" + domid + ".+}?"); 130 | Matcher matcher = pattern.matcher(str); 131 | if (matcher.find()) { 132 | String value = matcher.group(); 133 | if (value.contains("\"html\":\"")) { 134 | value = value.substring(value.indexOf("html\":\"") + 7, value.lastIndexOf("}") - 1); 135 | map.put(domid, value); 136 | pls.remove(domid); 137 | } 138 | } 139 | break; 140 | } 141 | } 142 | } 143 | return map; 144 | } 145 | 146 | protected WeiboBaseHandler getHanlder(final String name) { 147 | return this.handlerMap.get(name); 148 | } 149 | 150 | } 151 | -------------------------------------------------------------------------------- /modules/src/main/java/com/fullstackyang/crawler/weibo/client/WeiboClient.java: -------------------------------------------------------------------------------- 1 | package com.fullstackyang.crawler.weibo.client; 2 | 3 | import com.fullstackyang.httpclient.HttpClientInstance; 4 | import com.fullstackyang.httpclient.HttpRequestUtils; 5 | import com.google.common.base.Strings; 6 | import com.google.common.collect.Maps; 7 | import com.google.common.net.HttpHeaders; 8 | import lombok.NoArgsConstructor; 9 | import lombok.extern.slf4j.Slf4j; 10 | import org.apache.commons.lang3.StringUtils; 11 | import org.apache.http.client.config.CookieSpecs; 12 | import org.apache.http.client.config.RequestConfig; 13 | import org.apache.http.client.methods.HttpGet; 14 | import 
org.apache.http.client.methods.HttpPost; 15 | import org.json.JSONObject; 16 | 17 | import java.io.UnsupportedEncodingException; 18 | import java.math.BigDecimal; 19 | import java.net.URLEncoder; 20 | import java.util.Map; 21 | import java.util.concurrent.locks.Lock; 22 | import java.util.concurrent.locks.ReentrantLock; 23 | 24 | /** 25 | * 微博免登陆请求客户端 26 | * 27 | * @author fullstackyang 28 | */ 29 | @Slf4j 30 | public class WeiboClient { 31 | 32 | private static CookieFetcher cookieFetcher = new CookieFetcher(); 33 | 34 | private volatile String cookie; 35 | 36 | public WeiboClient() { 37 | this.cookie = cookieFetcher.getCookie(); 38 | } 39 | 40 | private static Lock lock = new ReentrantLock(); 41 | 42 | /** 43 | * when getting html failed, call this to get a new cookie and try again 44 | */ 45 | public void cookieReset() { 46 | if (lock.tryLock()) { 47 | try { 48 | HttpClientInstance.instance().changeProxy(); 49 | this.cookie = cookieFetcher.getCookie(); 50 | log.info("cookie :" + cookie); 51 | } finally { 52 | lock.unlock(); 53 | } 54 | } 55 | } 56 | 57 | /** 58 | * for example: 59 | *

WeiboClient weiboClient=new WeiboClient();

60 | *

weiboClient. get("http://weibo.com");

61 | * 62 | * @param url weibo.com/* 63 | * @return 64 | */ 65 | public String get(String url) { 66 | if (Strings.isNullOrEmpty(url)) 67 | return ""; 68 | 69 | while (true) { 70 | HttpGet httpGet = new HttpGet(url); 71 | httpGet.addHeader(HttpHeaders.COOKIE, cookie); 72 | httpGet.addHeader(HttpHeaders.HOST, "weibo.com"); 73 | httpGet.addHeader("Upgrade-Insecure-Requests", "1"); 74 | 75 | httpGet.setConfig(RequestConfig.custom().setSocketTimeout(3000) 76 | .setConnectTimeout(3000).setConnectionRequestTimeout(3000).build()); 77 | String html = HttpClientInstance.instance().tryExecute(httpGet, null, null); 78 | if (html == null) 79 | cookieReset(); 80 | else return html; 81 | } 82 | } 83 | 84 | 85 | /** 86 | * 获取访问微博时必需的Cookie 87 | */ 88 | @NoArgsConstructor 89 | static class CookieFetcher { 90 | 91 | static final String PASSPORT_URL = "https://passport.weibo.com/visitor/visitor?entry=miniblog&a=enter&url=http://weibo.com/?category=2" 92 | + "&domain=.weibo.com&ua=php-sso_sdk_client-0.6.23"; 93 | 94 | static final String GEN_VISITOR_URL = "https://passport.weibo.com/visitor/genvisitor"; 95 | 96 | static final String VISITOR_URL = "https://passport.weibo.com/visitor/visitor?a=incarnate"; 97 | 98 | private String getCookie() { 99 | Map map; 100 | while (true) { 101 | map = getCookieParam(); 102 | if (map.containsKey("SUB") && map.containsKey("SUBP") && 103 | StringUtils.isNoneEmpty(map.get("SUB"), map.get("SUBP"))) 104 | break; 105 | HttpClientInstance.instance().changeProxy(); 106 | } 107 | return " YF-Page-G0=" + "; _s_tentry=-; SUB=" + map.get("SUB") + "; SUBP=" + map.get("SUBP"); 108 | } 109 | 110 | private Map getCookieParam() { 111 | String time = System.currentTimeMillis() + ""; 112 | time = time.substring(0, 9) + "." 
+ time.substring(9, 13); 113 | String passporturl = PASSPORT_URL + "&_rand=" + time; 114 | 115 | String tid = ""; 116 | String c = ""; 117 | String w = ""; 118 | { 119 | String str = postGenvisitor(passporturl); 120 | if (str.contains("\"retcode\":20000000")) { 121 | JSONObject jsonObject = new JSONObject(str).getJSONObject("data"); 122 | tid = jsonObject.optString("tid"); 123 | try { 124 | tid = URLEncoder.encode(tid, "utf-8"); 125 | } catch (UnsupportedEncodingException e) { 126 | } 127 | c = jsonObject.has("confidence") ? "000" + jsonObject.getInt("confidence") : "100"; 128 | w = jsonObject.optBoolean("new_tid") ? "3" : "2"; 129 | } 130 | } 131 | String s = ""; 132 | String sp = ""; 133 | { 134 | if (StringUtils.isNoneEmpty(tid, w, c)) { 135 | String str = getVisitor(tid, w, c, passporturl); 136 | str = str.substring(str.indexOf("(") + 1, str.indexOf(")")); 137 | if (str.contains("\"retcode\":20000000")) { 138 | JSONObject jsonObject = new JSONObject(str).getJSONObject("data"); 139 | s = jsonObject.getString("sub"); 140 | sp = jsonObject.getString("subp"); 141 | } 142 | 143 | } 144 | } 145 | Map map = Maps.newHashMap(); 146 | map.put("SUB", s); 147 | map.put("SUBP", sp); 148 | return map; 149 | } 150 | 151 | private String postGenvisitor(String passporturl) { 152 | 153 | Map headers = Maps.newHashMap(); 154 | headers.put(HttpHeaders.ACCEPT, "*/*"); 155 | headers.put(HttpHeaders.ORIGIN, "https://passport.weibo.com"); 156 | headers.put(HttpHeaders.REFERER, passporturl); 157 | 158 | Map params = Maps.newHashMap(); 159 | params.put("cb", "gen_callback"); 160 | params.put("fp", fp()); 161 | 162 | HttpPost httpPost = HttpRequestUtils.createHttpPost(GEN_VISITOR_URL, headers, params); 163 | 164 | String str = HttpClientInstance.instance().execute(httpPost, null); 165 | return str.substring(str.indexOf("(") + 1, str.lastIndexOf("")); 166 | } 167 | 168 | private String getVisitor(String tid, String w, String c, String passporturl) { 169 | String url = VISITOR_URL + "&t=" 
+ tid + "&w=" + "&c=" + c.substring(c.length() - 3) 170 | + "&gc=&cb=cross_domain&from=weibo&_rand=0." + rand(); 171 | 172 | Map headers = Maps.newHashMap(); 173 | headers.put(HttpHeaders.ACCEPT, "*/*"); 174 | headers.put(HttpHeaders.HOST, "passport.weibo.com"); 175 | headers.put(HttpHeaders.COOKIE, "tid=" + tid + "__0" + c); 176 | headers.put(HttpHeaders.REFERER, passporturl); 177 | 178 | HttpGet httpGet = HttpRequestUtils.createHttpGet(url, headers); 179 | httpGet.setConfig(RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build()); 180 | return HttpClientInstance.instance().execute(httpGet, null); 181 | } 182 | 183 | private static String rand() { 184 | return new BigDecimal(Math.floor(Math.random() * 10000000000000000L)).toString(); 185 | } 186 | 187 | private static String fp() { 188 | JSONObject jsonObject = new JSONObject(); 189 | jsonObject.put("os", "1"); 190 | jsonObject.put("browser", "Chrome59,0,3071,115"); 191 | jsonObject.put("fonts", "undefined"); 192 | jsonObject.put("screenInfo", "1680*1050*24"); 193 | jsonObject.put("plugins", 194 | "Enables Widevine licenses for playback of HTML audio/video content. (version: 1.4.8.984)::widevinecdmadapter.dll::Widevine Content Decryption Module|Shockwave Flash 26.0 r0::pepflashplayer.dll::Shockwave Flash|::mhjfbmdgcfjbbpaeojofohoefgiehjai::Chrome PDF Viewer|::internal-nacl-plugin::Native Client|Portable Document Format::internal-pdf-viewer::Chrome PDF Viewer"); 195 | return jsonObject.toString(); 196 | } 197 | } 198 | } 199 | --------------------------------------------------------------------------------